scanner: add maildir-scan mode; improve portability

Use d_ino (struct dirent) only when available.

Implement a mode for scanning just maildirs (i.e., the dirs with cur / new
in them). Use d_type (if available) to optimize that.
This commit is contained in:
Dirk-Jan C. Binnema 2023-08-12 15:59:32 +03:00
parent 8caf504381
commit f5beea2eb2
4 changed files with 197 additions and 64 deletions

View File

@ -37,8 +37,19 @@ lib_mu_index_dep = declare_dependency(
link_with: lib_mu_index link_with: lib_mu_index
) )
# #
# tests # test tool
#
# helper tool (not installed): mu-scanner.cc compiled with
# -DBUILD_LIST_MAILDIRS, which enables its stand-alone main()
executable('list-maildirs', 'mu-scanner.cc',
install: false,
cpp_args: ['-DBUILD_LIST_MAILDIRS'],
dependencies: [glib_dep, config_h_dep,
lib_mu_utils_dep])
#
# unit tests
# #
test('test-scanner', test('test-scanner',

View File

@ -38,9 +38,35 @@
using namespace Mu; using namespace Mu;
using Mode = Scanner::Mode;
/*
 * Copy of the parts of a struct dirent that the scanner uses.
 *
 * d_ino and d_type are not available in struct dirent everywhere, so the
 * corresponding members only exist when the build found them
 * (HAVE_DIRENT_D_INO, HAVE_DIRENT_D_TYPE); d_name is always there.
 */
struct dentry_t {
#if HAVE_DIRENT_D_INO
	ino_t		d_ino;	/* inode number */
#endif /*HAVE_DIRENT_D_INO*/
#if HAVE_DIRENT_D_TYPE
	unsigned char	d_type;	/* file type (DT_...) */
#endif /*HAVE_DIRENT_D_TYPE*/
	std::string	d_name;	/* entry name, copied from the dirent */

	/* copy the fields out of @p de, so we do not depend on its storage */
	dentry_t(const struct dirent *de)
		:
#if HAVE_DIRENT_D_INO
		d_ino{de->d_ino},
#endif /*HAVE_DIRENT_D_INO*/
#if HAVE_DIRENT_D_TYPE
		d_type{de->d_type},
#endif /*HAVE_DIRENT_D_TYPE*/
		d_name(static_cast<const char*>(de->d_name))
	{
	}
};
struct Scanner::Private { struct Scanner::Private {
Private(const std::string& root_dir, Scanner::Handler handler): Private(const std::string& root_dir, Scanner::Handler handler, Mode mode):
root_dir_{root_dir}, handler_{handler} { root_dir_{root_dir}, handler_{handler}, mode_{mode} {
if (root_dir_.length() > PATH_MAX) if (root_dir_.length() > PATH_MAX)
throw Mu::Error{Error::Code::InvalidArgument, throw Mu::Error{Error::Code::InvalidArgument,
"path is too long"}; "path is too long"};
@ -53,38 +79,35 @@ struct Scanner::Private {
Result<void> start(); Result<void> start();
void stop(); void stop();
struct dentry_t {
dentry_t(const struct dirent *dentry):
d_ino{dentry->d_ino},
d_name{static_cast<const char*>(dentry->d_name)} {}
ino_t d_ino;
std::string d_name;
};
bool process_dentry(const std::string& path, const dentry_t& dentry, bool process_dentry(const std::string& path, const dentry_t& dentry,
bool is_maildir); bool is_maildir);
bool process_dir(const std::string& path, bool is_maildir); bool process_dir(const std::string& path, bool is_maildir);
const std::string root_dir_; int lazy_stat(const char *fullpath, struct stat *stat_buf,
const Scanner::Handler handler_; const dentry_t& dentry);
std::atomic<bool> running_{};
std::mutex lock_; bool maildirs_only_mode() const { return mode_ == Mode::MaildirsOnly; }
const std::string root_dir_;
const Scanner::Handler handler_;
Mode mode_;
std::atomic<bool> running_{};
std::mutex lock_;
}; };
static bool static bool
is_dotdir(const char *d_name) ignore_dentry(const dentry_t& dentry)
{ {
const auto d_name{dentry.d_name.c_str()};
/* dotdir? */ /* dotdir? */
if (d_name[0] == '\0' || (d_name[1] == '\0' && d_name[0] == '.') || if (d_name[0] == '\0' || (d_name[1] == '\0' && d_name[0] == '.') ||
(d_name[2] == '\0' && d_name[0] == '.' && d_name[1] == '.')) (d_name[2] == '\0' && d_name[0] == '.' && d_name[1] == '.'))
return true; return true;
return false; if (g_strcmp0(d_name, "tmp") == 0)
} return true;
static bool
do_ignore(const char *d_name)
{
if (d_name[0] == '.') { if (d_name[0] == '.') {
if (d_name[1] == '#') /* emacs? */ if (d_name[1] == '#') /* emacs? */
return true; return true;
@ -97,45 +120,78 @@ do_ignore(const char *d_name)
if (g_strcmp0(d_name, "hcache.db") == 0) /* mutt cache? */ if (g_strcmp0(d_name, "hcache.db") == 0) /* mutt cache? */
return true; return true;
return false; return false; /* don't ignore */
} }
/*
 * Get the stat information for a directory entry, but skip the stat() call
 * when we can: in maildirs-only mode only the file-type matters, and for
 * regular files and directories the dentry may already tell us (d_type).
 * In that case, only stat_buf->st_mode is filled in.
 *
 * Returns 0 on success; otherwise the (non-zero) result of stat(), after
 * logging a warning.
 */
int
Scanner::Private::lazy_stat(const char *path, struct stat *stat_buf, const dentry_t& dentry)
{
#if HAVE_DIRENT_D_TYPE
	if (maildirs_only_mode()) {
		if (dentry.d_type == DT_REG) {
			stat_buf->st_mode = S_IFREG;
			return 0;
		}
		if (dentry.d_type == DT_DIR) {
			stat_buf->st_mode = S_IFDIR;
			return 0;
		}
		/* other types (e.g. LNK) are inconclusive; we need a stat. */
	}
#endif /*HAVE_DIRENT_D_TYPE*/

	const int res{::stat(path, stat_buf)};
	if (res == 0)
		return 0;

	mu_warning("failed to stat {}: {}", path, g_strerror(errno));
	return res;
}
bool bool
Scanner::Private::process_dentry(const std::string& path, const dentry_t& dentry, Scanner::Private::process_dentry(const std::string& path, const dentry_t& dentry,
bool is_maildir) bool is_maildir)
{ {
const auto d_name{dentry.d_name.c_str()}; if (ignore_dentry(dentry))
return true;
if (is_dotdir(d_name) || std::strcmp(d_name, "tmp") == 0) auto call_handler=[&](auto&& path, auto&& statbuf, auto&& htype)->bool {
return true; // ignore. return maildirs_only_mode() ? true : handler_(path, statbuf, htype);
if (do_ignore(d_name)) { };
mu_debug("skip {}/{} (ignore)", path, d_name);
return true; // ignore
}
const auto fullpath{join_paths(path, d_name)}; const auto fullpath{join_paths(path, dentry.d_name)};
struct stat statbuf {}; struct stat statbuf{};
if (::stat(fullpath.c_str(), &statbuf) != 0) { if (lazy_stat(fullpath.c_str(), &statbuf, dentry) != 0)
mu_warning("failed to stat {}: {}", fullpath, g_strerror(errno));
return false; return false;
if (maildirs_only_mode() && S_ISDIR(statbuf.st_mode) && dentry.d_name == "cur") {
handler_(path/*without cur*/, {}, Scanner::HandleType::Maildir);
return true; // found maildir; no need to recurse further.
} }
if (S_ISDIR(statbuf.st_mode)) { if (S_ISDIR(statbuf.st_mode)) {
const auto new_cur = const auto new_cur = dentry.d_name == "cur" || dentry.d_name == "new";
std::strcmp(d_name, "cur") == 0 || std::strcmp(d_name, "new") == 0;
const auto htype = const auto htype =
new_cur ? new_cur ?
Scanner::HandleType::EnterNewCur : Scanner::HandleType::EnterNewCur :
Scanner::HandleType::EnterDir; Scanner::HandleType::EnterDir;
const auto res = handler_(fullpath, &statbuf, htype);
const auto res = call_handler(fullpath, &statbuf, htype);
if (!res) if (!res)
return true; // skip return true; // skip
process_dir(fullpath, new_cur); process_dir(fullpath, new_cur);
return handler_(fullpath, &statbuf, Scanner::HandleType::LeaveDir); return call_handler(fullpath, &statbuf, Scanner::HandleType::LeaveDir);
} else if (S_ISREG(statbuf.st_mode) && is_maildir) } else if (S_ISREG(statbuf.st_mode) && is_maildir)
return handler_(fullpath, &statbuf, Scanner::HandleType::File); return call_handler(fullpath, &statbuf, Scanner::HandleType::File);
mu_debug("skip {} (neither maildir-file nor directory)", fullpath); mu_debug("skip {} (neither maildir-file nor directory)", fullpath);
@ -165,6 +221,11 @@ Scanner::Private::process_dir(const std::string& path, bool is_maildir)
while (running_) { while (running_) {
errno = 0; errno = 0;
if (const auto& dentry{::readdir(dir)}; dentry) { if (const auto& dentry{::readdir(dir)}; dentry) {
#if HAVE_DIRENT_D_TYPE
			/*
			 * optimization: filter out non-dirs early.
			 *
			 * Keep DT_UNKNOWN entries too: readdir() reports that type
			 * when the file system does not provide one, and lazy_stat()
			 * then falls back to stat() for them (just like for DT_LNK).
			 * Skipping them would hide all maildirs on such file systems.
			 */
			if (maildirs_only_mode() &&
			    dentry->d_type != DT_DIR &&
			    dentry->d_type != DT_LNK &&
			    dentry->d_type != DT_UNKNOWN)
				continue;
#endif /*HAVE_DIRENT_D_TYPE*/
dir_entries.emplace_back(dentry); dir_entries.emplace_back(dentry);
continue; continue;
} else if (errno != 0) { } else if (errno != 0) {
@ -176,10 +237,12 @@ Scanner::Private::process_dir(const std::string& path, bool is_maildir)
} }
::closedir(dir); ::closedir(dir);
#if HAVE_DIRENT_D_INO
// sort by i-node; much faster on rotational (HDDs) devices and on SSDs // sort by i-node; much faster on rotational (HDDs) devices and on SSDs
// sort is quick enough to not matter much // sort is quick enough to not matter much
std::sort(dir_entries.begin(), dir_entries.end(), std::sort(dir_entries.begin(), dir_entries.end(),
[](auto&& d1, auto&& d2){ return d1.d_ino < d2.d_ino; }); [](auto&& d1, auto&& d2){ return d1.d_ino < d2.d_ino; });
#endif /*HAVE_DIRENT_D_INO*/
// now process... // now process...
for (auto&& dentry: dir_entries) for (auto&& dentry: dir_entries)
@ -231,8 +294,8 @@ Scanner::Private::stop()
} }
} }
Scanner::Scanner(const std::string& root_dir, Scanner::Handler handler) Scanner::Scanner(const std::string& root_dir, Scanner::Handler handler, Mode flavor)
: priv_{std::make_unique<Private>(root_dir, handler)} : priv_{std::make_unique<Private>(root_dir, handler, flavor)}
{ {
} }
@ -264,12 +327,9 @@ Scanner::is_running() const
} }
#if BUILD_TESTS #if BUILD_TESTS
#include "mu-test-utils.hh" #include "mu-test-utils.hh"
static void static void
test_scan_maildir() test_scan_maildir()
{ {
@ -307,6 +367,29 @@ try {
mu_printerrln("caught exception"); mu_printerrln("caught exception");
return 1; return 1;
} }
#endif /*BUILD_TESTS*/ #endif /*BUILD_TESTS*/
#if BUILD_LIST_MAILDIRS
/*
 * Scanner handler for the list-maildirs tool: print the path we are called
 * with. The stat buffer and handle-type are not used. Always returns true.
 */
static bool
on_path(const std::string& path, struct stat* /*statbuf*/, Scanner::HandleType /*htype*/)
{
	mu_println("{}", path);

	return true;
}
int
main (int argc, char *argv[])
{
if (argc < 2) {
mu_printerrln("expected: path to maildir");
return 1;
}
Scanner scanner{argv[1], on_path, Mode::MaildirsOnly};
scanner.start();
return 0;
}
#endif /*BUILD_LIST_MAILDIRS*/

View File

@ -31,28 +31,51 @@
namespace Mu { namespace Mu {
/// @brief Maildir scanner /**
/// * @brief Maildir scanner
/// Scans maildir (trees) recursively, and calls the Handler callback for *
/// directories & files. * Scans maildir (trees) recursively, and calls the Handler callback for
/// * directories & files.
/// It filters out (i.e., does *not* call the handler for): *
/// - files starting with '.' * It filters out (i.e., does *not* call the handler for):
/// - files that do not live in a cur / new leaf maildir * - files starting with '.'
/// - directories '.' and '..' and 'tmp' * - files that do not live in a cur / new leaf maildir
/// * - directories '.' and '..' and 'tmp'
*/
class Scanner { class Scanner {
public: public:
enum struct HandleType { enum struct HandleType {
/*
* Mode: All
*/
File, File,
EnterNewCur, /* cur/ or new/ */ EnterNewCur, /* cur/ or new/ */
EnterDir, /* some other directory */ EnterDir, /* some other directory */
LeaveDir LeaveDir,
/*
* Mode: MaildirsOnly
*/
Maildir,
}; };
/// Prototype for a handler function /**
* Callback handler function
*
* path: full file-system path
* statbuf: stat result or nullptr (for Mode::MaildirsOnly)
* htype: HandleType. For Mode::MaildirsOnly only Maildir
*/
using Handler = std::function< using Handler = std::function<
bool(const std::string& fullpath, struct stat* statbuf, HandleType htype)>; bool(const std::string& path, struct stat* statbuf, HandleType htype)>;
/**
* Running mode for this Scanner
*/
enum struct Mode {
All, /**< Vanilla */
MaildirsOnly /**< Only return maildir to handler */
};
/** /**
* Construct a scanner object for scanning a directory, recursively. * Construct a scanner object for scanning a directory, recursively.
* *
@ -60,15 +83,16 @@ class Scanner {
* *
* @param root_dir root dir to start scanning * @param root_dir root dir to start scanning
* @param handler handler function for some direntry * @param handler handler function for some direntry
* @param mode scanning mode (Mode::All or Mode::MaildirsOnly)
*/ */
Scanner(const std::string& root_dir, Handler handler); Scanner(const std::string& root_dir, Handler handler, Mode mode = Mode::All);
/** /**
* DTOR * DTOR
*/ */
~Scanner(); ~Scanner();
/** /**
* Start the scan; this is a blocking call that runs until * Start the scan; this is a blocking call that runs until
* finished or (from another thread) stop() is called. * finished or (from another thread) stop() is called.
* *

View File

@ -47,20 +47,21 @@ endif
# compilers / flags # compilers / flags
# #
extra_flags = [ extra_flags = [
'-Wc11-extensions', # for clang
'-Wno-unused-parameter', '-Wno-unused-parameter',
'-Wno-cast-function-type', '-Wno-cast-function-type',
'-Wformat-security', '-Wformat-security',
'-Wformat=2', '-Wformat=2',
'-Wstack-protector', '-Wstack-protector',
'-Wno-switch-enum', '-Wno-switch-enum',
'-Wno-keyword-macro',
'-Wno-volatile', '-Wno-volatile',
'-Wno-deprecated-volatile',
'-Wno-#warnings',
# assuming these are false alarm... (in fmt, with gcc13): # assuming these are false alarm... (in fmt, with gcc13):
'-Wno-array-bounds', '-Wno-array-bounds',
'-Wno-stringop-overflow', '-Wno-stringop-overflow',
# clang
'-Wc11-extensions', # for clang
'-Wno-keyword-macro',
'-Wno-deprecated-volatile',
'-Wno-#warnings',
] ]
if get_option('buildtype') == 'debug' if get_option('buildtype') == 'debug'
@ -104,6 +105,20 @@ add_project_arguments(['-DHAVE_CONFIG_H'], language: 'cpp')
config_h_dep=declare_dependency( config_h_dep=declare_dependency(
include_directories: include_directories(['.'])) include_directories: include_directories(['.']))
#
# d_type, d_ino are not available universally, so let's check
# (we use them for optimizations in mu-scanner); when found, set
# HAVE_DIRENT_D_INO / HAVE_DIRENT_D_TYPE in the config data.
#
if cxx.has_member('struct dirent', 'd_ino', prefix : '#include<dirent.h>')
config_h_data.set('HAVE_DIRENT_D_INO', 1)
endif
if cxx.has_member('struct dirent', 'd_type', prefix : '#include<dirent.h>')
config_h_data.set('HAVE_DIRENT_D_TYPE', 1)
endif
functions=[ functions=[
'setsid' 'setsid'
] ]