diff --git a/man/mu-index.1 b/man/mu-index.1 index cc1e45e1..1a00bfb6 100644 --- a/man/mu-index.1 +++ b/man/mu-index.1 @@ -90,18 +90,22 @@ set the maximum number of messages to process in a single Xapian transaction. In practice, this option is only useful if you find that \fBmu\fR is running out of memory while indexing; in that case, you can set the batch size to (for example) 1000, which will reduce memory consumption, but also -reduce performance. +substantially reduce the indexing performance. + +.TP +\fB\-\-max-msg-size\fR=\fI<size>\fR +set the maximum size (in bytes) for messages. The default maximum (currently +at 50 MB) should be enough in most cases, but if you encounter warnings from +\fBmu\fR about ignoring messages because they are too big, you may want to +increase this. Note that the reason for having a maximum size is that big +messages require big memory allocations, which may lead to problems. .B NOTE: -It is generally not a good idea to run multiple instances of \fBmu index\fR -concurrently. No data loss should occur, but one or more of the instances may -experience errors due to database locks. - -Furthermore, it is not recommended tot mix maildirs and sub-maildirs within -the hierarchy in the same database; for example, it's better not to index both -with \fB\-\-maildir\fR=~/MyMaildir and \fB\-\-maildir\fR=~/MyMaildir/foo, as -this may lead to unexpected results when searching with the the 'maildir:' -search parameter (see below). +It is not recommended to mix maildirs and sub-maildirs within the hierarchy +in the same database; for example, it's better not to index both with +\fB\-\-maildir\fR=~/MyMaildir and \fB\-\-maildir\fR=~/MyMaildir/foo, as this +may lead to unexpected results when searching with the 'maildir:' search +parameter (see below). 
.SS A note on performance As a non-scientific benchmark, a simple test on the authors machine (a diff --git a/src/mu-cmd-index.c b/src/mu-cmd-index.c index 4c2d9496..99c2dff3 100644 --- a/src/mu-cmd-index.c +++ b/src/mu-cmd-index.c @@ -94,7 +94,12 @@ check_index_or_cleanup_params (MuConfig *opts) g_warning ("the Xapian batch size must be non-negative"); return FALSE; } - + + if (opts->max_msg_size < 0) { + g_warning ("the maximum message size must be non-negative"); + return FALSE; + } + return TRUE; } @@ -346,10 +351,12 @@ cmd_index_or_cleanup (MuConfig *opts) return MU_EXITCODE_ERROR; err = NULL; - if (!(midx = mu_index_new - (mu_runtime_xapian_dir(), opts->xbatchsize, &err))) + if (!(midx = mu_index_new (mu_runtime_xapian_dir(), &err))) return handle_index_error_and_free (err); - + + mu_index_set_max_msg_size (midx, opts->max_msg_size); + mu_index_set_xbatch_size (midx, opts->xbatchsize); + /* we determine the maildir path only here, as it may depend on * mu_index_last_used_maildir */ diff --git a/src/mu-config.c b/src/mu-config.c index 661b947d..96d20b8a 100644 --- a/src/mu-config.c +++ b/src/mu-config.c @@ -96,6 +96,8 @@ config_options_group_index (MuConfig * opts) "don't clean up the database after indexing (false)", NULL}, {"xbatchsize", 0, 0, G_OPTION_ARG_INT, &opts->xbatchsize, "set transaction batchsize for xapian commits (0)", NULL}, + {"max-msg-size", 0, 0, G_OPTION_ARG_INT, &opts->max_msg_size, + "set the maximum size for message files", NULL}, {NULL, 0, 0, 0, NULL, NULL, NULL} }; diff --git a/src/mu-config.h b/src/mu-config.h index 3d6c1cff..0d7d17dc 100644 --- a/src/mu-config.h +++ b/src/mu-config.h @@ -72,9 +72,10 @@ struct _MuConfig { gboolean rebuild; /* empty the database before indexing */ gboolean autoupgrade; /* automatically upgrade db * when needed */ - int xbatchsize; /* batchsize for xapian + int xbatchsize; /* batchsize for xapian * commits, or 0 for * default */ + int max_msg_size; /* maximum size for message files */ /* options for 
querying */ gboolean xquery; /* (obsolete) give the Xapian diff --git a/src/mu-index.c b/src/mu-index.c index a93b1a98..34e3562e 100644 --- a/src/mu-index.c +++ b/src/mu-index.c @@ -35,16 +35,17 @@ #include "mu-util.h" #define MU_LAST_USED_MAILDIR_KEY "last_used_maildir" -#define MU_MAILDIR_WALK_MAX_FILE_SIZE (64*1000*1000) +#define MU_INDEX_MAX_FILE_SIZE (50*1000*1000) /* 50 Mb */ struct _MuIndex { MuStore *_store; gboolean _needs_reindex; gchar *_last_used_maildir; + guint _max_filesize; }; MuIndex* -mu_index_new (const char *xpath, guint xbatchsize, GError **err) +mu_index_new (const char *xpath, GError **err) { MuIndex *index; @@ -52,7 +53,7 @@ mu_index_new (const char *xpath, guint xbatchsize, GError **err) index = g_new0 (MuIndex, 1); - index->_store = mu_store_new (xpath, xbatchsize, err); + index->_store = mu_store_new (xpath, err); if (!index->_store) { g_warning ("%s: failed to open xapian store (%s)", __FUNCTION__, xpath); @@ -60,6 +61,9 @@ mu_index_new (const char *xpath, guint xbatchsize, GError **err) return NULL; } + /* set the default max file size */ + index->_max_filesize = MU_INDEX_MAX_FILE_SIZE; + /* see we need to reindex the database; note, there is a small * race-condition here, between mu_index_new and * mu_index_run. 
Maybe do the check in mu_index_run @@ -73,7 +77,6 @@ mu_index_new (const char *xpath, guint xbatchsize, GError **err) return index; } - void mu_index_destroy (MuIndex *index) { @@ -88,13 +91,14 @@ mu_index_destroy (MuIndex *index) struct _MuIndexCallbackData { - MuIndexMsgCallback _idx_msg_cb; - MuIndexDirCallback _idx_dir_cb; - MuStore* _store; - void* _user_data; - MuIndexStats* _stats; - gboolean _reindex; - time_t _dirstamp; + MuIndexMsgCallback _idx_msg_cb; + MuIndexDirCallback _idx_dir_cb; + MuStore* _store; + void* _user_data; + MuIndexStats* _stats; + gboolean _reindex; + time_t _dirstamp; + guint _max_filesize; }; typedef struct _MuIndexCallbackData MuIndexCallbackData; @@ -181,9 +185,9 @@ on_run_maildir_msg (const char* fullpath, const char* mdir, gboolean updated; /* protect against too big messages */ - if (G_UNLIKELY(statbuf->st_size > MU_MAILDIR_WALK_MAX_FILE_SIZE)) { - g_warning ("ignoring because bigger than %d bytes: %s", - MU_MAILDIR_WALK_MAX_FILE_SIZE, fullpath); + if (G_UNLIKELY(statbuf->st_size > data->_max_filesize)) { + g_warning ("ignoring because bigger than %u bytes: %s", + data->_max_filesize, fullpath); return MU_OK; /* not an error */ } @@ -200,10 +204,6 @@ on_run_maildir_msg (const char* fullpath, const char* mdir, if (result == MU_OK && data && data->_stats) { /* update statistics */ ++data->_stats->_processed; updated ? 
++data->_stats->_updated : ++data->_stats->_uptodate; - /* if (updated) */ - /* ++data->_stats->_updated; */ - /* else */ - /* ++data->_stats->_uptodate; */ } return result; @@ -260,7 +260,7 @@ check_path (const char* path) static void init_cb_data (MuIndexCallbackData *cb_data, MuStore *xapian, - gboolean reindex, MuIndexStats *stats, + gboolean reindex, guint max_filesize, MuIndexStats *stats, MuIndexMsgCallback msg_cb, MuIndexDirCallback dir_cb, void *user_data) { @@ -268,11 +268,12 @@ init_cb_data (MuIndexCallbackData *cb_data, MuStore *xapian, cb_data->_idx_dir_cb = dir_cb; cb_data->_user_data = user_data; - cb_data->_store = xapian; + cb_data->_store = xapian; cb_data->_reindex = reindex; cb_data->_dirstamp = 0; - + cb_data->_max_filesize = max_filesize; + cb_data->_stats = stats; if (cb_data->_stats) memset (cb_data->_stats, 0, sizeof(MuIndexStats)); @@ -298,6 +299,26 @@ mu_index_last_used_maildir (MuIndex *index) MU_LAST_USED_MAILDIR_KEY); } +void +mu_index_set_max_msg_size (MuIndex *index, guint max_size) +{ + g_return_if_fail (index); + + if (max_size == 0) + index->_max_filesize = MU_INDEX_MAX_FILE_SIZE; + else + index->_max_filesize = max_size; +} + +void +mu_index_set_xbatch_size (MuIndex *index, guint xbatchsize) +{ + g_return_if_fail (index); + mu_store_set_batch_size (index->_store, xbatchsize); +} + + + MuResult mu_index_run (MuIndex *index, const char* path, @@ -319,7 +340,8 @@ mu_index_run (MuIndex *index, const char* path, return MU_ERROR; } - init_cb_data (&cb_data, index->_store, reindex, stats, + init_cb_data (&cb_data, index->_store, reindex, + index->_max_filesize, stats, msg_cb, dir_cb, user_data); update_last_used_maildir (index, path); @@ -400,15 +422,11 @@ typedef struct _CleanupData CleanupData; static MuResult foreach_doc_cb (const char* path, CleanupData *cudata) { - MuResult rv; - if (access (path, R_OK) != 0) { - - g_debug ("not readable: %s; removing", path); - rv = mu_store_remove (cudata->_store, path); - if (rv != MU_OK) - 
return rv; /* something went wrong... bail out */ - + if (errno != EACCES) + g_warning ("cannot access %s: %s", path, strerror(errno)); + if (!mu_store_remove (cudata->_store, path)) + return MU_ERROR; /* something went wrong... bail out */ if (cudata->_stats) ++cudata->_stats->_cleaned_up; } diff --git a/src/mu-index.h b/src/mu-index.h index 1d50f076..c0c090ef 100644 --- a/src/mu-index.h +++ b/src/mu-index.h @@ -46,13 +46,12 @@ typedef struct _MuIndexStats MuIndexStats; * * @param xpath path to the 'homedir'; the xapian directory will be * this homedir/xapian - * @param batchsize for Xapian queries, or 0 for the default * @param err to receive error or NULL; there are only errors when this * function returns NULL. Possible errors: see mu-error.h * * @return a new MuIndex instance, or NULL in case of error */ -MuIndex* mu_index_new (const char* muhome, guint batchsize, GError **err) +MuIndex* mu_index_new (const char* muhome, GError **err) G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT; @@ -64,6 +63,28 @@ MuIndex* mu_index_new (const char* muhome, guint batchsize, GError **err) void mu_index_destroy (MuIndex *index); +/** + * change the maximum file size that mu-index considers from its + * default (MU_INDEX_MAX_FILE_SIZE). 
Note that the maximum size is a + * protection against mu (or the libraries it uses) allocating too + * much memory, which can lead to problems + * + * @param index a mu index object + * @param max_size the maximum msg size, or 0 to reset to the default + */ +void mu_index_set_max_msg_size (MuIndex *index, guint max_size); + + +/** + * change batch size for Xapian store transactions (see + * 'mu_store_set_batch_size') + * + * @param index a mu index object + * @param xbatchsize the batch size, or 0 to reset to the default + */ +void mu_index_set_xbatch_size (MuIndex *index, guint xbatchsize); + + /** * get the maildir for the last run of indexing for the * current database diff --git a/src/mu-store.cc b/src/mu-store.cc index 4cda56b1..ed0056e4 100644 --- a/src/mu-store.cc +++ b/src/mu-store.cc @@ -1,5 +1,5 @@ /* -** Copyright (C) 2008-2010 Dirk-Jan C. Binnema +** Copyright (C) 2008-2011 Dirk-Jan C. Binnema ** ** This program is free software; you can redistribute it and/or modify it ** under the terms of the GNU General Public License as published by the @@ -119,7 +119,7 @@ check_version (MuStore *store) } MuStore* -mu_store_new (const char* xpath, guint batchsize, GError **err) +mu_store_new (const char* xpath, GError **err) { MuStore *store (0); @@ -139,7 +139,7 @@ mu_store_new (const char* xpath, guint batchsize, GError **err) /* keep count of processed docs */ store->_in_transaction = false; store->_processed = 0; - store->_trx_size = batchsize ? 
batchsize : MU_STORE_DEFAULT_TRX_SIZE; + store->_trx_size = MU_STORE_DEFAULT_TRX_SIZE; add_synonyms (store); @@ -177,6 +177,18 @@ mu_store_destroy (MuStore *store) } +void +mu_store_set_batch_size (MuStore *store, guint batchsize) +{ + g_return_if_fail (store); + + if (batchsize == 0) + store->_trx_size = MU_STORE_DEFAULT_TRX_SIZE; + else + store->_trx_size = batchsize; +} + + unsigned mu_store_count (MuStore *store) @@ -521,11 +533,11 @@ mu_store_store (MuStore *store, MuMsg *msg) } -MuResult -mu_store_remove (MuStore *store, const char* msgpath) +gboolean +mu_store_remove (MuStore *store, const char *msgpath) { - g_return_val_if_fail (store, MU_ERROR); - g_return_val_if_fail (msgpath, MU_ERROR); + g_return_val_if_fail (store, FALSE); + g_return_val_if_fail (msgpath, FALSE); try { const std::string uid (get_message_uid (msgpath)); @@ -533,18 +545,15 @@ mu_store_remove (MuStore *store, const char* msgpath) begin_trx_if (store, !store->_in_transaction); store->_db->delete_document (uid); - g_debug ("deleting %s", msgpath); - ++store->_processed; /* do we need to commit now? */ bool commit_now = store->_processed % store->_trx_size == 0; commit_trx_if (store, commit_now); - return MU_OK; + return TRUE; - } MU_XAPIAN_CATCH_BLOCK_RETURN (MU_ERROR); - + } MU_XAPIAN_CATCH_BLOCK_RETURN (FALSE); } gboolean diff --git a/src/mu-store.h b/src/mu-store.h index 78a8069b..53069b30 100644 --- a/src/mu-store.h +++ b/src/mu-store.h @@ -41,7 +41,7 @@ typedef struct _MuStore MuStore; * * @return a new MuStore object, or NULL in case of error */ -MuStore* mu_store_new (const char *path, guint batchsize, GError **err) +MuStore* mu_store_new (const char *path, GError **err) G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT; @@ -53,6 +53,20 @@ MuStore* mu_store_new (const char *path, guint batchsize, GError **err) void mu_store_destroy (MuStore *store); + +/** + * set the Xapian batch size for this store. 
Normally, there's no need + * to use this function as the default is good enough; however, if you + * use mu in a very memory-constrained environment, you can set the + * batchsize to e.g. 1000 at the cost of significant slow-down. + * + * @param store a valid store object + * @param batchsize the new batch size; or 0 to reset to + * the default batch size + */ +void mu_store_set_batch_size (MuStore *store, guint batchsize); + + /** * get the numbers of documents in the database * @@ -104,8 +118,7 @@ MuResult mu_store_store (MuStore *store, MuMsg *msg); * * @return TRUE if it succeeded, FALSE otherwise */ -MuResult mu_store_remove (MuStore *store, - const char* msgpath); +gboolean mu_store_remove (MuStore *store, const char* msgpath); /** @@ -114,10 +127,9 @@ MuResult mu_store_remove (MuStore *store, * @param store a store * @param path the message path * - * @return + * @return TRUE if the message exists, FALSE otherwise */ -gboolean mu_store_contains_message (MuStore *store, - const char* path); +gboolean mu_store_contains_message (MuStore *store, const char* path); /** * store a timestamp for a directory diff --git a/src/tests/test-mu-cmd.c b/src/tests/test-mu-cmd.c index 237d9ec8..80cf45e1 100644 --- a/src/tests/test-mu-cmd.c +++ b/src/tests/test-mu-cmd.c @@ -110,7 +110,7 @@ test_mu_index (void) xpath = g_strdup_printf ("%s%c%s", muhome, G_DIR_SEPARATOR, "xapian"); - store = mu_store_new (xpath, 0, NULL); + store = mu_store_new (xpath, NULL); g_assert (store); g_assert_cmpuint (mu_store_count (store), ==, 5); diff --git a/src/tests/test-mu-store.c b/src/tests/test-mu-store.c index 0511f65e..4329dae9 100644 --- a/src/tests/test-mu-store.c +++ b/src/tests/test-mu-store.c @@ -42,7 +42,7 @@ test_mu_store_new_destroy (void) g_assert (tmpdir); err = NULL; - store = mu_store_new (tmpdir, 12345, &err); + store = mu_store_new (tmpdir, &err); g_assert (store); g_assert (err == NULL); @@ -67,7 +67,7 @@ test_mu_store_version (void) g_assert (tmpdir); err = NULL; - store = 
mu_store_new (tmpdir, 789, &err); + store = mu_store_new (tmpdir, &err); g_assert (store); g_assert (err == NULL); @@ -90,7 +90,7 @@ test_mu_store_store_and_count (void) tmpdir = test_mu_common_get_random_tmpdir(); g_assert (tmpdir); - store = mu_store_new (tmpdir, 1, NULL); + store = mu_store_new (tmpdir, NULL); g_assert (store); g_assert_cmpuint (0,==,mu_store_count (store)); @@ -138,7 +138,7 @@ test_mu_store_store_remove_and_count (void) tmpdir = test_mu_common_get_random_tmpdir(); g_assert (tmpdir); - store = mu_store_new (tmpdir, 0, NULL); + store = mu_store_new (tmpdir, NULL); g_assert (store); g_assert_cmpuint (0,==,mu_store_count (store)); diff --git a/toys/mug2/mug.cc b/toys/mug2/mug.cc index 63ffbecf..28fdab0d 100644 --- a/toys/mug2/mug.cc +++ b/toys/mug2/mug.cc @@ -66,7 +66,7 @@ reindex (MugData *mugdata) return; err = NULL; - midx = mu_index_new (mu_runtime_xapian_dir(), 0, &err); + midx = mu_index_new (mu_runtime_xapian_dir(), &err); if (!midx) { if (err && err->code == MU_ERROR_XAPIAN_CANNOT_GET_WRITELOCK) { g_warning ("database busy...");