* make mag-msg-store a configurable parameter; refactor the setting of the

xapian batch size
This commit is contained in:
Dirk-Jan C. Binnema 2011-01-15 13:27:41 +02:00
parent 62acc7739d
commit c76af05a7a
11 changed files with 145 additions and 71 deletions

View File

@ -90,18 +90,22 @@ set the maximum number of messages to process in a single Xapian
transaction. In practice, this option is only useful if you find that \fBmu\fR
is running out of memory while indexing; in that case, you can set the batch
size to (for example) 1000, which will reduce memory consumption, but also
reduce performance.
substantially reduce the indexing performance.
.TP
\fB\-\-max-msg-size\fR=\fI<max msg size>\fR
set the maximum size (in bytes) for messages. The default maximum (currently
at 50Mb) should be enough in most cases, but if you encounter warnings from
\fBmu\fR about ignoring messages because they are too big, you may want to
increase this. Note that the reason for having a maximum size is that big
messages require big memory allocations, which may lead to problems.
.B NOTE:
It is generally not a good idea to run multiple instances of \fBmu index\fR
concurrently. No data loss should occur, but one or more of the instances may
experience errors due to database locks.
Furthermore, it is not recommended to mix maildirs and sub-maildirs within
the hierarchy in the same database; for example, it's better not to index both
with \fB\-\-maildir\fR=~/MyMaildir and \fB\-\-maildir\fR=~/MyMaildir/foo, as
this may lead to unexpected results when searching with the 'maildir:'
search parameter (see below).
It is not recommended to mix maildirs and sub-maildirs within the hierarchy
in the same database; for example, it's better not to index both with
\fB\-\-maildir\fR=~/MyMaildir and \fB\-\-maildir\fR=~/MyMaildir/foo, as this
may lead to unexpected results when searching with the 'maildir:' search
parameter (see below).
.SS A note on performance
As a non-scientific benchmark, a simple test on the author's machine (a

View File

@ -94,7 +94,12 @@ check_index_or_cleanup_params (MuConfig *opts)
g_warning ("the Xapian batch size must be non-negative");
return FALSE;
}
if (opts->max_msg_size < 0) {
g_warning ("the maximum message size must be non-negative");
return FALSE;
}
return TRUE;
}
@ -346,10 +351,12 @@ cmd_index_or_cleanup (MuConfig *opts)
return MU_EXITCODE_ERROR;
err = NULL;
if (!(midx = mu_index_new
(mu_runtime_xapian_dir(), opts->xbatchsize, &err)))
if (!(midx = mu_index_new (mu_runtime_xapian_dir(), &err)))
return handle_index_error_and_free (err);
mu_index_set_max_msg_size (midx, opts->max_msg_size);
mu_index_set_xbatch_size (midx, opts->xbatchsize);
/* we determine the maildir path only here, as it may depend on
* mu_index_last_used_maildir
*/

View File

@ -96,6 +96,8 @@ config_options_group_index (MuConfig * opts)
"don't clean up the database after indexing (false)", NULL},
{"xbatchsize", 0, 0, G_OPTION_ARG_INT, &opts->xbatchsize,
"set transaction batchsize for xapian commits (0)", NULL},
{"max-msg-size", 0, 0, G_OPTION_ARG_INT, &opts->max_msg_size,
"set the maximum size for message files", NULL},
{NULL, 0, 0, 0, NULL, NULL, NULL}
};

View File

@ -72,9 +72,10 @@ struct _MuConfig {
gboolean rebuild; /* empty the database before indexing */
gboolean autoupgrade; /* automatically upgrade db
* when needed */
int xbatchsize; /* batchsize for xapian
int xbatchsize; /* batchsize for xapian
* commits, or 0 for
* default */
int max_msg_size; /* maximum size for message files */
/* options for querying */
gboolean xquery; /* (obsolete) give the Xapian

View File

@ -35,16 +35,17 @@
#include "mu-util.h"
#define MU_LAST_USED_MAILDIR_KEY "last_used_maildir"
#define MU_MAILDIR_WALK_MAX_FILE_SIZE (64*1000*1000)
#define MU_INDEX_MAX_FILE_SIZE (50*1000*1000) /* 50 Mb */
/* State of one index object, as allocated and filled in by mu_index_new. */
struct _MuIndex {
MuStore *_store; /* the store obtained from mu_store_new */
gboolean _needs_reindex;
gchar *_last_used_maildir;
guint _max_filesize; /* size limit in bytes; bigger files are
* ignored when indexing. Defaults to
* MU_INDEX_MAX_FILE_SIZE */
};
MuIndex*
mu_index_new (const char *xpath, guint xbatchsize, GError **err)
mu_index_new (const char *xpath, GError **err)
{
MuIndex *index;
@ -52,7 +53,7 @@ mu_index_new (const char *xpath, guint xbatchsize, GError **err)
index = g_new0 (MuIndex, 1);
index->_store = mu_store_new (xpath, xbatchsize, err);
index->_store = mu_store_new (xpath, err);
if (!index->_store) {
g_warning ("%s: failed to open xapian store (%s)",
__FUNCTION__, xpath);
@ -60,6 +61,9 @@ mu_index_new (const char *xpath, guint xbatchsize, GError **err)
return NULL;
}
/* set the default max file size */
index->_max_filesize = MU_INDEX_MAX_FILE_SIZE;
/* see if we need to reindex the database; note, there is a small
* race-condition here, between mu_index_new and
* mu_index_run. Maybe do the check in mu_index_run
@ -73,7 +77,6 @@ mu_index_new (const char *xpath, guint xbatchsize, GError **err)
return index;
}
void
mu_index_destroy (MuIndex *index)
{
@ -88,13 +91,14 @@ mu_index_destroy (MuIndex *index)
struct _MuIndexCallbackData {
MuIndexMsgCallback _idx_msg_cb;
MuIndexDirCallback _idx_dir_cb;
MuStore* _store;
void* _user_data;
MuIndexStats* _stats;
gboolean _reindex;
time_t _dirstamp;
MuIndexMsgCallback _idx_msg_cb;
MuIndexDirCallback _idx_dir_cb;
MuStore* _store;
void* _user_data;
MuIndexStats* _stats;
gboolean _reindex;
time_t _dirstamp;
guint _max_filesize;
};
typedef struct _MuIndexCallbackData MuIndexCallbackData;
@ -181,9 +185,9 @@ on_run_maildir_msg (const char* fullpath, const char* mdir,
gboolean updated;
/* protect against too big messages */
if (G_UNLIKELY(statbuf->st_size > MU_MAILDIR_WALK_MAX_FILE_SIZE)) {
g_warning ("ignoring because bigger than %d bytes: %s",
MU_MAILDIR_WALK_MAX_FILE_SIZE, fullpath);
if (G_UNLIKELY(statbuf->st_size > data->_max_filesize)) {
g_warning ("ignoring because bigger than %u bytes: %s",
data->_max_filesize, fullpath);
return MU_OK; /* not an error */
}
@ -200,10 +204,6 @@ on_run_maildir_msg (const char* fullpath, const char* mdir,
if (result == MU_OK && data && data->_stats) { /* update statistics */
++data->_stats->_processed;
updated ? ++data->_stats->_updated : ++data->_stats->_uptodate;
/* if (updated) */
/* ++data->_stats->_updated; */
/* else */
/* ++data->_stats->_uptodate; */
}
return result;
@ -260,7 +260,7 @@ check_path (const char* path)
static void
init_cb_data (MuIndexCallbackData *cb_data, MuStore *xapian,
gboolean reindex, MuIndexStats *stats,
gboolean reindex, guint max_filesize, MuIndexStats *stats,
MuIndexMsgCallback msg_cb, MuIndexDirCallback dir_cb,
void *user_data)
{
@ -268,11 +268,12 @@ init_cb_data (MuIndexCallbackData *cb_data, MuStore *xapian,
cb_data->_idx_dir_cb = dir_cb;
cb_data->_user_data = user_data;
cb_data->_store = xapian;
cb_data->_store = xapian;
cb_data->_reindex = reindex;
cb_data->_dirstamp = 0;
cb_data->_max_filesize = max_filesize;
cb_data->_stats = stats;
if (cb_data->_stats)
memset (cb_data->_stats, 0, sizeof(MuIndexStats));
@ -298,6 +299,26 @@ mu_index_last_used_maildir (MuIndex *index)
MU_LAST_USED_MAILDIR_KEY);
}
/*
 * Set the maximum file size (in bytes) for this index. Passing 0 for
 * max_size restores the built-in default, MU_INDEX_MAX_FILE_SIZE.
 * Does nothing (besides the g_return_if_fail diagnostics) when index
 * is NULL.
 */
void
mu_index_set_max_msg_size (MuIndex *index, guint max_size)
{
	g_return_if_fail (index);

	/* 0 is not a real limit; it means "use the default" */
	index->_max_filesize = max_size ? max_size : MU_INDEX_MAX_FILE_SIZE;
}
/*
 * Set the Xapian transaction batch size for this index by forwarding
 * xbatchsize to mu_store_set_batch_size on the index's store.
 */
void
mu_index_set_xbatch_size (MuIndex *index, guint xbatchsize)
{
	MuStore *store;

	g_return_if_fail (index);

	store = index->_store;
	mu_store_set_batch_size (store, xbatchsize);
}
MuResult
mu_index_run (MuIndex *index, const char* path,
@ -319,7 +340,8 @@ mu_index_run (MuIndex *index, const char* path,
return MU_ERROR;
}
init_cb_data (&cb_data, index->_store, reindex, stats,
init_cb_data (&cb_data, index->_store, reindex,
index->_max_filesize, stats,
msg_cb, dir_cb, user_data);
update_last_used_maildir (index, path);
@ -400,15 +422,11 @@ typedef struct _CleanupData CleanupData;
static MuResult
foreach_doc_cb (const char* path, CleanupData *cudata)
{
MuResult rv;
if (access (path, R_OK) != 0) {
g_debug ("not readable: %s; removing", path);
rv = mu_store_remove (cudata->_store, path);
if (rv != MU_OK)
return rv; /* something went wrong... bail out */
if (errno != EACCES)
g_warning ("cannot access %s: %s", path, strerror(errno));
if (!mu_store_remove (cudata->_store, path))
return MU_ERROR; /* something went wrong... bail out */
if (cudata->_stats)
++cudata->_stats->_cleaned_up;
}

View File

@ -46,13 +46,12 @@ typedef struct _MuIndexStats MuIndexStats;
*
* @param xpath path to the 'homedir'; the xapian directory will be
* this homedir/xapian
* @param batchsize for Xapian queries, or 0 for the default
* @param err to receive error or NULL; there are only errors when this
* function returns NULL. Possible errors: see mu-error.h
*
* @return a new MuIndex instance, or NULL in case of error
*/
MuIndex* mu_index_new (const char* muhome, guint batchsize, GError **err)
MuIndex* mu_index_new (const char* muhome, GError **err)
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
@ -64,6 +63,28 @@ MuIndex* mu_index_new (const char* muhome, guint batchsize, GError **err)
void mu_index_destroy (MuIndex *index);
/**
* change the maximum file size that mu-index considers from its
* default (MU_INDEX_MAX_FILE_SIZE). Note that the maximum size is a
* protection against mu (or the libraries it uses) allocating too
* much memory, which can lead to problems
*
* @param index a mu index object
* @param max_size the maximum msg size, or 0 to reset to the default
*/
void mu_index_set_max_msg_size (MuIndex *index, guint max_size);
/**
* change batch size for Xapian store transaction (see
* 'mu_store_set_batch_size')
*
* @param index a mu index object
* @param xbatchsize the batch size, or 0 to reset to the default
*/
void mu_index_set_xbatch_size (MuIndex *index, guint xbatchsize);
/**
* get the maildir for the last run of indexing for the
* current database

View File

@ -1,5 +1,5 @@
/*
** Copyright (C) 2008-2010 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
** Copyright (C) 2008-2011 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
@ -119,7 +119,7 @@ check_version (MuStore *store)
}
MuStore*
mu_store_new (const char* xpath, guint batchsize, GError **err)
mu_store_new (const char* xpath, GError **err)
{
MuStore *store (0);
@ -139,7 +139,7 @@ mu_store_new (const char* xpath, guint batchsize, GError **err)
/* keep count of processed docs */
store->_in_transaction = false;
store->_processed = 0;
store->_trx_size = batchsize ? batchsize : MU_STORE_DEFAULT_TRX_SIZE;
store->_trx_size = MU_STORE_DEFAULT_TRX_SIZE;
add_synonyms (store);
@ -177,6 +177,18 @@ mu_store_destroy (MuStore *store)
}
/*
 * Set the transaction (batch) size of the store. A batchsize of 0
 * resets it to MU_STORE_DEFAULT_TRX_SIZE.
 */
void
mu_store_set_batch_size (MuStore *store, guint batchsize)
{
	g_return_if_fail (store);

	/* 0 is the "back to default" marker, not an actual batch size */
	store->_trx_size = batchsize ? batchsize : MU_STORE_DEFAULT_TRX_SIZE;
}
unsigned
mu_store_count (MuStore *store)
@ -521,11 +533,11 @@ mu_store_store (MuStore *store, MuMsg *msg)
}
MuResult
mu_store_remove (MuStore *store, const char* msgpath)
gboolean
mu_store_remove (MuStore *store, const char *msgpath)
{
g_return_val_if_fail (store, MU_ERROR);
g_return_val_if_fail (msgpath, MU_ERROR);
g_return_val_if_fail (store, FALSE);
g_return_val_if_fail (msgpath, FALSE);
try {
const std::string uid (get_message_uid (msgpath));
@ -533,18 +545,15 @@ mu_store_remove (MuStore *store, const char* msgpath)
begin_trx_if (store, !store->_in_transaction);
store->_db->delete_document (uid);
g_debug ("deleting %s", msgpath);
++store->_processed;
/* do we need to commit now? */
bool commit_now = store->_processed % store->_trx_size == 0;
commit_trx_if (store, commit_now);
return MU_OK;
return TRUE;
} MU_XAPIAN_CATCH_BLOCK_RETURN (MU_ERROR);
} MU_XAPIAN_CATCH_BLOCK_RETURN (FALSE);
}
gboolean

View File

@ -41,7 +41,7 @@ typedef struct _MuStore MuStore;
*
* @return a new MuStore object, or NULL in case of error
*/
MuStore* mu_store_new (const char *path, guint batchsize, GError **err)
MuStore* mu_store_new (const char *path, GError **err)
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
@ -53,6 +53,20 @@ MuStore* mu_store_new (const char *path, guint batchsize, GError **err)
void mu_store_destroy (MuStore *store);
/**
* set the Xapian batch size for this store. Normally, there's no need
* to use this function as the default is good enough; however, if you
* use mu in a very memory-constrained environment, you can set the
* batchsize to e.g. 1000 at the cost of significant slow-down.
*
* @param store a valid store object
* @param batchsize the new batch size; or 0 to reset to
* the default batch size
*/
void mu_store_set_batch_size (MuStore *store, guint batchsize);
/**
* get the numbers of documents in the database
*
@ -104,8 +118,7 @@ MuResult mu_store_store (MuStore *store, MuMsg *msg);
*
* @return TRUE if it succeeded, FALSE otherwise
*/
MuResult mu_store_remove (MuStore *store,
const char* msgpath);
gboolean mu_store_remove (MuStore *store, const char* msgpath);
/**
@ -114,10 +127,9 @@ MuResult mu_store_remove (MuStore *store,
* @param store a store
* @param path the message path
*
* @return
* @return TRUE if the message exists, FALSE otherwise
*/
gboolean mu_store_contains_message (MuStore *store,
const char* path);
gboolean mu_store_contains_message (MuStore *store, const char* path);
/**
* store a timestamp for a directory

View File

@ -110,7 +110,7 @@ test_mu_index (void)
xpath = g_strdup_printf ("%s%c%s", muhome, G_DIR_SEPARATOR, "xapian");
store = mu_store_new (xpath, 0, NULL);
store = mu_store_new (xpath, NULL);
g_assert (store);
g_assert_cmpuint (mu_store_count (store), ==, 5);

View File

@ -42,7 +42,7 @@ test_mu_store_new_destroy (void)
g_assert (tmpdir);
err = NULL;
store = mu_store_new (tmpdir, 12345, &err);
store = mu_store_new (tmpdir, &err);
g_assert (store);
g_assert (err == NULL);
@ -67,7 +67,7 @@ test_mu_store_version (void)
g_assert (tmpdir);
err = NULL;
store = mu_store_new (tmpdir, 789, &err);
store = mu_store_new (tmpdir, &err);
g_assert (store);
g_assert (err == NULL);
@ -90,7 +90,7 @@ test_mu_store_store_and_count (void)
tmpdir = test_mu_common_get_random_tmpdir();
g_assert (tmpdir);
store = mu_store_new (tmpdir, 1, NULL);
store = mu_store_new (tmpdir, NULL);
g_assert (store);
g_assert_cmpuint (0,==,mu_store_count (store));
@ -138,7 +138,7 @@ test_mu_store_store_remove_and_count (void)
tmpdir = test_mu_common_get_random_tmpdir();
g_assert (tmpdir);
store = mu_store_new (tmpdir, 0, NULL);
store = mu_store_new (tmpdir, NULL);
g_assert (store);
g_assert_cmpuint (0,==,mu_store_count (store));

View File

@ -66,7 +66,7 @@ reindex (MugData *mugdata)
return;
err = NULL;
midx = mu_index_new (mu_runtime_xapian_dir(), 0, &err);
midx = mu_index_new (mu_runtime_xapian_dir(), &err);
if (!midx) {
if (err && err->code == MU_ERROR_XAPIAN_CANNOT_GET_WRITELOCK) {
g_warning ("database busy...");