mirror of https://github.com/djcb/mu.git
* make max-msg-size a configurable parameter; refactor the setting of the
xapian batch size
This commit is contained in:
parent
62acc7739d
commit
c76af05a7a
|
@ -90,18 +90,22 @@ set the maximum number of messages to process in a single Xapian
|
|||
transaction. In practice, this option is only useful if you find that \fBmu\fR
|
||||
is running out of memory while indexing; in that case, you can set the batch
|
||||
size to (for example) 1000, which will reduce memory consumption, but also
|
||||
reduce performance.
|
||||
substantially reduce the indexing performance.
|
||||
|
||||
.TP
|
||||
\fB\-\-max-msg-size\fR=\fI<max msg size>\fR
|
||||
set the maximum size (in bytes) for messages. The default maximum (currently
|
||||
at 50Mb) should be enough in most cases, but if you encounter warnings from
|
||||
\fBmu\fR about ignoring messages because they are too big, you may want to
|
||||
increase this. Note that the reason for having a maximum size is that big
|
||||
messages require big memory allocations, which may lead to problems.
|
||||
|
||||
.B NOTE:
|
||||
It is generally not a good idea to run multiple instances of \fBmu index\fR
|
||||
concurrently. No data loss should occur, but one or more of the instances may
|
||||
experience errors due to database locks.
|
||||
|
||||
Furthermore, it is not recommended to mix maildirs and sub-maildirs within
|
||||
the hierarchy in the same database; for example, it's better not to index both
|
||||
with \fB\-\-maildir\fR=~/MyMaildir and \fB\-\-maildir\fR=~/MyMaildir/foo, as
|
||||
this may lead to unexpected results when searching with the 'maildir:'
|
||||
search parameter (see below).
|
||||
It is not recommended to mix maildirs and sub-maildirs within the hierarchy
|
||||
in the same database; for example, it's better not to index both with
|
||||
\fB\-\-maildir\fR=~/MyMaildir and \fB\-\-maildir\fR=~/MyMaildir/foo, as this
|
||||
may lead to unexpected results when searching with the 'maildir:' search
|
||||
parameter (see below).
|
||||
|
||||
.SS A note on performance
|
||||
As a non-scientific benchmark, a simple test on the author's machine (a
|
||||
|
|
|
@ -94,7 +94,12 @@ check_index_or_cleanup_params (MuConfig *opts)
|
|||
g_warning ("the Xapian batch size must be non-negative");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
if (opts->max_msg_size < 0) {
|
||||
g_warning ("the maximum message size must be non-negative");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
@ -346,10 +351,12 @@ cmd_index_or_cleanup (MuConfig *opts)
|
|||
return MU_EXITCODE_ERROR;
|
||||
|
||||
err = NULL;
|
||||
if (!(midx = mu_index_new
|
||||
(mu_runtime_xapian_dir(), opts->xbatchsize, &err)))
|
||||
if (!(midx = mu_index_new (mu_runtime_xapian_dir(), &err)))
|
||||
return handle_index_error_and_free (err);
|
||||
|
||||
|
||||
mu_index_set_max_msg_size (midx, opts->max_msg_size);
|
||||
mu_index_set_xbatch_size (midx, opts->xbatchsize);
|
||||
|
||||
/* we determine the maildir path only here, as it may depend on
|
||||
* mu_index_last_used_maildir
|
||||
*/
|
||||
|
|
|
@ -96,6 +96,8 @@ config_options_group_index (MuConfig * opts)
|
|||
"don't clean up the database after indexing (false)", NULL},
|
||||
{"xbatchsize", 0, 0, G_OPTION_ARG_INT, &opts->xbatchsize,
|
||||
"set transaction batchsize for xapian commits (0)", NULL},
|
||||
{"max-msg-size", 0, 0, G_OPTION_ARG_INT, &opts->max_msg_size,
|
||||
"set the maximum size for message files", NULL},
|
||||
{NULL, 0, 0, 0, NULL, NULL, NULL}
|
||||
};
|
||||
|
||||
|
|
|
@ -72,9 +72,10 @@ struct _MuConfig {
|
|||
gboolean rebuild; /* empty the database before indexing */
|
||||
gboolean autoupgrade; /* automatically upgrade db
|
||||
* when needed */
|
||||
int xbatchsize; /* batchsize for xapian
|
||||
int xbatchsize; /* batchsize for xapian
|
||||
* commits, or 0 for
|
||||
* default */
|
||||
int max_msg_size; /* maximum size for message files */
|
||||
|
||||
/* options for querying */
|
||||
gboolean xquery; /* (obsolete) give the Xapian
|
||||
|
|
|
@ -35,16 +35,17 @@
|
|||
#include "mu-util.h"
|
||||
|
||||
#define MU_LAST_USED_MAILDIR_KEY "last_used_maildir"
|
||||
#define MU_MAILDIR_WALK_MAX_FILE_SIZE (64*1000*1000)
|
||||
#define MU_INDEX_MAX_FILE_SIZE (50*1000*1000) /* 50 Mb */
|
||||
|
||||
/* State of an index object as created by mu_index_new. */
struct _MuIndex {
|
||||
/* the store opened with mu_store_new in mu_index_new */
MuStore *_store;
|
||||
/* NOTE(review): presumably set when the database must be reindexed;
 * confirm against the check in mu_index_new */
gboolean _needs_reindex;
|
||||
/* NOTE(review): presumably the maildir of the last indexing run;
 * confirm against update_last_used_maildir */
gchar *_last_used_maildir;
|
||||
/* maximum message file size; set to MU_INDEX_MAX_FILE_SIZE in
 * mu_index_new, changed with mu_index_set_max_msg_size and handed
 * to init_cb_data by mu_index_run */
guint _max_filesize;
|
||||
};
|
||||
|
||||
MuIndex*
|
||||
mu_index_new (const char *xpath, guint xbatchsize, GError **err)
|
||||
mu_index_new (const char *xpath, GError **err)
|
||||
{
|
||||
MuIndex *index;
|
||||
|
||||
|
@ -52,7 +53,7 @@ mu_index_new (const char *xpath, guint xbatchsize, GError **err)
|
|||
|
||||
index = g_new0 (MuIndex, 1);
|
||||
|
||||
index->_store = mu_store_new (xpath, xbatchsize, err);
|
||||
index->_store = mu_store_new (xpath, err);
|
||||
if (!index->_store) {
|
||||
g_warning ("%s: failed to open xapian store (%s)",
|
||||
__FUNCTION__, xpath);
|
||||
|
@ -60,6 +61,9 @@ mu_index_new (const char *xpath, guint xbatchsize, GError **err)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/* set the default max file size */
|
||||
index->_max_filesize = MU_INDEX_MAX_FILE_SIZE;
|
||||
|
||||
/* see if we need to reindex the database; note, there is a small
|
||||
* race-condition here, between mu_index_new and
|
||||
* mu_index_run. Maybe do the check in mu_index_run
|
||||
|
@ -73,7 +77,6 @@ mu_index_new (const char *xpath, guint xbatchsize, GError **err)
|
|||
return index;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
mu_index_destroy (MuIndex *index)
|
||||
{
|
||||
|
@ -88,13 +91,14 @@ mu_index_destroy (MuIndex *index)
|
|||
|
||||
|
||||
struct _MuIndexCallbackData {
|
||||
MuIndexMsgCallback _idx_msg_cb;
|
||||
MuIndexDirCallback _idx_dir_cb;
|
||||
MuStore* _store;
|
||||
void* _user_data;
|
||||
MuIndexStats* _stats;
|
||||
gboolean _reindex;
|
||||
time_t _dirstamp;
|
||||
MuIndexMsgCallback _idx_msg_cb;
|
||||
MuIndexDirCallback _idx_dir_cb;
|
||||
MuStore* _store;
|
||||
void* _user_data;
|
||||
MuIndexStats* _stats;
|
||||
gboolean _reindex;
|
||||
time_t _dirstamp;
|
||||
guint _max_filesize;
|
||||
};
|
||||
typedef struct _MuIndexCallbackData MuIndexCallbackData;
|
||||
|
||||
|
@ -181,9 +185,9 @@ on_run_maildir_msg (const char* fullpath, const char* mdir,
|
|||
gboolean updated;
|
||||
|
||||
/* protect against too big messages */
|
||||
if (G_UNLIKELY(statbuf->st_size > MU_MAILDIR_WALK_MAX_FILE_SIZE)) {
|
||||
g_warning ("ignoring because bigger than %d bytes: %s",
|
||||
MU_MAILDIR_WALK_MAX_FILE_SIZE, fullpath);
|
||||
if (G_UNLIKELY(statbuf->st_size > data->_max_filesize)) {
|
||||
g_warning ("ignoring because bigger than %u bytes: %s",
|
||||
data->_max_filesize, fullpath);
|
||||
return MU_OK; /* not an error */
|
||||
}
|
||||
|
||||
|
@ -200,10 +204,6 @@ on_run_maildir_msg (const char* fullpath, const char* mdir,
|
|||
if (result == MU_OK && data && data->_stats) { /* update statistics */
|
||||
++data->_stats->_processed;
|
||||
updated ? ++data->_stats->_updated : ++data->_stats->_uptodate;
|
||||
/* if (updated) */
|
||||
/* ++data->_stats->_updated; */
|
||||
/* else */
|
||||
/* ++data->_stats->_uptodate; */
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -260,7 +260,7 @@ check_path (const char* path)
|
|||
|
||||
static void
|
||||
init_cb_data (MuIndexCallbackData *cb_data, MuStore *xapian,
|
||||
gboolean reindex, MuIndexStats *stats,
|
||||
gboolean reindex, guint max_filesize, MuIndexStats *stats,
|
||||
MuIndexMsgCallback msg_cb, MuIndexDirCallback dir_cb,
|
||||
void *user_data)
|
||||
{
|
||||
|
@ -268,11 +268,12 @@ init_cb_data (MuIndexCallbackData *cb_data, MuStore *xapian,
|
|||
cb_data->_idx_dir_cb = dir_cb;
|
||||
|
||||
cb_data->_user_data = user_data;
|
||||
cb_data->_store = xapian;
|
||||
cb_data->_store = xapian;
|
||||
|
||||
cb_data->_reindex = reindex;
|
||||
cb_data->_dirstamp = 0;
|
||||
|
||||
cb_data->_max_filesize = max_filesize;
|
||||
|
||||
cb_data->_stats = stats;
|
||||
if (cb_data->_stats)
|
||||
memset (cb_data->_stats, 0, sizeof(MuIndexStats));
|
||||
|
@ -298,6 +299,26 @@ mu_index_last_used_maildir (MuIndex *index)
|
|||
MU_LAST_USED_MAILDIR_KEY);
|
||||
}
|
||||
|
||||
/*
 * Set the maximum message file size stored in the index object.
 *
 * index:    the index object; the function returns without doing
 *           anything else when it is NULL (g_return_if_fail)
 * max_size: the new maximum size, or 0 to go back to the default
 *           (MU_INDEX_MAX_FILE_SIZE)
 */
void
|
||||
mu_index_set_max_msg_size (MuIndex *index, guint max_size)
|
||||
{
|
||||
g_return_if_fail (index);
|
||||
|
||||
/* 0 is not a real limit; it means "use the default" */
if (max_size == 0)
|
||||
index->_max_filesize = MU_INDEX_MAX_FILE_SIZE;
|
||||
else
|
||||
index->_max_filesize = max_size;
|
||||
}
|
||||
|
||||
/*
 * Set the Xapian batch size for the store that belongs to this index.
 * The value is passed unchanged to mu_store_set_batch_size.
 *
 * index:      the index object; the function returns without doing
 *             anything else when it is NULL (g_return_if_fail)
 * xbatchsize: the batch size handed to mu_store_set_batch_size
 */
void
|
||||
mu_index_set_xbatch_size (MuIndex *index, guint xbatchsize)
|
||||
{
|
||||
g_return_if_fail (index);
|
||||
/* the index keeps no batch size of its own; the store owns it */
mu_store_set_batch_size (index->_store, xbatchsize);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
MuResult
|
||||
mu_index_run (MuIndex *index, const char* path,
|
||||
|
@ -319,7 +340,8 @@ mu_index_run (MuIndex *index, const char* path,
|
|||
return MU_ERROR;
|
||||
}
|
||||
|
||||
init_cb_data (&cb_data, index->_store, reindex, stats,
|
||||
init_cb_data (&cb_data, index->_store, reindex,
|
||||
index->_max_filesize, stats,
|
||||
msg_cb, dir_cb, user_data);
|
||||
|
||||
update_last_used_maildir (index, path);
|
||||
|
@ -400,15 +422,11 @@ typedef struct _CleanupData CleanupData;
|
|||
static MuResult
|
||||
foreach_doc_cb (const char* path, CleanupData *cudata)
|
||||
{
|
||||
MuResult rv;
|
||||
|
||||
if (access (path, R_OK) != 0) {
|
||||
|
||||
g_debug ("not readable: %s; removing", path);
|
||||
rv = mu_store_remove (cudata->_store, path);
|
||||
if (rv != MU_OK)
|
||||
return rv; /* something went wrong... bail out */
|
||||
|
||||
if (errno != EACCES)
|
||||
g_warning ("cannot access %s: %s", path, strerror(errno));
|
||||
if (!mu_store_remove (cudata->_store, path))
|
||||
return MU_ERROR; /* something went wrong... bail out */
|
||||
if (cudata->_stats)
|
||||
++cudata->_stats->_cleaned_up;
|
||||
}
|
||||
|
|
|
@ -46,13 +46,12 @@ typedef struct _MuIndexStats MuIndexStats;
|
|||
*
|
||||
* @param xpath path to the 'homedir'; the xapian directory will be
|
||||
* this homedir/xapian
|
||||
* @param batchsize for Xapian queries, or 0 for the default
|
||||
* @param err to receive error or NULL; there are only errors when this
|
||||
* function returns NULL. Possible errors: see mu-error.h
|
||||
*
|
||||
* @return a new MuIndex instance, or NULL in case of error
|
||||
*/
|
||||
MuIndex* mu_index_new (const char* muhome, guint batchsize, GError **err)
|
||||
MuIndex* mu_index_new (const char* muhome, GError **err)
|
||||
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
|
||||
|
||||
|
||||
|
@ -64,6 +63,28 @@ MuIndex* mu_index_new (const char* muhome, guint batchsize, GError **err)
|
|||
void mu_index_destroy (MuIndex *index);
|
||||
|
||||
|
||||
/**
|
||||
* change the maximum file size that mu-index considers from its
|
||||
* default (MU_INDEX_MAX_FILE_SIZE). Note that the maximum size is a
|
||||
* protection against mu (or the libraries it uses) allocating too
|
||||
* much memory, which can lead to problems
|
||||
*
|
||||
* @param index a mu index object
|
||||
* @param max_size the maximum msg size, or 0 to reset to the default
|
||||
*/
|
||||
void mu_index_set_max_msg_size (MuIndex *index, guint max_size);
|
||||
|
||||
|
||||
/**
|
||||
* change batch size for Xapian store transaction (see
|
||||
* 'mu_store_set_batch_size')
|
||||
*
|
||||
* @param index a mu index object
|
||||
* @param xbatchsize the batch size, or 0 to reset to the default
|
||||
*/
|
||||
void mu_index_set_xbatch_size (MuIndex *index, guint xbatchsize);
|
||||
|
||||
|
||||
/**
|
||||
* get the maildir for the last run of indexing for the
|
||||
* current database
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
** Copyright (C) 2008-2010 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
** Copyright (C) 2008-2011 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify it
|
||||
** under the terms of the GNU General Public License as published by the
|
||||
|
@ -119,7 +119,7 @@ check_version (MuStore *store)
|
|||
}
|
||||
|
||||
MuStore*
|
||||
mu_store_new (const char* xpath, guint batchsize, GError **err)
|
||||
mu_store_new (const char* xpath, GError **err)
|
||||
{
|
||||
MuStore *store (0);
|
||||
|
||||
|
@ -139,7 +139,7 @@ mu_store_new (const char* xpath, guint batchsize, GError **err)
|
|||
/* keep count of processed docs */
|
||||
store->_in_transaction = false;
|
||||
store->_processed = 0;
|
||||
store->_trx_size = batchsize ? batchsize : MU_STORE_DEFAULT_TRX_SIZE;
|
||||
store->_trx_size = MU_STORE_DEFAULT_TRX_SIZE;
|
||||
|
||||
add_synonyms (store);
|
||||
|
||||
|
@ -177,6 +177,18 @@ mu_store_destroy (MuStore *store)
|
|||
}
|
||||
|
||||
|
||||
/*
 * Set the transaction batch size (_trx_size) of the store.
 *
 * store:     the store object; the function returns without doing
 *            anything else when it is NULL (g_return_if_fail)
 * batchsize: the new batch size, or 0 to go back to the default
 *            (MU_STORE_DEFAULT_TRX_SIZE)
 */
void
|
||||
mu_store_set_batch_size (MuStore *store, guint batchsize)
|
||||
{
|
||||
g_return_if_fail (store);
|
||||
|
||||
/* 0 is not a real batch size; it means "use the default" */
if (batchsize == 0)
|
||||
store->_trx_size = MU_STORE_DEFAULT_TRX_SIZE;
|
||||
else
|
||||
store->_trx_size = batchsize;
|
||||
}
|
||||
|
||||
|
||||
|
||||
unsigned
|
||||
mu_store_count (MuStore *store)
|
||||
|
@ -521,11 +533,11 @@ mu_store_store (MuStore *store, MuMsg *msg)
|
|||
}
|
||||
|
||||
|
||||
MuResult
|
||||
mu_store_remove (MuStore *store, const char* msgpath)
|
||||
gboolean
|
||||
mu_store_remove (MuStore *store, const char *msgpath)
|
||||
{
|
||||
g_return_val_if_fail (store, MU_ERROR);
|
||||
g_return_val_if_fail (msgpath, MU_ERROR);
|
||||
g_return_val_if_fail (store, FALSE);
|
||||
g_return_val_if_fail (msgpath, FALSE);
|
||||
|
||||
try {
|
||||
const std::string uid (get_message_uid (msgpath));
|
||||
|
@ -533,18 +545,15 @@ mu_store_remove (MuStore *store, const char* msgpath)
|
|||
begin_trx_if (store, !store->_in_transaction);
|
||||
|
||||
store->_db->delete_document (uid);
|
||||
g_debug ("deleting %s", msgpath);
|
||||
|
||||
++store->_processed;
|
||||
|
||||
/* do we need to commit now? */
|
||||
bool commit_now = store->_processed % store->_trx_size == 0;
|
||||
commit_trx_if (store, commit_now);
|
||||
|
||||
return MU_OK;
|
||||
return TRUE;
|
||||
|
||||
} MU_XAPIAN_CATCH_BLOCK_RETURN (MU_ERROR);
|
||||
|
||||
} MU_XAPIAN_CATCH_BLOCK_RETURN (FALSE);
|
||||
}
|
||||
|
||||
gboolean
|
||||
|
|
|
@ -41,7 +41,7 @@ typedef struct _MuStore MuStore;
|
|||
*
|
||||
* @return a new MuStore object, or NULL in case of error
|
||||
*/
|
||||
MuStore* mu_store_new (const char *path, guint batchsize, GError **err)
|
||||
MuStore* mu_store_new (const char *path, GError **err)
|
||||
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
|
||||
|
||||
|
||||
|
@ -53,6 +53,20 @@ MuStore* mu_store_new (const char *path, guint batchsize, GError **err)
|
|||
void mu_store_destroy (MuStore *store);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* set the Xapian batch size for this store. Normally, there's no need
|
||||
* to use this function as the default is good enough; however, if you
|
||||
* use mu in a very memory-constrained environment, you can set the
|
||||
* batchsize to e.g. 1000 at the cost of significant slow-down.
|
||||
*
|
||||
* @param store a valid store object
|
||||
* @param batchsize the new batch size; or 0 to reset to
|
||||
* the default batch size
|
||||
*/
|
||||
void mu_store_set_batch_size (MuStore *store, guint batchsize);
|
||||
|
||||
|
||||
/**
|
||||
* get the number of documents in the database
|
||||
*
|
||||
|
@ -104,8 +118,7 @@ MuResult mu_store_store (MuStore *store, MuMsg *msg);
|
|||
*
|
||||
* @return TRUE if it succeeded, FALSE otherwise
|
||||
*/
|
||||
MuResult mu_store_remove (MuStore *store,
|
||||
const char* msgpath);
|
||||
gboolean mu_store_remove (MuStore *store, const char* msgpath);
|
||||
|
||||
|
||||
/**
|
||||
|
@ -114,10 +127,9 @@ MuResult mu_store_remove (MuStore *store,
|
|||
* @param store a store
|
||||
* @param path the message path
|
||||
*
|
||||
* @return
|
||||
* @return TRUE if the message exists, FALSE otherwise
|
||||
*/
|
||||
gboolean mu_store_contains_message (MuStore *store,
|
||||
const char* path);
|
||||
gboolean mu_store_contains_message (MuStore *store, const char* path);
|
||||
|
||||
/**
|
||||
* store a timestamp for a directory
|
||||
|
|
|
@ -110,7 +110,7 @@ test_mu_index (void)
|
|||
|
||||
xpath = g_strdup_printf ("%s%c%s", muhome, G_DIR_SEPARATOR, "xapian");
|
||||
|
||||
store = mu_store_new (xpath, 0, NULL);
|
||||
store = mu_store_new (xpath, NULL);
|
||||
g_assert (store);
|
||||
|
||||
g_assert_cmpuint (mu_store_count (store), ==, 5);
|
||||
|
|
|
@ -42,7 +42,7 @@ test_mu_store_new_destroy (void)
|
|||
g_assert (tmpdir);
|
||||
|
||||
err = NULL;
|
||||
store = mu_store_new (tmpdir, 12345, &err);
|
||||
store = mu_store_new (tmpdir, &err);
|
||||
g_assert (store);
|
||||
g_assert (err == NULL);
|
||||
|
||||
|
@ -67,7 +67,7 @@ test_mu_store_version (void)
|
|||
g_assert (tmpdir);
|
||||
|
||||
err = NULL;
|
||||
store = mu_store_new (tmpdir, 789, &err);
|
||||
store = mu_store_new (tmpdir, &err);
|
||||
g_assert (store);
|
||||
g_assert (err == NULL);
|
||||
|
||||
|
@ -90,7 +90,7 @@ test_mu_store_store_and_count (void)
|
|||
tmpdir = test_mu_common_get_random_tmpdir();
|
||||
g_assert (tmpdir);
|
||||
|
||||
store = mu_store_new (tmpdir, 1, NULL);
|
||||
store = mu_store_new (tmpdir, NULL);
|
||||
g_assert (store);
|
||||
|
||||
g_assert_cmpuint (0,==,mu_store_count (store));
|
||||
|
@ -138,7 +138,7 @@ test_mu_store_store_remove_and_count (void)
|
|||
tmpdir = test_mu_common_get_random_tmpdir();
|
||||
g_assert (tmpdir);
|
||||
|
||||
store = mu_store_new (tmpdir, 0, NULL);
|
||||
store = mu_store_new (tmpdir, NULL);
|
||||
g_assert (store);
|
||||
|
||||
g_assert_cmpuint (0,==,mu_store_count (store));
|
||||
|
|
|
@ -66,7 +66,7 @@ reindex (MugData *mugdata)
|
|||
return;
|
||||
|
||||
err = NULL;
|
||||
midx = mu_index_new (mu_runtime_xapian_dir(), 0, &err);
|
||||
midx = mu_index_new (mu_runtime_xapian_dir(), &err);
|
||||
if (!midx) {
|
||||
if (err && err->code == MU_ERROR_XAPIAN_CANNOT_GET_WRITELOCK) {
|
||||
g_warning ("database busy...");
|
||||
|
|
Loading…
Reference in New Issue