* <many>: add option to change the batch size for xapian transactions

This commit is contained in:
Dirk-Jan C. Binnema 2011-01-02 18:05:43 +02:00
parent 0b88f86e65
commit 169196498e
8 changed files with 65 additions and 46 deletions

View File

@ -1,4 +1,4 @@
.TH MU-INDEX 1 "November 2010" "User Manuals"
.TH MU-INDEX 1 "January 2011" "User Manuals"
.SH NAME
@ -10,11 +10,11 @@ mu index \- index e-mail messages stored in Maildirs
.SH DESCRIPTION
\fBmu index\fR is the \fBmu\fR sub-command for scanning the contents of
Maildir directories and storing the results in a Xapian database which can
then be searched using
\fBmu index\fR is the \fBmu\fR command for scanning the contents of Maildir
directories and storing the results in a Xapian database. The data can then be
queried using
.BR mu-find(1)
\.
\.
.B index
understands Maildirs as defined by Daniel Bernstein for qmail(7). In addition,
@ -34,17 +34,14 @@ with spam-messages.
The first run of \fBmu index\fR may take a few minutes if you have a lot of
mail (tens of thousands of messages). Fortunately, such a full scan needs to be
done only once, after that it suffices to index the changes, which goes much
faster. Also note that a substantial amount of the time goes to printing the
progress information; if you turn that off (with \fB\-q\fR or
\fB\-\-quiet\fR), it goes a lot faster. See the 'Note on performance' below
for more information.
done only once; after that it suffices to index the changes, which goes much
faster. See the 'Note on performance' below for more information.
The optional phase two of the indexing-process is the removal of messages from
the database for which there is no longer a corresponding file in the
The optional 'phase two' of the indexing-process is the removal of messages
from the database for which there is no longer a corresponding file in the
Maildir. If you do not want this, you can use \fB\-n\fR, \fB\-\-nocleanup\fR.
When \fBmu index\fR catches on of the signals \fBSIGINT\fR, \fBSIGHUP\fR or
When \fBmu index\fR catches one of the signals \fBSIGINT\fR, \fBSIGHUP\fR or
\fBSIGTERM\fR (e.g., when you press Ctrl-C during the indexing process), it
tries to shut down gracefully; it tries to save and commit data, and close the
database etc. If it receives another signal (e.g., when pressing Ctrl-C once
@ -52,16 +49,14 @@ more), \fBmu index\fR will terminate immediately.
.SH OPTIONS
Note, some of the important options are described in the \fBmu(1)\fR man-page
and not here, as they apply to multiple mu-commands.
Note, some of the general options are described in the \fBmu(1)\fR man-page
and not here, as they apply to multiple mu commands.
.TP
\fB\-m\fR, \fB\-\-maildir\fR=\fI<maildir>\fR
starts searching at \fI<maildir>\fR. By default, \fBmu\fR uses whatever the
\fBMAILDIR\fR environment variable is set to; if that is not set, it tries
\fI~/Maildir\fR \. In either case, the path must be \fBabsolute\fR.
Also please see the note on mixing sub-maildirs below.
\fBMAILDIR\fR environment variable is set to; if it is not set, it tries
\fI~/Maildir\fR. See the note on mixing sub-maildirs below.
.TP
\fB\-\-reindex\fR
@ -83,23 +78,25 @@ messages (using \fB\-\-maildir\fR). For this reason, it is necessary to run
format. \fBmu index\fR will issue a warning about this.
.TP
\fB\-\-autoupgrade\fR automatically use \fB\-y\fR, \fB\-\-empty\fR
\fB\-\-autoupgrade\fR
automatically use \fB\-y\fR, \fB\-\-empty\fR
when \fBmu\fR notices that the database version is not up-to-date. This option
is for use in cron scripts and the like, so they won't require any user
interaction, even when mu introduces a new database version.
.TP
\fB\-\-xbatchsize\fR=\fI<batch size>\fR
set the maximum number of messages to process in a single Xapian
transaction. In practice, this option is only useful if you find that \fBmu\fR
is running out of memory while indexing; in that case, you can set the batch
size to (for example) 1000, which will reduce memory consumption, but also
reduce performance.
.B NOTE:
It is not a good idea to run multiple instances of
.B mu index
It is generally not a good idea to run multiple instances of \fBmu index\fR
concurrently. No data loss should occur, but one or more of the instances may
experience errors due to database locks.
Also note that, before indexing is completed, searches for messages may fail,
even if they have already been indexed, as some of the essential database
information will only be written in batches during the indexing process.
Furthermore, it is not recommended to mix maildirs and sub-maildirs within
the hierarchy in the same database; for example, it's better not to index both
with \fB\-\-maildir\fR=~/MyMaildir and \fB\-\-maildir\fR=~/MyMaildir/foo, as
@ -179,6 +176,4 @@ Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
.SH "SEE ALSO"
.BR maildir(5)
.BR mu(1)
.BR mu-find(1)
.BR maildir(5) mu(1) mu-find(1)

View File

@ -27,10 +27,28 @@
G_BEGIN_DECLS
/* Identifies the mu sub-command that was requested.
 *
 * The first six enumerators each name one sub-command.
 * MU_CONFIG_CMD_NONE is the value used when there is no command;
 * MU_CONFIG_CMD_UNKNOWN presumably marks a command string that was
 * not recognised (TODO: confirm against the command-line parser).
 *
 * The numeric values are implicit, starting at 0 and following the
 * declaration order, so reordering the enumerators changes them. */
typedef enum _MuConfigCmd {
	MU_CONFIG_CMD_INDEX,
	MU_CONFIG_CMD_FIND,
	MU_CONFIG_CMD_CLEANUP,
	MU_CONFIG_CMD_MKDIR,
	MU_CONFIG_CMD_VIEW,
	MU_CONFIG_CMD_EXTRACT,

	MU_CONFIG_CMD_NONE,
	MU_CONFIG_CMD_UNKNOWN
} MuConfigCmd;
/* struct with all configuration options for mu; it will be filled
* from the config file, and/or command line arguments */
struct _MuConfigOptions {
MuConfigCmd cmd; /* the command, or MU_CONFIG_CMD_NONE */
const char *cmdstr; /* cmd string, for user info */
/* general options */
gboolean quiet; /* don't give any output */
@ -47,6 +65,9 @@ struct _MuConfigOptions {
gboolean rebuild; /* empty the database before indexing */
gboolean autoupgrade; /* automatically upgrade db
* when needed */
int xbatchsize; /* batchsize for xapian commits, or 0 for default
* */
/* options for querying */
gboolean xquery; /* give the Xapian query instead of
search results */

View File

@ -41,14 +41,14 @@ struct _MuIndex {
};
MuIndex*
mu_index_new (const char *xpath, GError **err)
mu_index_new (const char *xpath, guint xbatchsize, GError **err)
{
MuIndex *index;
g_return_val_if_fail (xpath, NULL);
index = g_new0 (MuIndex, 1);
index->_xapian = mu_store_new (xpath, err);
index->_xapian = mu_store_new (xpath, xbatchsize, err);
if (!index->_xapian) {
g_warning ("%s: failed to open xapian store (%s)",

View File

@ -51,7 +51,7 @@ typedef struct _MuIndexStats MuIndexStats;
*
* @return a new MuIndex instance, or NULL in case of error
*/
MuIndex* mu_index_new (const char* muhome, GError **err)
MuIndex* mu_index_new (const char* muhome, guint batchsize, GError **err)
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;

View File

@ -32,8 +32,8 @@
#include "mu-str.h"
#include "mu-msg-flags.h"
/* number of new messages after which we commit to the database */
#define MU_STORE_TRX_SIZE 6666
/* by default, use transactions of 30000 messages */
#define MU_STORE_DEFAULT_TRX_SIZE 30000
/* http://article.gmane.org/gmane.comp.search.xapian.general/3656 */
#define MU_STORE_MAX_TERM_LENGTH 240
@ -47,6 +47,7 @@ struct _MuStore {
bool _in_transaction;
int _processed;
size_t _trx_size;
guint _batchsize; /* batch size of a xapian transaction */
};
@ -115,7 +116,7 @@ check_version (MuStore *store)
}
MuStore*
mu_store_new (const char* xpath, GError **err)
mu_store_new (const char* xpath, guint batchsize, GError **err)
{
MuStore *store (0);
@ -131,14 +132,15 @@ mu_store_new (const char* xpath, GError **err)
}
/* keep count of processed docs */
store->_trx_size = MU_STORE_TRX_SIZE;
store->_in_transaction = false;
store->_processed = 0;
store->_processed = 0;
store->_trx_size = batchsize ? batchsize : MU_STORE_DEFAULT_TRX_SIZE;
add_synonyms (store);
MU_WRITE_LOG ("%s: opened %s", __FUNCTION__, xpath);
MU_WRITE_LOG ("%s: opened %s (batch size: %u)",
__FUNCTION__, xpath, store->_trx_size);
return store;
} MU_XAPIAN_CATCH_BLOCK_G_ERROR(err,MU_ERROR_XAPIAN);

View File

@ -36,12 +36,13 @@ typedef struct _MuStore MuStore;
* create a new Xapian store, a place to store documents
*
* @param path the path to the database
* @param err to receive error info or NULL. err->code can be found in
* @param batchsize size of batch before committing
* @param err to receive error info or NULL. err->code can be found in
* mu-error.h
*
* @return a new MuStore object, or NULL in case of error
*/
MuStore* mu_store_new (const char *path, GError **err)
MuStore* mu_store_new (const char *path, guint batchsize, GError **err)
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;

View File

@ -113,7 +113,7 @@ test_mu_index (void)
xpath = g_strdup_printf ("%s%c%s", muhome, G_DIR_SEPARATOR,
"xapian");
store = mu_store_new (xpath, NULL);
store = mu_store_new (xpath, 0, NULL);
g_assert (store);
g_assert_cmpuint (mu_store_count (store), ==, 4);

View File

@ -42,7 +42,7 @@ test_mu_store_new_destroy (void)
g_assert (tmpdir);
err = NULL;
store = mu_store_new (tmpdir, &err);
store = mu_store_new (tmpdir, 12345, &err);
g_assert (store);
g_assert (err == NULL);
@ -68,7 +68,7 @@ test_mu_store_version (void)
g_assert (tmpdir);
err = NULL;
store = mu_store_new (tmpdir, &err);
store = mu_store_new (tmpdir, 789, &err);
g_assert (store);
g_assert (err == NULL);
@ -94,7 +94,7 @@ test_mu_store_store_and_count (void)
tmpdir = test_mu_common_get_random_tmpdir();
g_assert (tmpdir);
store = mu_store_new (tmpdir, NULL);
store = mu_store_new (tmpdir, 1, NULL);
g_assert (store);
g_assert_cmpuint (0,==,mu_store_count (store));
@ -142,7 +142,7 @@ test_mu_store_store_remove_and_count (void)
tmpdir = test_mu_common_get_random_tmpdir();
g_assert (tmpdir);
store = mu_store_new (tmpdir, NULL);
store = mu_store_new (tmpdir, 0, NULL);
g_assert (store);
g_assert_cmpuint (0,==,mu_store_count (store));