mirror of https://github.com/djcb/mu.git
* <many>: add option to change the batch size for xapian transactions
This commit is contained in:
parent
0b88f86e65
commit
169196498e
|
@ -1,4 +1,4 @@
|
||||||
.TH MU-INDEX 1 "November 2010" "User Manuals"
|
.TH MU-INDEX 1 "January 2011" "User Manuals"
|
||||||
|
|
||||||
.SH NAME
|
.SH NAME
|
||||||
|
|
||||||
|
@ -10,11 +10,11 @@ mu index \- index e-mail messages stored in Maildirs
|
||||||
|
|
||||||
.SH DESCRIPTION
|
.SH DESCRIPTION
|
||||||
|
|
||||||
\fBmu index\fR is the \fBmu\fR sub-command for scanning the contents of
|
\fBmu index\fR is the \fBmu\fR command for scanning the contents of Maildir
|
||||||
Maildir directories and storing the results in a Xapian database which can
|
directories and storing the results in a Xapian database. The data can then be
|
||||||
then be searched using
|
queried using
|
||||||
.BR mu-find(1)
|
.BR mu-find(1)
|
||||||
\.
|
\.
|
||||||
|
|
||||||
.B index
|
.B index
|
||||||
understands Maildirs as defined by Daniel Bernstein for qmail(7). In addition,
|
understands Maildirs as defined by Daniel Bernstein for qmail(7). In addition,
|
||||||
|
@ -34,17 +34,14 @@ with spam-messages.
|
||||||
|
|
||||||
The first run of \fBmu index\fR may take a few minutes if you have a lot of
|
The first run of \fBmu index\fR may take a few minutes if you have a lot of
|
||||||
mail (tens of thousands of messages). Fortunately, such a full scan needs to be
|
mail (tens of thousands of messages). Fortunately, such a full scan needs to be
|
||||||
done only once, after that it suffices to index the changes, which goes much
|
done only once; after that it suffices to index the changes, which goes much
|
||||||
faster. Also note that a substantial amount of the time goes to printing the
|
faster. See the 'Note on performance' below for more information.
|
||||||
progress information; if you turn that off (with \fB\-q\fR or
|
|
||||||
\fB\-\-quiet\fR), it goes a lot faster. See the 'Note on performance' below
|
|
||||||
for more information.
|
|
||||||
|
|
||||||
The optional phase two of the indexing-process is the removal of messages from
|
The optional 'phase two' of the indexing-process is the removal of messages
|
||||||
the database for which there is no longer a corresponding file in the
|
from the database for which there is no longer a corresponding file in the
|
||||||
Maildir. If you do not want this, you can use \fB\-n\fR, \fB\-\-nocleanup\fR.
|
Maildir. If you do not want this, you can use \fB\-n\fR, \fB\-\-nocleanup\fR.
|
||||||
|
|
||||||
When \fBmu index\fR catches on of the signals \fBSIGINT\fR, \fBSIGHUP\fR or
|
When \fBmu index\fR catches one of the signals \fBSIGINT\fR, \fBSIGHUP\fR or
|
||||||
\fBSIGTERM\fR (e.g., when you press Ctrl-C during the indexing process), it
|
\fBSIGTERM\fR (e.g., when you press Ctrl-C during the indexing process), it
|
||||||
tries to shut down gracefully; it tries to save and commit data, and close the
|
tries to shut down gracefully; it tries to save and commit data, and close the
|
||||||
database etc. If it receives another signal (e.g., when pressing Ctrl-C once
|
database etc. If it receives another signal (e.g., when pressing Ctrl-C once
|
||||||
|
@ -52,16 +49,14 @@ more), \fBmu index\fR will terminate immediately.
|
||||||
|
|
||||||
.SH OPTIONS
|
.SH OPTIONS
|
||||||
|
|
||||||
Note, some of the important options are described in the \fBmu(1)\fR man-page
|
Note, some of the general options are described in the \fBmu(1)\fR man-page
|
||||||
and not here, as they apply to multiple mu-commands.
|
and not here, as they apply to multiple mu commands.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
\fB\-m\fR, \fB\-\-maildir\fR=\fI<maildir>\fR
|
\fB\-m\fR, \fB\-\-maildir\fR=\fI<maildir>\fR
|
||||||
starts searching at \fI<maildir>\fR. By default, \fBmu\fR uses whatever the
|
starts searching at \fI<maildir>\fR. By default, \fBmu\fR uses whatever the
|
||||||
\fBMAILDIR\fR environment variable is set to; if that is not set, it tries
|
\fBMAILDIR\fR environment variable is set to; if it is not set, it tries
|
||||||
\fI~/Maildir\fR \. In either case, the path must be \fBabsolute\fR.
|
\fI~/Maildir\fR. See the note on mixing sub-maildirs below.
|
||||||
|
|
||||||
Also please see the note on mixing sub-maildirs below.
|
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
\fB\-\-reindex\fR
|
\fB\-\-reindex\fR
|
||||||
|
@ -83,23 +78,25 @@ messages (using \fB\-\-maildir\fR). For this reason, it is necessary to run
|
||||||
format. \fBmu index\fR will issue a warning about this.
|
format. \fBmu index\fR will issue a warning about this.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
\fB\-\-autoupgrade\fR automatically use \fB\-y\fR, \fB\-\-empty\fR
|
\fB\-\-autoupgrade\fR
|
||||||
|
automatically use \fB\-y\fR, \fB\-\-empty\fR
|
||||||
when \fBmu\fR notices that the database version is not up-to-date. This option
|
when \fBmu\fR notices that the database version is not up-to-date. This option
|
||||||
is for use in cron scripts and the like, so they won't require any user
|
is for use in cron scripts and the like, so they won't require any user
|
||||||
interaction, even when mu introduces a new database version.
|
interaction, even when mu introduces a new database version.
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
|
\fB\-\-xbatchsize\fR=\fI<batch size>\fR
|
||||||
|
set the maximum number of messages to process in a single Xapian
|
||||||
|
transaction. In practice, this option is only useful if you find that \fBmu\fR
|
||||||
|
is running out of memory while indexing; in that case, you can set the batch
|
||||||
|
size to (for example) 1000, which will reduce memory consumption, but also
|
||||||
|
reduce performance.
|
||||||
|
|
||||||
.B NOTE:
|
.B NOTE:
|
||||||
It is not a good idea to run multiple instances of
|
It is generally not a good idea to run multiple instances of \fBmu index\fR
|
||||||
.B mu index
|
|
||||||
concurrently. No data loss should occur, but one or more of the instances may
|
concurrently. No data loss should occur, but one or more of the instances may
|
||||||
experience errors due to database locks.
|
experience errors due to database locks.
|
||||||
|
|
||||||
Also note that, before indexing is completed, searches for messages may fail,
|
|
||||||
even if they have already been indexed, as some of the esssential database
|
|
||||||
information will only be written in batches during the indexing process.
|
|
||||||
|
|
||||||
Furthermore, it is not recommended to mix maildirs and sub-maildirs within
|
Furthermore, it is not recommended to mix maildirs and sub-maildirs within
|
||||||
the hierarchy in the same database; for example, it's better not to index both
|
the hierarchy in the same database; for example, it's better not to index both
|
||||||
with \fB\-\-maildir\fR=~/MyMaildir and \fB\-\-maildir\fR=~/MyMaildir/foo, as
|
with \fB\-\-maildir\fR=~/MyMaildir and \fB\-\-maildir\fR=~/MyMaildir/foo, as
|
||||||
|
@ -179,6 +176,4 @@ Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
|
|
||||||
.SH "SEE ALSO"
|
.SH "SEE ALSO"
|
||||||
|
|
||||||
.BR maildir(5)
|
.BR maildir(5) ", " mu(1) ", " mu-find(1)
|
||||||
.BR mu(1)
|
|
||||||
.BR mu-find(1)
|
|
||||||
|
|
|
@ -27,10 +27,28 @@
|
||||||
|
|
||||||
G_BEGIN_DECLS
|
G_BEGIN_DECLS
|
||||||
|
|
||||||
|
enum _MuConfigCmd {
|
||||||
|
MU_CONFIG_CMD_INDEX,
|
||||||
|
MU_CONFIG_CMD_FIND,
|
||||||
|
MU_CONFIG_CMD_CLEANUP,
|
||||||
|
MU_CONFIG_CMD_MKDIR,
|
||||||
|
MU_CONFIG_CMD_VIEW,
|
||||||
|
MU_CONFIG_CMD_EXTRACT,
|
||||||
|
MU_CONFIG_CMD_NONE,
|
||||||
|
|
||||||
|
MU_CONFIG_CMD_UNKNOWN
|
||||||
|
};
|
||||||
|
typedef enum _MuConfigCmd MuConfigCmd;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/* struct with all configuration options for mu; it will be filled
|
/* struct with all configuration options for mu; it will be filled
|
||||||
* from the config file, and/or command line arguments */
|
* from the config file, and/or command line arguments */
|
||||||
|
|
||||||
struct _MuConfigOptions {
|
struct _MuConfigOptions {
|
||||||
|
|
||||||
|
MuConfigCmd cmd; /* the command, or MU_CONFIG_CMD_NONE */
|
||||||
|
const char *cmdstr; /* cmd string, for user info */
|
||||||
|
|
||||||
/* general options */
|
/* general options */
|
||||||
gboolean quiet; /* don't give any output */
|
gboolean quiet; /* don't give any output */
|
||||||
|
@ -47,6 +65,9 @@ struct _MuConfigOptions {
|
||||||
gboolean rebuild; /* empty the database before indexing */
|
gboolean rebuild; /* empty the database before indexing */
|
||||||
gboolean autoupgrade; /* automatically upgrade db
|
gboolean autoupgrade; /* automatically upgrade db
|
||||||
* when needed */
|
* when needed */
|
||||||
|
int xbatchsize; /* batchsize for xapian commits, or 0 for default
|
||||||
|
* */
|
||||||
|
|
||||||
/* options for querying */
|
/* options for querying */
|
||||||
gboolean xquery; /* give the Xapian query instead of
|
gboolean xquery; /* give the Xapian query instead of
|
||||||
search results */
|
search results */
|
||||||
|
|
|
@ -41,14 +41,14 @@ struct _MuIndex {
|
||||||
};
|
};
|
||||||
|
|
||||||
MuIndex*
|
MuIndex*
|
||||||
mu_index_new (const char *xpath, GError **err)
|
mu_index_new (const char *xpath, guint xbatchsize, GError **err)
|
||||||
{
|
{
|
||||||
MuIndex *index;
|
MuIndex *index;
|
||||||
|
|
||||||
g_return_val_if_fail (xpath, NULL);
|
g_return_val_if_fail (xpath, NULL);
|
||||||
|
|
||||||
index = g_new0 (MuIndex, 1);
|
index = g_new0 (MuIndex, 1);
|
||||||
index->_xapian = mu_store_new (xpath, err);
|
index->_xapian = mu_store_new (xpath, xbatchsize, err);
|
||||||
|
|
||||||
if (!index->_xapian) {
|
if (!index->_xapian) {
|
||||||
g_warning ("%s: failed to open xapian store (%s)",
|
g_warning ("%s: failed to open xapian store (%s)",
|
||||||
|
|
|
@ -51,7 +51,7 @@ typedef struct _MuIndexStats MuIndexStats;
|
||||||
*
|
*
|
||||||
* @return a new MuIndex instance, or NULL in case of error
|
* @return a new MuIndex instance, or NULL in case of error
|
||||||
*/
|
*/
|
||||||
MuIndex* mu_index_new (const char* muhome, GError **err)
|
MuIndex* mu_index_new (const char* muhome, guint batchsize, GError **err)
|
||||||
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
|
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -32,8 +32,8 @@
|
||||||
#include "mu-str.h"
|
#include "mu-str.h"
|
||||||
#include "mu-msg-flags.h"
|
#include "mu-msg-flags.h"
|
||||||
|
|
||||||
/* number of new messages after which we commit to the database */
|
/* by default, use transactions of 30000 messages */
|
||||||
#define MU_STORE_TRX_SIZE 6666
|
#define MU_STORE_DEFAULT_TRX_SIZE 30000
|
||||||
|
|
||||||
/* http://article.gmane.org/gmane.comp.search.xapian.general/3656 */
|
/* http://article.gmane.org/gmane.comp.search.xapian.general/3656 */
|
||||||
#define MU_STORE_MAX_TERM_LENGTH 240
|
#define MU_STORE_MAX_TERM_LENGTH 240
|
||||||
|
@ -47,6 +47,7 @@ struct _MuStore {
|
||||||
bool _in_transaction;
|
bool _in_transaction;
|
||||||
int _processed;
|
int _processed;
|
||||||
size_t _trx_size;
|
size_t _trx_size;
|
||||||
|
guint _batchsize; /* batch size of a xapian transaction */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -115,7 +116,7 @@ check_version (MuStore *store)
|
||||||
}
|
}
|
||||||
|
|
||||||
MuStore*
|
MuStore*
|
||||||
mu_store_new (const char* xpath, GError **err)
|
mu_store_new (const char* xpath, guint batchsize, GError **err)
|
||||||
{
|
{
|
||||||
MuStore *store (0);
|
MuStore *store (0);
|
||||||
|
|
||||||
|
@ -131,14 +132,15 @@ mu_store_new (const char* xpath, GError **err)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* keep count of processed docs */
|
/* keep count of processed docs */
|
||||||
store->_trx_size = MU_STORE_TRX_SIZE;
|
|
||||||
store->_in_transaction = false;
|
store->_in_transaction = false;
|
||||||
store->_processed = 0;
|
store->_processed = 0;
|
||||||
|
store->_trx_size = batchsize ? batchsize : MU_STORE_DEFAULT_TRX_SIZE;
|
||||||
|
|
||||||
add_synonyms (store);
|
add_synonyms (store);
|
||||||
|
|
||||||
MU_WRITE_LOG ("%s: opened %s", __FUNCTION__, xpath);
|
MU_WRITE_LOG ("%s: opened %s (batch size: %u)",
|
||||||
|
__FUNCTION__, xpath, store->_trx_size);
|
||||||
|
|
||||||
return store;
|
return store;
|
||||||
|
|
||||||
} MU_XAPIAN_CATCH_BLOCK_G_ERROR(err,MU_ERROR_XAPIAN);
|
} MU_XAPIAN_CATCH_BLOCK_G_ERROR(err,MU_ERROR_XAPIAN);
|
||||||
|
|
|
@ -36,12 +36,13 @@ typedef struct _MuStore MuStore;
|
||||||
* create a new Xapian store, a place to store documents
|
* create a new Xapian store, a place to store documents
|
||||||
*
|
*
|
||||||
* @param path the path to the database
|
* @param path the path to the database
|
||||||
* @param err to receive error info or NULL. err->code can be found in
|
* @param batchsize maximum number of messages per Xapian transaction before committing, or 0 to use the default
|
||||||
|
* @param err to receive error info or NULL. err->code can be found in
|
||||||
* mu-error.h
|
* mu-error.h
|
||||||
*
|
*
|
||||||
* @return a new MuStore object, or NULL in case of error
|
* @return a new MuStore object, or NULL in case of error
|
||||||
*/
|
*/
|
||||||
MuStore* mu_store_new (const char *path, GError **err)
|
MuStore* mu_store_new (const char *path, guint batchsize, GError **err)
|
||||||
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
|
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -113,7 +113,7 @@ test_mu_index (void)
|
||||||
xpath = g_strdup_printf ("%s%c%s", muhome, G_DIR_SEPARATOR,
|
xpath = g_strdup_printf ("%s%c%s", muhome, G_DIR_SEPARATOR,
|
||||||
"xapian");
|
"xapian");
|
||||||
|
|
||||||
store = mu_store_new (xpath, NULL);
|
store = mu_store_new (xpath, 0, NULL);
|
||||||
g_assert (store);
|
g_assert (store);
|
||||||
|
|
||||||
g_assert_cmpuint (mu_store_count (store), ==, 4);
|
g_assert_cmpuint (mu_store_count (store), ==, 4);
|
||||||
|
|
|
@ -42,7 +42,7 @@ test_mu_store_new_destroy (void)
|
||||||
g_assert (tmpdir);
|
g_assert (tmpdir);
|
||||||
|
|
||||||
err = NULL;
|
err = NULL;
|
||||||
store = mu_store_new (tmpdir, &err);
|
store = mu_store_new (tmpdir, 12345, &err);
|
||||||
g_assert (store);
|
g_assert (store);
|
||||||
g_assert (err == NULL);
|
g_assert (err == NULL);
|
||||||
|
|
||||||
|
@ -68,7 +68,7 @@ test_mu_store_version (void)
|
||||||
g_assert (tmpdir);
|
g_assert (tmpdir);
|
||||||
|
|
||||||
err = NULL;
|
err = NULL;
|
||||||
store = mu_store_new (tmpdir, &err);
|
store = mu_store_new (tmpdir, 789, &err);
|
||||||
g_assert (store);
|
g_assert (store);
|
||||||
g_assert (err == NULL);
|
g_assert (err == NULL);
|
||||||
|
|
||||||
|
@ -94,7 +94,7 @@ test_mu_store_store_and_count (void)
|
||||||
tmpdir = test_mu_common_get_random_tmpdir();
|
tmpdir = test_mu_common_get_random_tmpdir();
|
||||||
g_assert (tmpdir);
|
g_assert (tmpdir);
|
||||||
|
|
||||||
store = mu_store_new (tmpdir, NULL);
|
store = mu_store_new (tmpdir, 1, NULL);
|
||||||
g_assert (store);
|
g_assert (store);
|
||||||
|
|
||||||
g_assert_cmpuint (0,==,mu_store_count (store));
|
g_assert_cmpuint (0,==,mu_store_count (store));
|
||||||
|
@ -142,7 +142,7 @@ test_mu_store_store_remove_and_count (void)
|
||||||
tmpdir = test_mu_common_get_random_tmpdir();
|
tmpdir = test_mu_common_get_random_tmpdir();
|
||||||
g_assert (tmpdir);
|
g_assert (tmpdir);
|
||||||
|
|
||||||
store = mu_store_new (tmpdir, NULL);
|
store = mu_store_new (tmpdir, 0, NULL);
|
||||||
g_assert (store);
|
g_assert (store);
|
||||||
|
|
||||||
g_assert_cmpuint (0,==,mu_store_count (store));
|
g_assert_cmpuint (0,==,mu_store_count (store));
|
||||||
|
|
Loading…
Reference in New Issue