mu/lib/mu-query.cc

513 lines
14 KiB
C++
Raw Normal View History

/*
2017-10-24 21:57:57 +02:00
** Copyright (C) 2008-2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include <cctype>
#include <cstring>
#include <stdexcept>
#include <string>
#include <utility>
2017-10-24 21:57:57 +02:00
#include <sstream>
2017-10-24 21:57:57 +02:00
#include <stdlib.h>
#include <xapian.h>
#include <glib/gstdio.h>
2010-08-25 20:46:16 +02:00
#include "mu-query.h"
#include "mu-msg-fields.h"
2010-08-25 20:29:53 +02:00
#include "mu-msg-iter.h"
#include "mu-util.h"
2010-11-22 23:44:18 +01:00
#include "mu-str.h"
#include "mu-date.h"
2017-10-24 21:57:57 +02:00
#include <parser/proc-iface.hh>
#include <parser/utils.hh>
#include <parser/xapian.hh>
2017-10-24 21:57:57 +02:00
struct MuProc: public Mux::ProcIface {
2017-10-24 21:57:57 +02:00
MuProc (const Xapian::Database& db): db_{db} {}
2017-10-24 21:57:57 +02:00
static MuMsgFieldId field_id (const std::string& field) {
2017-10-24 21:57:57 +02:00
if (field.empty())
return MU_MSG_FIELD_ID_NONE;
2017-10-24 21:57:57 +02:00
MuMsgFieldId id = mu_msg_field_id_from_name (field.c_str(), FALSE);
if (id != MU_MSG_FIELD_ID_NONE)
return id;
else if (field.length() == 1)
2017-10-24 21:57:57 +02:00
return mu_msg_field_id_from_shortcut (field[0], FALSE);
else
return MU_MSG_FIELD_ID_NONE;
}
2017-10-24 21:57:57 +02:00
std::string
process_value (const std::string& field,
const std::string& value) const override {
const auto id = field_id (field);
if (id == MU_MSG_FIELD_ID_NONE)
return value;
switch(id) {
case MU_MSG_FIELD_ID_PRIO: {
if (!value.empty())
return std::string(1, value[0]);
} break;
case MU_MSG_FIELD_ID_FLAGS: {
const auto flag = mu_flag_char_from_name (value.c_str());
if (flag)
return std::string(1, tolower(flag));
} break;
default:
break;
}
2017-10-24 21:57:57 +02:00
return value; // XXX prio/flags, etc. alias
}
2017-10-24 21:57:57 +02:00
void add_field (std::vector<FieldInfo>& fields, MuMsgFieldId id) const {
2017-10-24 21:57:57 +02:00
const auto shortcut = mu_msg_field_shortcut(id);
if (!shortcut)
return; // can't be searched
2017-10-24 21:57:57 +02:00
const auto name = mu_msg_field_name (id);
const auto pfx = mu_msg_field_xapian_prefix (id);
2017-10-24 21:57:57 +02:00
if (!name || !pfx)
return;
fields.push_back ({{name}, {pfx},
2017-10-30 07:23:51 +01:00
(bool)mu_msg_field_xapian_index(id),
id});
}
2017-10-24 21:57:57 +02:00
std::vector<FieldInfo>
process_field (const std::string& field) const override {
std::vector<FieldInfo> fields;
if (field == "contact" || field == "recip") { // multi fields
add_field (fields, MU_MSG_FIELD_ID_TO);
add_field (fields, MU_MSG_FIELD_ID_CC);
add_field (fields, MU_MSG_FIELD_ID_BCC);
if (field == "contact")
add_field (fields, MU_MSG_FIELD_ID_FROM);
} else if (field == "") {
add_field (fields, MU_MSG_FIELD_ID_TO);
add_field (fields, MU_MSG_FIELD_ID_CC);
add_field (fields, MU_MSG_FIELD_ID_BCC);
add_field (fields, MU_MSG_FIELD_ID_FROM);
add_field (fields, MU_MSG_FIELD_ID_SUBJECT);
add_field (fields, MU_MSG_FIELD_ID_BODY_TEXT);
} else {
const auto id = field_id (field.c_str());
if (id != MU_MSG_FIELD_ID_NONE)
add_field (fields, id);
}
2017-10-24 21:57:57 +02:00
return fields;
}
2017-10-24 21:57:57 +02:00
bool is_range_field (const std::string& field) const override {
const auto id = field_id (field.c_str());
if (id == MU_MSG_FIELD_ID_NONE)
return false;
2017-10-24 21:57:57 +02:00
else
return mu_msg_field_is_range_field (id);
}
2017-10-24 21:57:57 +02:00
Range process_range (const std::string& field, const std::string& lower,
const std::string& upper) const override {
2017-10-24 21:57:57 +02:00
const auto id = field_id (field.c_str());
if (id == MU_MSG_FIELD_ID_NONE)
return { lower, upper };
2017-10-24 21:57:57 +02:00
std::string l2 = lower;
std::string u2 = upper;
2017-10-24 21:57:57 +02:00
if (id == MU_MSG_FIELD_ID_DATE) {
l2 = Mux::date_to_time_t_string (lower, true);
u2 = Mux::date_to_time_t_string (upper, false);
} else if (id == MU_MSG_FIELD_ID_SIZE) {
l2 = Mux::size_to_string (lower, true);
u2 = Mux::size_to_string (upper, false);
}
2017-10-24 21:57:57 +02:00
return { l2, u2 };
}
std::vector<std::string>
process_regex (const std::string& field, const std::regex& rx) const override {
2017-10-24 21:57:57 +02:00
const auto id = field_id (field.c_str());
if (id == MU_MSG_FIELD_ID_NONE)
return {};
2017-10-24 21:57:57 +02:00
char pfx[] = { mu_msg_field_xapian_prefix(id), '\0' };
2017-10-24 21:57:57 +02:00
std::vector<std::string> terms;
for (auto it = db_.allterms_begin(pfx); it != db_.allterms_end(pfx); ++it) {
if (std::regex_search((*it).c_str() + 1, rx)) // avoid copy
terms.push_back(*it);
}
2017-10-24 21:57:57 +02:00
return terms;
}
2017-10-24 21:57:57 +02:00
const Xapian::Database& db_;
};
2017-10-24 21:58:32 +02:00
struct _MuQuery {
2017-10-24 21:57:57 +02:00
public:
_MuQuery (MuStore *store): _store(mu_store_ref(store)) {}
~_MuQuery () { mu_store_unref (_store); }
Xapian::Database& db() const {
2017-10-24 21:57:57 +02:00
const auto db = reinterpret_cast<Xapian::Database*>
(mu_store_get_read_only_database (_store));
if (!db)
throw std::runtime_error ("no database");
return *db;
}
private:
MuStore *_store;
};
/*
 * Parse 'searchexpr' with the Mux parser and turn the resulting tree
 * into a Xapian query. Parser warnings are written to stderr.
 *
 * 'raw' is not used here; callers handle raw queries themselves.
 *
 * If anything throws, mu_util_g_set_error is called on 'err' with
 * MU_ERROR_XAPIAN_QUERY and the exception is re-thrown to the caller.
 */
static const Xapian::Query
get_query (MuQuery *mqx, const char* searchexpr, bool raw, GError **err) try {

	Mux::WarningVec warns;
	const auto tree = Mux::parse (searchexpr, warns,
				      std::make_unique<MuProc>(mqx->db()));

	// iterate by reference; no need to copy each warning
	for (const auto& w: warns)
		std::cerr << w << std::endl;

	return Mux::xapian_query (tree);

} catch (...) {
	mu_util_g_set_error (err,MU_ERROR_XAPIAN_QUERY,
			     "parse error in query");
	throw;
}
2010-08-25 20:46:16 +02:00
/*
 * Create a new MuQuery for 'store'.
 *
 * Returns NULL (with 'err' set to MU_ERROR_XAPIAN_IS_EMPTY) when the
 * store reports a message count of zero, and NULL when constructing the
 * query object throws; in the latter case the catch macro is handed
 * 'err' and MU_ERROR_XAPIAN.
 */
MuQuery*
mu_query_new (MuStore *store, GError **err)
{
	g_return_val_if_fail (store, NULL);

	const bool store_is_empty = (mu_store_count (store, err) == 0);
	if (store_is_empty) {
		g_set_error (err, MU_ERROR_DOMAIN, MU_ERROR_XAPIAN_IS_EMPTY,
			     "database is empty");
		return NULL;
	}

	try {
		MuQuery *query = new MuQuery (store);
		return query;

	} MU_XAPIAN_CATCH_BLOCK_G_ERROR_RETURN (err, MU_ERROR_XAPIAN, 0);

	return NULL;
}
void
2010-08-25 20:46:16 +02:00
mu_query_destroy (MuQuery *self)
{
try { delete self; } MU_XAPIAN_CATCH_BLOCK;
}
/*
 * Recovery path for when a DatabaseModified exception was raised while
 * running a query: reopen the database, then run the very same query
 * once more through mu_query_run.
 *
 * Returns whatever mu_query_run returns; when something throws, the
 * catch macro is handed 'err' and MU_ERROR_XAPIAN, with 0 as the return
 * value.
 */
static MuMsgIter *
try_requery (MuQuery *self, const char* searchexpr, MuMsgFieldId sortfieldid,
	     int maxnum, MuQueryFlags flags, GError **err)
{
	try {
		/* the retry goes through mu_query_run again; we assume
		 * this cannot keep recursing forever */
		Xapian::Database& database = self->db();
		database.reopen();

		MU_WRITE_LOG ("reopening db after modification");

		return mu_query_run (self, searchexpr, sortfieldid,
				     maxnum, flags, err);

	} MU_XAPIAN_CATCH_BLOCK_G_ERROR_RETURN (err, MU_ERROR_XAPIAN, 0);
}
/*
 * Translate query flags into the corresponding message-iterator flags.
 * Only the flags listed in the table below are carried over; all other
 * query flags are ignored.
 */
static MuMsgIterFlags
msg_iter_flags (MuQueryFlags flags)
{
	static const struct {
		MuQueryFlags   query_flag;
		MuMsgIterFlags iter_flag;
	} flag_map[] = {
		{ MU_QUERY_FLAG_DESCENDING,      MU_MSG_ITER_FLAG_DESCENDING },
		{ MU_QUERY_FLAG_SKIP_UNREADABLE, MU_MSG_ITER_FLAG_SKIP_UNREADABLE },
		{ MU_QUERY_FLAG_SKIP_DUPS,       MU_MSG_ITER_FLAG_SKIP_DUPS },
		{ MU_QUERY_FLAG_THREADS,         MU_MSG_ITER_FLAG_THREADS },
	};

	MuMsgIterFlags result = MU_MSG_ITER_FLAG_NONE;

	for (const auto& entry: flag_map) {
		if (flags & entry.query_flag)
			result |= entry.iter_flag;
	}

	return result;
}
2012-08-30 23:22:18 +02:00
/*
 * Set up a Xapian::Enquire on the query's database for 'searchexpr':
 *  - raw:                 the expression is used verbatim as one term;
 *  - empty, NULL or "\"\"": match all messages;
 *  - otherwise:           the expression is parsed with get_query.
 *
 * 'sortfieldid' and 'descending' are accepted but not used here.
 *
 * If building the query throws, mu_util_g_set_error is called on 'err'
 * with MU_ERROR_XAPIAN_QUERY and the exception is re-thrown.
 */
static Xapian::Enquire
get_enquire (MuQuery *self, const char *searchexpr, MuMsgFieldId sortfieldid,
	     bool descending, bool raw, GError **err)
{
	Xapian::Enquire enquire (self->db());

	try {
		if (raw) {
			enquire.set_query (Xapian::Query (searchexpr));
		} else if (mu_str_is_empty (searchexpr) ||
			   g_strcmp0 (searchexpr, "\"\"") == 0) {
			/* NULL, "" or """" all mean "matchall" */
			enquire.set_query (Xapian::Query::MatchAll);
		} else {
			enquire.set_query (get_query (self, searchexpr, raw, err));
		}
	} catch (...) {
		mu_util_g_set_error (err, MU_ERROR_XAPIAN_QUERY,
				     "parse error in query");
		throw;
	}

	enquire.set_cutoff (0, 0);

	return enquire;
}
/*
 * Walk over all messages in 'iter' and gather two hash tables:
 *
 *  - the return value: the set of thread-ids seen (thread-id -> TRUE);
 *  - '*orig_set': a map msgid -> docid of the original matches, so that
 *    these are not treated as 'duplicates' later on; when skipping
 *    duplicates, the originals should win over related messages.
 *
 * Both tables free their keys with g_free; the caller is responsible
 * for destroying both tables.
 */
static GHashTable*
get_thread_ids (MuMsgIter *iter, GHashTable **orig_set)
{
	GHashTable *thread_ids =
		g_hash_table_new_full (g_str_hash, g_str_equal,
				       (GDestroyNotify)g_free, NULL);
	*orig_set =
		g_hash_table_new_full (g_str_hash, g_str_equal,
				       (GDestroyNotify)g_free, NULL);

	while (!mu_msg_iter_is_done (iter)) {

		/* remember the thread this message belongs to */
		char *thread_id = mu_msg_iter_get_thread_id (iter);
		if (thread_id)
			g_hash_table_insert (thread_ids, thread_id,
					     GSIZE_TO_POINTER(TRUE));

		/* remember the message itself as an original match; the
		 * msgid is only asked for when there is a valid docid */
		const unsigned docid = mu_msg_iter_get_docid (iter);
		if (docid != 0) {
			char *msgid = mu_msg_iter_get_msgid (iter);
			if (msgid)
				g_hash_table_insert (*orig_set, msgid,
						     GSIZE_TO_POINTER(docid));
		}

		if (!mu_msg_iter_next (iter))
			break;
	}

	return thread_ids;
}
/*
 * Build a query that matches every message belonging to one of the
 * threads seen in 'iter'. As a side effect, '*orig_set' receives the
 * msgid -> docid table of the original matches (see get_thread_ids);
 * the caller owns that table.
 */
static Xapian::Query
get_related_query (MuMsgIter *iter, GHashTable **orig_set)
{
	static const std::string pfx
		(1, mu_msg_field_xapian_prefix (MU_MSG_FIELD_ID_THREAD_ID));

	/* all thread-ids in the results, plus the original matches */
	GHashTable *thread_ids = get_thread_ids (iter, orig_set);
	GList *keys = g_hash_table_get_keys (thread_ids);

	/* one sub-query per thread-id, OR-ed together in one go at the
	 * end; this is /much/ faster than growing the query step by
	 * step as 'query = Query (OR, query)' */
	std::vector<Xapian::Query> subqueries;
	for (GList *node = keys; node != NULL; node = node->next) {
		const char *thread_id = static_cast<const char*>(node->data);
		subqueries.push_back (Xapian::Query (pfx + thread_id));
	}

	g_hash_table_destroy (thread_ids);
	g_list_free (keys);

	return Xapian::Query (Xapian::Query::OP_OR,
			      subqueries.begin(), subqueries.end());
}
static void
Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. 
This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s.
2018-07-09 05:04:24 +02:00
get_related_messages (MuQuery *self, MuMsgIter **iter, int maxnum,
MuMsgFieldId sortfieldid, MuQueryFlags flags,
Xapian::Query orig_query)
{
GHashTable *orig_set;
Xapian::Enquire enq (self->db());
MuMsgIter *rel_iter;
Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. 
This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s.
2018-07-09 05:04:24 +02:00
const bool inc_related = flags & MU_QUERY_FLAG_INCLUDE_RELATED;
orig_set = NULL;
Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. 
This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s.
2018-07-09 05:04:24 +02:00
Xapian::Query new_query = get_related_query (*iter, &orig_set);
/* If related message are not desired, filter out messages which would not
have matched the original query.
*/
if (!inc_related)
new_query = Xapian::Query (Xapian::Query::OP_AND, orig_query, new_query);
enq.set_query(new_query);
enq.set_cutoff(0,0);
rel_iter= mu_msg_iter_new (
reinterpret_cast<XapianEnquire*>(&enq),
maxnum,
sortfieldid,
msg_iter_flags (flags),
NULL);
mu_msg_iter_destroy (*iter);
2013-05-04 13:14:58 +02:00
// set the preferred set for the iterator (ie., the set of
// messages not considered to be duplicates) to be the
// original matches -- the matches without considering
// 'related'
mu_msg_iter_set_preferred (rel_iter, orig_set);
g_hash_table_destroy (orig_set);
*iter = rel_iter;
}
2010-08-25 20:29:53 +02:00
MuMsgIter*
mu_query_run (MuQuery *self, const char *searchexpr, MuMsgFieldId sortfieldid,
int maxnum, MuQueryFlags flags, GError **err)
{
g_return_val_if_fail (self, NULL);
g_return_val_if_fail (searchexpr, NULL);
g_return_val_if_fail (mu_msg_field_id_is_valid (sortfieldid) ||
sortfieldid == MU_MSG_FIELD_ID_NONE,
2010-11-23 22:52:37 +01:00
NULL);
try {
Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. 
This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s.
2018-07-09 05:04:24 +02:00
MuMsgIter *iter;
MuQueryFlags first_flags;
const bool threads = flags & MU_QUERY_FLAG_THREADS;
const bool inc_related = flags & MU_QUERY_FLAG_INCLUDE_RELATED;
const bool descending = flags & MU_QUERY_FLAG_DESCENDING;
const bool raw = flags & MU_QUERY_FLAG_RAW;
2012-12-20 22:37:04 +01:00
Xapian::Enquire enq (get_enquire(self, searchexpr, sortfieldid,
descending, raw, err));
/* when we're doing an 'include-related query', we're actually
* doing /two/ queries; one to get the initial matches, and
* based on that one to get all messages in threads in those
* matches.
*/
/* get the 'real' maxnum if it was specified as < 0 */
maxnum = maxnum < 0 ? self->db().get_doccount() : maxnum;
Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. 
This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s.
2018-07-09 05:04:24 +02:00
/* Calculating threads involves two queries, so do the calculation only in
* the second query instead of in both.
*/
Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. 
This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s.
2018-07-09 05:04:24 +02:00
if (threads)
2017-10-24 21:57:57 +02:00
first_flags = (MuQueryFlags)(flags & ~MU_QUERY_FLAG_THREADS);
else
first_flags = flags;
Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. 
This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s.
2018-07-09 05:04:24 +02:00
/* Perform the initial query, returning up to max num results.
*/
iter = mu_msg_iter_new (
2012-03-13 23:07:29 +01:00
reinterpret_cast<XapianEnquire*>(&enq),
maxnum,
Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. 
This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s.
2018-07-09 05:04:24 +02:00
sortfieldid,
msg_iter_flags (first_flags),
err);
Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. The improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. 
This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message sets. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significant improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s.
2018-07-09 05:04:24 +02:00
/* If we want threads or related messages, find related messages using a
* second query based on the message ids / refs of the first query's result.
* Do this even if we don't want to include related messages in the final
* result so we can apply the threading algorithm to the related message set
* of a maxnum-sized result instead of the unbounded result of the first
* query. If threads are desired but related messages are not, we will remove
* the undesired related messages later.
*/
if(threads||inc_related)
get_related_messages (self, &iter, maxnum, sortfieldid, flags,
enq.get_query());
if (err && *err && (*err)->code == MU_ERROR_XAPIAN_MODIFIED) {
g_clear_error (err);
return try_requery (self, searchexpr, sortfieldid,
maxnum, flags, err);
} else
return iter;
2011-09-12 19:48:30 +02:00
} MU_XAPIAN_CATCH_BLOCK_G_ERROR_RETURN (err, MU_ERROR_XAPIAN, 0);
}
2011-09-12 19:48:30 +02:00
char*
2017-10-24 21:57:57 +02:00
mu_query_internal_xapian (MuQuery *self, const char *searchexpr, GError **err)
{
g_return_val_if_fail (self, NULL);
g_return_val_if_fail (searchexpr, NULL);
try {
Xapian::Query query (get_query(self, searchexpr, false, err));
return g_strdup(query.get_description().c_str());
} MU_XAPIAN_CATCH_BLOCK_RETURN(NULL);
}
2017-10-24 21:57:57 +02:00
/*
 * Parse searchexpr with the Mux parser and return the resulting parse tree
 * in its textual (operator<<) form.
 *
 * @param self a MuQuery; its database is handed to the MuProc helper
 * @param searchexpr the search expression to parse
 * @param warn if non-zero, print every parser warning to stderr, one per line
 * @param err not referenced by the code visible in this function
 *
 * @return a newly allocated string (g_strdup; release with g_free), or NULL
 * when self or searchexpr is NULL. NOTE(review): the catch macro presumably
 * also yields NULL when an exception is thrown -- confirm in its definition.
 */
char*
mu_query_internal (MuQuery *self, const char *searchexpr,
		   gboolean warn, GError **err)
{
	g_return_val_if_fail (self, NULL);
	g_return_val_if_fail (searchexpr, NULL);

	try {
		Mux::WarningVec warns;
		const auto tree = Mux::parse (searchexpr, warns,
					      std::make_unique<MuProc>(self->db()));
		std::stringstream ss;
		ss << tree;

		if (warn) {
			/* bind by reference so each warning is not copied;
			 * std::cerr is unit-buffered, so '\n' is enough and
			 * no explicit flush (std::endl) is needed */
			for (const auto& w: warns)
				std::cerr << w << '\n';
		}

		return g_strdup (ss.str().c_str());

	} MU_XAPIAN_CATCH_BLOCK_RETURN(NULL);
}