From eb9bfbb1ca3c4c6fcfb63a41ee531076771631f4 Mon Sep 17 00:00:00 2001 From: Nicolas Avrutin Date: Sun, 8 Jul 2018 23:04:24 -0400 Subject: [PATCH] Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. This improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message set. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significantly improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. Perform threading calculation on related set instead of entire result. The current threading algorithm is applied to the entire result of a query, even if maxnum is specified, and then the result of the threading algorithm is truncated to maxnum. 
This improves threading results by returning the entire thread even when only a single message makes it into the top maxnum results. This commit applies the threading algorithm to the related message set of the maxnum-truncated query result instead of to the entire query result. For a given set of messages, the set of messages which will share threads with any of the original messages is exactly the related message set. Put another way, either any messages returned by the original query but removed by the maxnum truncation will also be returned by the related message query, or they would not have been needed anyway because they would not be members of any visible thread. To maintain backward compatibility and allow threading to be used without including related messages, the related message set is found for the threading calculation, but any messages which would not have matched the original query are then pruned, resulting in a superset of the truncated query, but a subset of the untruncated query. This does not improve (or degrade) the run time of a threading calculation when maxnum is not set, but significantly improves it when maxnum is set by making it scale (roughly) linearly in terms of maxnum. On a maildir with ~200k messages and maxnum set to 500 (the default), the run time of a threading calculation is lowered from ~1m to ~0.1s. 
--- lib/mu-query.cc | 60 ++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/lib/mu-query.cc b/lib/mu-query.cc index fd51fc7f..a7e0584f 100644 --- a/lib/mu-query.cc +++ b/lib/mu-query.cc @@ -369,15 +369,23 @@ get_related_query (MuMsgIter *iter, GHashTable **orig_set) static void -include_related (MuQuery *self, MuMsgIter **iter, int maxnum, - MuMsgFieldId sortfieldid, MuQueryFlags flags) +get_related_messages (MuQuery *self, MuMsgIter **iter, int maxnum, + MuMsgFieldId sortfieldid, MuQueryFlags flags, + Xapian::Query orig_query) { GHashTable *orig_set; Xapian::Enquire enq (self->db()); MuMsgIter *rel_iter; + const bool inc_related = flags & MU_QUERY_FLAG_INCLUDE_RELATED; orig_set = NULL; - enq.set_query(get_related_query (*iter, &orig_set)); + Xapian::Query new_query = get_related_query (*iter, &orig_set); + /* If related message are not desired, filter out messages which would not + have matched the original query. + */ + if (!inc_related) + new_query = Xapian::Query (Xapian::Query::OP_AND, orig_query, new_query); + enq.set_query(new_query); enq.set_cutoff(0,0); rel_iter= mu_msg_iter_new ( @@ -410,11 +418,12 @@ mu_query_run (MuQuery *self, const char *searchexpr, MuMsgFieldId sortfieldid, sortfieldid == MU_MSG_FIELD_ID_NONE, NULL); try { - MuMsgIter *iter; - MuQueryFlags first_flags; - const auto inc_related = flags & MU_QUERY_FLAG_INCLUDE_RELATED; - const auto descending = flags & MU_QUERY_FLAG_DESCENDING; - const auto raw = flags & MU_QUERY_FLAG_RAW; + MuMsgIter *iter; + MuQueryFlags first_flags; + const bool threads = flags & MU_QUERY_FLAG_THREADS; + const bool inc_related = flags & MU_QUERY_FLAG_INCLUDE_RELATED; + const bool descending = flags & MU_QUERY_FLAG_DESCENDING; + const bool raw = flags & MU_QUERY_FLAG_RAW; Xapian::Enquire enq (get_enquire(self, searchexpr, sortfieldid, descending, raw, err)); @@ -426,31 +435,32 @@ mu_query_run (MuQuery *self, const char *searchexpr, MuMsgFieldId 
sortfieldid, /* get the 'real' maxnum if it was specified as < 0 */ maxnum = maxnum < 0 ? self->db().get_doccount() : maxnum; - /* if we do a include-related query, it's wasted - * effort to calculate threads already in the first - * query since we can do it in the second one + /* Calculating threads involves two queries, so do the calculation only in + * the second query instead of in both. */ - if (inc_related) + if (threads) first_flags = (MuQueryFlags)(flags & ~MU_QUERY_FLAG_THREADS); else first_flags = flags; - - iter = mu_msg_iter_new ( + /* Perform the initial query, returning up to max num results. + */ + iter = mu_msg_iter_new ( reinterpret_cast(&enq), maxnum, - /* with inc_related, we do the sorting in the - * second query - */ - inc_related ? MU_MSG_FIELD_ID_NONE : sortfieldid, + sortfieldid, msg_iter_flags (first_flags), err); - /* - * if we want related messages, do a second query, - * based on the message ids / refs of the first one - * */ - if (inc_related) - include_related (self, &iter, maxnum, sortfieldid, - flags); + /* If we want threads or related messages, find related messages using a + * second query based on the message ids / refs of the first query's result. + * Do this even if we don't want to include related messages in the final + * result so we can apply the threading algorithm to the related message set + * of a maxnum-sized result instead of the unbounded result of the first + * query. If threads are desired but related message are not, we will remove + * the undesired related messages later. + */ + if(threads||inc_related) + get_related_messages (self, &iter, maxnum, sortfieldid, flags, + enq.get_query()); if (err && *err && (*err)->code == MU_ERROR_XAPIAN_MODIFIED) { g_clear_error (err);