* mu-query/mu-msg-iter: when showing related messages (--include-related),

favor the ones that were in the original set
This commit is contained in:
djcb 2012-12-28 14:48:00 +02:00
parent a0d8d4f5da
commit 70356a62f5
3 changed files with 107 additions and 42 deletions

View File

@ -30,12 +30,22 @@
#include <string>
#include <set>
#include <map>
#include "mu-util.h"
#include "mu-msg.h"
#include "mu-msg-iter.h"
#include "mu-threader.h"
// Ordering functor for std::string keys: the underlying C strings are
// compared with g_strcmp0, and s1 is ordered before s2 when that
// comparison yields a negative value.
struct ltstr {
	bool operator () (const std::string &s1,
			  const std::string &s2) const {
		const int order = g_strcmp0 (s1.c_str(), s2.c_str());
		return order < 0;
	}
};
// map of message-id --> docid, with its keys ordered by ltstr
typedef std::map <std::string, unsigned, ltstr> msgid_docid_map;
class ThreadKeyMaker: public Xapian::KeyMaker {
public:
ThreadKeyMaker (GHashTable *threadinfo): _threadinfo(threadinfo) {}
@ -56,7 +66,7 @@ public:
MuMsgFieldId sortfield, MuMsgIterFlags flags):
_enq(enq), _thread_hash (0), _msg(0), _flags(flags),
_skip_unreadable(flags & MU_MSG_ITER_FLAG_SKIP_UNREADABLE),
_skip_dups (flags & MU_MSG_ITER_FLAG_SKIP_DUPS) {
_skip_dups (flags & MU_MSG_ITER_FLAG_SKIP_DUPS){
bool descending = (flags & MU_MSG_ITER_FLAG_DESCENDING);
bool threads = (flags & MU_MSG_ITER_FLAG_THREADS);
@ -71,24 +81,22 @@ public:
if (threads) {
_matches.fetch();
_cursor = _matches.begin();
{ // temporarily turn-off skipping dups
_skip_dups = FALSE;
_thread_hash = mu_threader_calculate
(this, _matches.size(), sortfield, descending);
_skip_dups = (flags & MU_MSG_ITER_FLAG_SKIP_DUPS);
}
// NOTE: temporarily turn-off skipping duplicates, since we
// need threadinfo for *all*
_skip_dups = false;
_thread_hash = mu_threader_calculate
(this, _matches.size(), sortfield, descending);
_skip_dups = (flags & MU_MSG_ITER_FLAG_SKIP_DUPS);
ThreadKeyMaker keymaker(_thread_hash);
enq.set_sort_by_key (&keymaker, false);
_matches = _enq.get_mset (0, maxnum);
} else if (sortfield != MU_MSG_FIELD_ID_NONE) {
enq.set_sort_by_value ((Xapian::valueno)sortfield,
descending);
_matches = _enq.get_mset (0, maxnum);
_cursor = _matches.begin();
}
_cursor = _matches.begin();
}
@ -120,12 +128,30 @@ public:
bool looks_like_dup () const {
try {
const Xapian::Document doc (cursor().get_document());
const std::string msg_uid
(doc.get_value(MU_MSG_FIELD_ID_MSGID));
if (_msg_uid_set.find (msg_uid) != _msg_uid_set.end()) {
const std::string msgid (doc.get_value(MU_MSG_FIELD_ID_MSGID));
unsigned docid (doc.get_docid());
if (msgid.empty())
return false;
// is this msgid in the preferred map? if so, only the
// document with the preferred docid is kept (not a dup);
// any other document with the same msgid is a duplicate
msgid_docid_map::const_iterator pref_iter (_preferred_map.find (msgid));
if (pref_iter != _preferred_map.end()) {
//std::cerr << "in the set!" << std::endl;
if ((*pref_iter).second == docid)
return false; // in the set: not a dup!
else
return true;
}
// otherwise, simply check if we've already seen this message-id,
// and, if so, it's considered a dup
if (_msg_uid_set.find (msgid) != _msg_uid_set.end()) {
return true;
} else {
_msg_uid_set.insert (msg_uid);
_msg_uid_set.insert (msgid);
return false;
}
} catch (...) {
@ -133,6 +159,17 @@ public:
}
}
static void each_preferred (const char *msgid, gpointer docidp, msgid_docid_map *preferred_map) {
(*preferred_map)[msgid] = GPOINTER_TO_SIZE(docidp);
}
// Update the preferred map (msgid->docid) from a GHashTable.
//
// With a NULL preferred_hash the map is emptied. Otherwise
// each_preferred is run for every entry of the hash, with the
// preferred map as its user data; note that in this case the map is
// not emptied first, so entries already present stay unless the hash
// contains the same msgid.
void set_preferred_map (GHashTable *preferred_hash) {
	if (preferred_hash) {
		GHFunc copy_entry ((GHFunc)each_preferred);
		g_hash_table_foreach (preferred_hash, copy_entry,
				      &_preferred_map);
		return;
	}
	// no hash given: forget all preferred messages
	_preferred_map.clear();
}
// current value of the skip-duplicates setting (_skip_dups) of this
// iterator
bool skip_dups () const {
	return _skip_dups;
}
// current value of the skip-unreadable setting (_skip_unreadable) of
// this iterator
bool skip_unreadable () const {
	return _skip_unreadable;
}
@ -147,15 +184,15 @@ private:
MuMsgIterFlags _flags;
struct ltstr {
bool operator () (const std::string &s1,
const std::string &s2) const {
return g_strcmp0 (s1.c_str(), s2.c_str()) < 0;
}
};
mutable std::set <std::string, ltstr> _msg_uid_set;
bool _skip_unreadable;
bool _skip_unreadable, _skip_dups;
// the 'preferred map' (msgid->docid) is used when checking
// for duplicates; if a message is in the preferred map, it
// will not be excluded (but other messages with the same
// msgid will)
msgid_docid_map _preferred_map;
bool _skip_dups;
};
@ -212,21 +249,17 @@ mu_msg_iter_destroy (MuMsgIter *iter)
try { delete iter; } MU_XAPIAN_CATCH_BLOCK;
}
/* mu_msg_iter_set_preferred: pass the preferred msgid->docid hash on to
 * the iterator's set_preferred_map; nothing is done when iter is NULL.
 *
 * NOTE(review): this listing appears to interleave the removed
 * mu_msg_iter_set_skip_duplicates lines with the added
 * mu_msg_iter_set_preferred ones -- confirm against the actual file. */
void
mu_msg_iter_set_skip_duplicates (MuMsgIter *iter, gboolean skip_duplicates,
GHashTable *preferred_set)
mu_msg_iter_set_preferred (MuMsgIter *iter, GHashTable *preferred_hash)
{
g_return_if_fail (iter);
g_return_if_fail (!skip_duplicates && preferred_set);
iter->set_preferred_map (preferred_hash);
}
MuMsg*
mu_msg_iter_get_msg_floating (MuMsgIter *iter)
{
@ -306,7 +339,7 @@ mu_msg_iter_is_done (MuMsgIter *iter)
/* hmmm.... is it impossible to get a 0 docid, or just very improbable? */
unsigned int
unsigned
mu_msg_iter_get_docid (MuMsgIter *iter)
{
g_return_val_if_fail (iter, (unsigned int)-1);
@ -368,8 +401,6 @@ mu_msg_iter_get_thread_id (MuMsgIter *iter)
}
const MuMsgIterThreadInfo*
mu_msg_iter_get_thread_info (MuMsgIter *iter)
{
@ -385,7 +416,7 @@ mu_msg_iter_get_thread_info (MuMsgIter *iter)
(iter->thread_hash(), GUINT_TO_POINTER(docid));
if (!ti)
g_printerr ("no ti for %u\n", docid);
g_warning ("no ti for %u\n", docid);
return ti;

View File

@ -51,8 +51,6 @@ enum _MuMsgIterFlags {
};
typedef unsigned MuMsgIterFlags;
/**
* create a new MuMsgIter -- basically, an iterator over the search
* results
@ -128,6 +126,18 @@ MuMsg* mu_msg_iter_get_msg_floating (MuMsgIter *iter)
/**
* Provide a preferred_hash, which is a hashtable msgid->docid to
* indicate the messages which should /not/ be seen as duplicates.
* When checking for duplicates, the message with the given docid is
* kept, while other messages with the same msgid are considered
* duplicates. The entries are copied into the iterator, so the
* hashtable itself is not needed after this call.
*
* @param iter a valid MuMsgIter iterator
* @param preferred_hash a hashtable msgid->docid of messages /not/ to
* mark as duplicates, or NULL to clear the iterator's preferred set
*/
void mu_msg_iter_set_preferred (MuMsgIter *iter, GHashTable *preferred_hash);
/**
* get the document id for the current message
*

View File

@ -397,20 +397,34 @@ get_enquire (MuQuery *self, const char *searchexpr, MuMsgFieldId sortfieldid,
}
/*
* record all threadids for the messages
* record all threadids for the messages; also 'orig_set' receives all
* original matches (a map msgid-->docid), so we can make sure the
* originals are not seen as 'duplicates' later (when skipping
* duplicates). We want to favor the originals over the related
* messages, when skipping duplicates.
*/
static GHashTable*
get_thread_ids (MuMsgIter *iter)
get_thread_ids (MuMsgIter *iter, GHashTable **orig_set)
{
GHashTable *ids;
ids = g_hash_table_new_full (g_str_hash, g_str_equal,
(GDestroyNotify)g_free, NULL);
ids = g_hash_table_new_full (g_str_hash, g_str_equal,
(GDestroyNotify)g_free, NULL);
*orig_set = g_hash_table_new_full (g_str_hash, g_str_equal,
(GDestroyNotify)g_free, NULL);
while (!mu_msg_iter_is_done (iter)) {
const char *thread_id;
const char *thread_id, *msgid;
unsigned docid;
/* record the thread id for the message */
if ((thread_id = mu_msg_iter_get_thread_id (iter)))
g_hash_table_insert (ids, g_strdup (thread_id),
GSIZE_TO_POINTER(TRUE));
/* record the original set */
docid = mu_msg_iter_get_docid(iter);
if (docid != 0 && (msgid = mu_msg_iter_get_msgid (iter)))
g_hash_table_insert (*orig_set, g_strdup (msgid),
GSIZE_TO_POINTER(docid));
if (!mu_msg_iter_next (iter))
break;
}
@ -420,7 +434,7 @@ get_thread_ids (MuMsgIter *iter)
static Xapian::Query
get_related_query (MuMsgIter *iter)
get_related_query (MuMsgIter *iter, GHashTable **orig_set)
{
GHashTable *hash;
GList *id_list, *cur;
@ -428,7 +442,9 @@ get_related_query (MuMsgIter *iter)
static std::string pfx (1, mu_msg_field_xapian_prefix
(MU_MSG_FIELD_ID_THREAD_ID));
hash = get_thread_ids (iter);
/* orig_set receives the hash msgid->docid of the set of
* original matches */
hash = get_thread_ids (iter, orig_set);
/* id_list now gets a list of all thread-ids seen in the query
* results; either in the Message-Id field or in
* References. */
@ -451,10 +467,12 @@ static void
include_related (MuQuery *self, MuMsgIter **iter, int maxnum,
MuMsgFieldId sortfieldid, MuQueryFlags flags)
{
GHashTable *orig_set;
Xapian::Enquire enq (self->db());
MuMsgIter *rel_iter;
enq.set_query(get_related_query (*iter));
orig_set = NULL;
enq.set_query(get_related_query (*iter, &orig_set));
enq.set_cutoff(0,0);
rel_iter= mu_msg_iter_new (
@ -465,6 +483,12 @@ include_related (MuQuery *self, MuMsgIter **iter, int maxnum,
NULL);
mu_msg_iter_destroy (*iter);
// set the preferred set for the iterator (i.e., the set not
// considered to be duplicates) to be the original matches
mu_msg_iter_set_preferred (rel_iter, orig_set);
g_hash_table_destroy (orig_set);
*iter = rel_iter;
}