diff --git a/lib/mu-msg-iter.cc b/lib/mu-msg-iter.cc index dc1ee11e..c6c5061d 100644 --- a/lib/mu-msg-iter.cc +++ b/lib/mu-msg-iter.cc @@ -30,12 +30,22 @@ #include #include +#include #include "mu-util.h" #include "mu-msg.h" #include "mu-msg-iter.h" #include "mu-threader.h" + +struct ltstr { + bool operator () (const std::string &s1, + const std::string &s2) const { + return g_strcmp0 (s1.c_str(), s2.c_str()) < 0; + } +}; +typedef std::map msgid_docid_map; + class ThreadKeyMaker: public Xapian::KeyMaker { public: ThreadKeyMaker (GHashTable *threadinfo): _threadinfo(threadinfo) {} @@ -56,7 +66,7 @@ public: MuMsgFieldId sortfield, MuMsgIterFlags flags): _enq(enq), _thread_hash (0), _msg(0), _flags(flags), _skip_unreadable(flags & MU_MSG_ITER_FLAG_SKIP_UNREADABLE), - _skip_dups (flags & MU_MSG_ITER_FLAG_SKIP_DUPS) { + _skip_dups (flags & MU_MSG_ITER_FLAG_SKIP_DUPS){ bool descending = (flags & MU_MSG_ITER_FLAG_DESCENDING); bool threads = (flags & MU_MSG_ITER_FLAG_THREADS); @@ -71,24 +81,22 @@ public: if (threads) { _matches.fetch(); _cursor = _matches.begin(); - { // temporarily turn-off skipping dups - _skip_dups = FALSE; - _thread_hash = mu_threader_calculate - (this, _matches.size(), sortfield, descending); - _skip_dups = (flags & MU_MSG_ITER_FLAG_SKIP_DUPS); - } + // NOTE: temporarily turn off skipping duplicates, since we + // need threadinfo for *all* matches, duplicates included + _skip_dups = false; + _thread_hash = mu_threader_calculate + (this, _matches.size(), sortfield, descending); + _skip_dups = (flags & MU_MSG_ITER_FLAG_SKIP_DUPS); ThreadKeyMaker keymaker(_thread_hash); enq.set_sort_by_key (&keymaker, false); _matches = _enq.get_mset (0, maxnum); - } else if (sortfield != MU_MSG_FIELD_ID_NONE) { enq.set_sort_by_value ((Xapian::valueno)sortfield, descending); _matches = _enq.get_mset (0, maxnum); _cursor = _matches.begin(); } - _cursor = _matches.begin(); } @@ -120,12 +128,30 @@ public: bool looks_like_dup () const { try { const Xapian::Document doc (cursor().get_document()); - const 
std::string msg_uid - (doc.get_value(MU_MSG_FIELD_ID_MSGID)); - if (_msg_uid_set.find (msg_uid) != _msg_uid_set.end()) { + const std::string msgid (doc.get_value(MU_MSG_FIELD_ID_MSGID)); + unsigned docid (doc.get_docid()); + + if (msgid.empty()) + return false; + + // is this message-id in the preferred map? if + // so, only the preferred docid is kept; any other + // message with that message-id is a duplicate + msgid_docid_map::const_iterator pref_iter (_preferred_map.find (msgid)); + if (pref_iter != _preferred_map.end()) { + //std::cerr << "in the set!" << std::endl; + if ((*pref_iter).second == docid) + return false; // in the set: not a dup! + else + return true; + } + + // otherwise, simply check if we've already seen this message-id, + // and, if so, it's considered a dup + if (_msg_uid_set.find (msgid) != _msg_uid_set.end()) { return true; } else { - _msg_uid_set.insert (msg_uid); + _msg_uid_set.insert (msgid); return false; } } catch (...) { @@ -133,6 +159,17 @@ public: } } + static void each_preferred (const char *msgid, gpointer docidp, msgid_docid_map *preferred_map) { + (*preferred_map)[msgid] = GPOINTER_TO_SIZE(docidp); + } + + void set_preferred_map (GHashTable *preferred_hash) { + if (!preferred_hash) + _preferred_map.clear(); + else + g_hash_table_foreach (preferred_hash, + (GHFunc)each_preferred, &_preferred_map); + } bool skip_dups () const { return _skip_dups; } bool skip_unreadable () const { return _skip_unreadable; } @@ -147,15 +184,15 @@ private: MuMsgIterFlags _flags; - struct ltstr { - bool operator () (const std::string &s1, - const std::string &s2) const { - return g_strcmp0 (s1.c_str(), s2.c_str()) < 0; - } - }; mutable std::set _msg_uid_set; + bool _skip_unreadable; - bool _skip_unreadable, _skip_dups; + // the 'preferred map' (msgid->docid) is used when checking + // for duplicates; if a message is in the preferred map, it + // will not be excluded (but other messages with the same + // msgid will) + msgid_docid_map _preferred_map; + bool _skip_dups; }; @@ -212,21 +249,17 @@ 
mu_msg_iter_destroy (MuMsgIter *iter) try { delete iter; } MU_XAPIAN_CATCH_BLOCK; } + + void -mu_msg_iter_set_skip_duplicates (MuMsgIter *iter, gboolean skip_duplicates, - GHashTable *preferred_set) +mu_msg_iter_set_preferred (MuMsgIter *iter, GHashTable *preferred_hash) { g_return_if_fail (iter); - g_return_if_fail (!skip_duplicates && preferred_set); - - - - + iter->set_preferred_map (preferred_hash); } - MuMsg* mu_msg_iter_get_msg_floating (MuMsgIter *iter) { @@ -306,7 +339,7 @@ mu_msg_iter_is_done (MuMsgIter *iter) /* hmmm.... is it impossible to get a 0 docid, or just very improbable? */ -unsigned int +unsigned mu_msg_iter_get_docid (MuMsgIter *iter) { g_return_val_if_fail (iter, (unsigned int)-1); @@ -368,8 +401,6 @@ mu_msg_iter_get_thread_id (MuMsgIter *iter) } - - const MuMsgIterThreadInfo* mu_msg_iter_get_thread_info (MuMsgIter *iter) { @@ -385,7 +416,7 @@ mu_msg_iter_get_thread_info (MuMsgIter *iter) (iter->thread_hash(), GUINT_TO_POINTER(docid)); if (!ti) - g_printerr ("no ti for %u\n", docid); + g_warning ("no ti for %u\n", docid); return ti; diff --git a/lib/mu-msg-iter.h b/lib/mu-msg-iter.h index ffd392fd..68abab6a 100644 --- a/lib/mu-msg-iter.h +++ b/lib/mu-msg-iter.h @@ -51,8 +51,6 @@ enum _MuMsgIterFlags { }; typedef unsigned MuMsgIterFlags; - - /** * create a new MuMsgIter -- basically, an iterator over the search * results @@ -128,6 +126,18 @@ MuMsg* mu_msg_iter_get_msg_floating (MuMsgIter *iter) +/** + * Provide a preferred_hash, which is a hashtable msgid->docid to + * indicate the messages which should /not/ be seen as duplicates; the entries are copied, so the caller keeps ownership of preferred_hash. 
+ * + * @param iter a valid MuMsgIter iterator + * @param preferred_hash a hashtable msgid->docid of messages /not/ to + * mark as duplicates, or NULL to clear the current preferred map + */ +void mu_msg_iter_set_preferred (MuMsgIter *iter, GHashTable *preferred_hash); + + + /** * get the document id for the current message * diff --git a/lib/mu-query.cc b/lib/mu-query.cc index 5466d2ca..59f67111 100644 --- a/lib/mu-query.cc +++ b/lib/mu-query.cc @@ -397,20 +397,34 @@ get_enquire (MuQuery *self, const char *searchexpr, MuMsgFieldId sortfieldid, } /* - * record all threadids for the messages + * record all threadids for the messages; also 'orig_set' receives all + * original matches (a map msgid-->docid), so we can make sure the + * originals are not seen as 'duplicates' later (when skipping + * duplicates). We want to favor the originals over the related + * messages, when skipping duplicates. */ static GHashTable* -get_thread_ids (MuMsgIter *iter) +get_thread_ids (MuMsgIter *iter, GHashTable **orig_set) { GHashTable *ids; - ids = g_hash_table_new_full (g_str_hash, g_str_equal, - (GDestroyNotify)g_free, NULL); + ids = g_hash_table_new_full (g_str_hash, g_str_equal, + (GDestroyNotify)g_free, NULL); + *orig_set = g_hash_table_new_full (g_str_hash, g_str_equal, + (GDestroyNotify)g_free, NULL); while (!mu_msg_iter_is_done (iter)) { - const char *thread_id; + const char *thread_id, *msgid; + unsigned docid; + /* record the thread id for the message */ if ((thread_id = mu_msg_iter_get_thread_id (iter))) g_hash_table_insert (ids, g_strdup (thread_id), GSIZE_TO_POINTER(TRUE)); + /* record the original match (msgid->docid), skipping docid 0 -- NOTE(review): mu_msg_iter_get_docid's guard returns (unsigned)-1, not 0; confirm which value means 'invalid' */ + docid = mu_msg_iter_get_docid(iter); + if (docid != 0 && (msgid = mu_msg_iter_get_msgid (iter))) + g_hash_table_insert (*orig_set, g_strdup (msgid), + GSIZE_TO_POINTER(docid)); + if (!mu_msg_iter_next (iter)) break; } @@ -420,7 +434,7 @@ get_thread_ids (MuMsgIter *iter) static Xapian::Query -get_related_query (MuMsgIter *iter) +get_related_query (MuMsgIter *iter, GHashTable **orig_set) { GHashTable 
*hash; GList *id_list, *cur; @@ -428,7 +442,9 @@ get_related_query (MuMsgIter *iter) static std::string pfx (1, mu_msg_field_xapian_prefix (MU_MSG_FIELD_ID_THREAD_ID)); - hash = get_thread_ids (iter); + /* orig_set receives the hash msgid->docid of the set of + * original matches */ + hash = get_thread_ids (iter, orig_set); /* id_list now gets a list of all thread-ids seen in the query * results; either in the Message-Id field or in * References. */ @@ -451,10 +467,12 @@ static void include_related (MuQuery *self, MuMsgIter **iter, int maxnum, MuMsgFieldId sortfieldid, MuQueryFlags flags) { + GHashTable *orig_set; Xapian::Enquire enq (self->db()); MuMsgIter *rel_iter; - enq.set_query(get_related_query (*iter)); + orig_set = NULL; + enq.set_query(get_related_query (*iter, &orig_set)); enq.set_cutoff(0,0); rel_iter= mu_msg_iter_new ( @@ -465,6 +483,12 @@ include_related (MuQuery *self, MuMsgIter **iter, int maxnum, NULL); mu_msg_iter_destroy (*iter); + + // set the preferred set for the iterator (i.e., the set not + // considered to be duplicates) to be the original matches; the iterator copies the entries, so orig_set can be destroyed right after + mu_msg_iter_set_preferred (rel_iter, orig_set); + g_hash_table_destroy (orig_set); + *iter = rel_iter; }