From e5a52e45fd2e2f587be2a71dafaca29e44d981e6 Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Wed, 27 Jan 2021 22:28:58 +0200 Subject: [PATCH] mu-threads: improve sorting, duplicate handling And add some more tests. --- lib/mu-query-results.hh | 8 + lib/mu-query-threads.cc | 331 ++++++++++++++++++++++++---------------- 2 files changed, 205 insertions(+), 134 deletions(-) diff --git a/lib/mu-query-results.hh b/lib/mu-query-results.hh index 9d89a8c8..d73da1dd 100644 --- a/lib/mu-query-results.hh +++ b/lib/mu-query-results.hh @@ -109,10 +109,18 @@ struct QueryMatch { return date_key < rhs.date_key; } + bool has_flag (Flags flag) const; }; MU_ENABLE_BITOPS(QueryMatch::Flags); +inline bool +QueryMatch::has_flag(QueryMatch::Flags flag) const +{ + return any_of(flags & flag); +} + + inline std::ostream& operator<<(std::ostream& os, QueryMatch::Flags mflags) { diff --git a/lib/mu-query-threads.cc b/lib/mu-query-threads.cc index 654db868..4478dad0 100644 --- a/lib/mu-query-threads.cc +++ b/lib/mu-query-threads.cc @@ -20,6 +20,7 @@ #include "mu-query-threads.hh" #include +#include #include #include #include @@ -30,12 +31,11 @@ using namespace Mu; struct Container { - using children_type = std::set; - - Container(): children{&compare} {} - Container(Option msg): query_match{msg}, children{&compare} {} + using children_type = std::unordered_set; + Container() = default; + Container(Option msg): query_match{msg} {} Container(const Container&) = delete; - Container(Container&&) = delete; + Container(Container&&) = default; void set_parent (Container* new_parent) { assert(this != new_parent); @@ -56,7 +56,6 @@ struct Container { new_child.parent = this; children.emplace(&new_child); } - void promote_children () { for_each_child([&](auto&& child){ child->parent = {}; @@ -125,16 +124,6 @@ private: assert(this->parent != this); return parent ? parent->ur_parent() : this; } - - static bool compare(const Container *c1, const Container *c2) { - if (c1->query_match && c2->query_match) { - const auto cmp{std::strcmp(c1->query_match->date_key.c_str(), - c2->query_match->date_key.c_str())}; - if (cmp != 0) - return cmp < 0; - } - return c1 < c2; - } }; static std::ostream& @@ -158,6 +147,27 @@ operator<<(std::ostream& os, const Container& container) using IdTable = std::unordered_map; +using DupTable = std::multimap; +//template using DupsVec = std::vector; + +static void +handle_duplicates (IdTable& id_table, DupTable& dup_table) +{ + size_t n{}; + + for (auto&& dup: dup_table) { + const auto msgid{dup.first}; + auto it = id_table.find(msgid); + if (it == id_table.end()) + continue; + + // add duplicates as fake children + char buf[32]; + ::snprintf(buf, sizeof(buf), "bastard-%zu", ++n); + it->second.add_child( + id_table.emplace(buf, std::move(dup.second)).first->second); + } +} template static IdTable @@ -165,35 +175,33 @@ determine_id_table (QueryResultsType& qres, MuMsgFieldId sortfield_id) { // 1. For each query_match IdTable id_table; + DupTable dups; for (auto&& mi: qres) { const auto msgid{mi.message_id().value_or(*mi.path())}; + // Step 0 (non-JWZ): filter out dups, handle those at the end + if (mi.query_match().has_flag(QueryMatch::Flags::Duplicate)) { + dups.emplace(msgid, mi.query_match()); + continue; + } // 1.A If id_table contains an empty Container for this ID: // Store this query_match (query_match) in the Container's query_match (value) slot. + // Else: + // Create a new Container object holding this query_match (query-match); + // Index the Container by Query_Match-ID auto c_it = id_table.find(msgid); - if (c_it != id_table.end()) { - if (!c_it->second.query_match) { + auto& container = [&]()->Container& { + if (c_it != id_table.end()) { + assert(!c_it->second.query_match); c_it->second.query_match = mi.query_match(); - c_it->second.query_match->thread_path = "x"; + return c_it->second; } else { - /* special case, not in the JWZ algorithm: the container - * exists already and has a query_match (query-match); this - * means that we are seeing *another query_match* with a - * query_match-id we already saw... create this query_match, and - * mark it as a duplicate; use its path as the fake - * query_match-id */ - c_it = id_table.emplace(*mi.path(), mi.query_match()).first; - c_it->second.query_match->flags |= QueryMatch::Flags::Duplicate; - c_it->second.query_match->thread_path = "c"; - + // Else: + // Create a new Container object holding this query_match (query-match); + // Index the Container by Query_Match-ID + return id_table.emplace(msgid, mi.query_match()).first->second; } - } else { // Else: - // Create a new Container object holding this query_match (query-match); - // Index the Container by Query_Match-ID - c_it = id_table.emplace(msgid, mi.query_match()).first; - c_it->second.query_match->thread_path = "y"; - } + }(); - Container& container{c_it->second}; // We sort by date (ascending), *except* for the root; we don't // know what query_matchs will be at the root level yet, so remember // both. Moreover, even when sorting the top-level in descending @@ -241,9 +249,13 @@ determine_id_table (QueryResultsType& qres, MuMsgFieldId sortfield_id) } } + // non-JWZ: add duplicate messages. + handle_duplicates (id_table, dups); + return id_table; } + /// Recursively walk all containers under the root set. /// For each container: /// @@ -383,7 +395,6 @@ update_container_query_match (Container& container, ThreadPathVec& pvec, { if (container.is_empty()) return false; // nothing to update. - auto& qmatch{*container.query_match}; if (!container.parent) @@ -417,19 +428,28 @@ sort_siblings (Container::children_type& siblings, { if (siblings.empty()) return; - else { - const auto first{*siblings.begin()}; - if (first->query_match) - first->query_match->flags |= QueryMatch::Flags::First; - const auto last{*(--siblings.end())}; - if (last->query_match) - last->query_match->flags |= QueryMatch::Flags::Last; - } + + // sort by date. + auto compare =[](const Container *c1, const Container *c2) { + const auto cmp{std::strcmp(c1->query_match->date_key.c_str(), + c2->query_match->date_key.c_str())}; + // sort by date, or, if the same, by addresss + return cmp ? cmp < 0 : c1 < c2; + }; + std::set sorted_siblings{compare}; + for (auto&& container: siblings) + sorted_siblings.emplace(std::move(container)); + + const auto first{*sorted_siblings.begin()}; + if (first->query_match) + first->query_match->flags |= QueryMatch::Flags::First; + const auto last{*(--sorted_siblings.end())}; + if (last->query_match) + last->query_match->flags |= QueryMatch::Flags::Last; size_t idx{0}; ThreadPathVec thread_path_vec{parent_path_vec}; - - for (auto&& c: siblings) { + for (auto&& c: sorted_siblings) { thread_path_vec.emplace_back(idx++); update_container_query_match (*c, thread_path_vec, segment_size, descending); if (!c->children.empty()) @@ -448,8 +468,6 @@ sort_siblings (IdTable& id_table, bool descending) auto root_vec{determine_root_vec(id_table, descending)}; // sorted - //std::cerr << "rvs" << root_vec.size() << "\n"; - const auto seg_size = static_cast( std::ceil(std::log2(id_table.size())/4.0)); /*note: 4 == std::log2(16)*/ @@ -533,11 +551,11 @@ struct MockQueryResult { MockQueryResult(const std::string& message_id_arg, const std::vector& refs_arg={}): MockQueryResult(message_id_arg, "", "", refs_arg) {} - Option message_id() const { return message_id_;} - Option path() const { return path_;} - QueryMatch& query_match() { return query_match_;} + Option message_id() const { return message_id_;} + Option path() const { return path_;} + QueryMatch& query_match() { return query_match_;} const QueryMatch& query_match() const { return query_match_;} - const std::vector& references() const { return refs_;} + const std::vector& references() const { return refs_;} Option opt_string(MuMsgFieldId id) const { if (id == MU_MSG_FIELD_ID_DATE) @@ -577,11 +595,12 @@ using Expected = std::vector>; static void -assert_thread_paths (MockQueryResults& qrs, const Expected& expected) +assert_thread_paths (const MockQueryResults& qrs, const Expected& expected) { for (auto&& exp: expected) { auto it = std::find_if(qrs.begin(), qrs.end(), [&](auto&& qr){ - return qr.message_id().value_or("") == exp.first; + return qr.message_id().value_or("") == exp.first || + qr.path().value_or("") == exp.first; }); g_assert_true (it != qrs.end()); g_assert_cmpstr(exp.second.c_str(), ==, it->query_match().thread_path.c_str()); @@ -589,7 +608,7 @@ assert_thread_paths (MockQueryResults& qrs, const Expected& expected) } static void -test_basic() +test_sort_ascending() { auto results = MockQueryResults { MockQueryResult{ "m1", "a", "1", {"m2"} }, @@ -606,6 +625,18 @@ test_basic() { "m3", "0" }, { "m4", "1" } }); +} + + +static void +test_sort_descending() +{ + auto results = MockQueryResults { + MockQueryResult{ "m1", "a", "1", {"m2"} }, + MockQueryResult{ "m2", "b", "2", {"m3"} }, + MockQueryResult{ "m3", "c", "3", {}}, + MockQueryResult{ "m4", "d", "4", {}} + }; calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, true); @@ -617,81 +648,6 @@ test_basic() }); } - -static void -test_prune_empty_containers() -{ - { - // m7 should not be nuked - auto results = MockQueryResults { - MockQueryResult{ "x1", "a", "1", {"m7"} }, - MockQueryResult{ "x2", "b", "2", {"m7"} }, - }; - - calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false); - - assert_thread_paths (results, { - { "x1", "0:0"}, - { "x2", "0:1" }, - }); - } - - { - // m7 should be nuked - - auto results = MockQueryResults { - MockQueryResult{ "m1", "a", "1", {"m7"} }, - }; - - calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false); - - assert_thread_paths (results, { - { "m1", "0"}, - }); - } - - { - // m6 should be nuked - - auto results = MockQueryResults { - MockQueryResult{ "m1", "a", "1", {"m7", "m6"} }, - MockQueryResult{ "m2", "b", "2", {"m7", "m6"} }, - }; - - calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false); - - assert_thread_paths (results, { - { "m1", "0:0"}, - { "m2", "0:1" }, - }); - } - - - { - // m6 should be nuked - - auto results = MockQueryResults { - MockQueryResult{ "m1", - "a", "1", - {"m28uszf59m.fsf@damtp.cam.ac.uk", - "CAP8THHWFDR9fJynKJHiRLayBo8wNiOCK6ghbgOK6rHboQKjDqA@mail.gmail.com", - "m2lhwxevpt.fsf@damtp.cam.ac.uk"} }, - MockQueryResult{ "m2", - "b", "2", - {"m28uszf59m.fsf@damtp.cam.ac.uk", - "CAP8THHWFDR9fJynKJHiRLayBo8wNiOCK6ghbgOK6rHboQKjDqA@mail.gmail.com", - "m2lhwxevpt.fsf@damtp.cam.ac.uk"} }, - }; - - calculate_threads(results, MU_MSG_FIELD_ID_DATE, false); - - assert_thread_paths (results, { - { "m1", "0:0"}, - { "m2", "0:1" }, - }); - } -} - static void test_id_table_inconsistent() { @@ -704,7 +660,6 @@ test_id_table_inconsistent() }; calculate_threads(results, MU_MSG_FIELD_ID_DATE, false); - assert_thread_paths (results, { { "m2", "0"}, { "m1", "0:0" }, @@ -714,14 +669,122 @@ test_id_table_inconsistent() }); } +static void +test_dups_dup_last() +{ + MockQueryResult r1 { "m1", "a", "1", {} }; + r1.query_match().flags |= QueryMatch::Flags::Leader; + r1.path_ = "/path1"; + + MockQueryResult r1_dup { "m1", "a", "1", {} }; + r1_dup.query_match().flags |= QueryMatch::Flags::Duplicate; + r1_dup.path_ = "/path2"; + + auto results = MockQueryResults {r1, r1_dup }; + + calculate_threads(results, MU_MSG_FIELD_ID_DATE, false); + + assert_thread_paths (results, { + { "/path1", "0"}, + { "/path2", "0:0" }, + }); +} + +static void +test_dups_dup_first() +{ + // now dup becomes the leader; this will _demote_ + // r1. + + MockQueryResult r1_dup { "m1", "a", "1", {} }; + r1_dup.query_match().flags |= QueryMatch::Flags::Duplicate; + r1_dup.path_ = "/path1"; + + MockQueryResult r1 { "m1", "a", "1", {} }; + r1.query_match().flags |= QueryMatch::Flags::Leader; + r1.path_ = "/path2"; + + auto results = MockQueryResults { r1_dup, r1 }; + + calculate_threads(results, MU_MSG_FIELD_ID_DATE, false); + + assert_thread_paths (results, { + { "/path2", "0"}, + { "/path1", "0:0" }, + }); +} + + +static void +test_do_not_prune_root_empty_with_children() +{ + // m7 should not be nuked + auto results = MockQueryResults { + MockQueryResult{ "x1", "a", "1", {"m7"} }, + MockQueryResult{ "x2", "b", "2", {"m7"} }, + }; + + calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false); + + assert_thread_paths (results, { + { "x1", "0:0"}, + { "x2", "0:1" }, + }); +} + +static void +test_prune_root_empty_with_child() +{ + // m7 should be nuked + auto results = MockQueryResults { + MockQueryResult{ "m1", "a", "1", {"m7"} }, + }; + + calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false); + + assert_thread_paths (results, { + { "m1", "0"}, + }); +} + + +static void +test_prune_empty_with_children() +{ + // m6 should be nuked + auto results = MockQueryResults { + MockQueryResult{ "m1", "a", "1", {"m7", "m6"} }, + MockQueryResult{ "m2", "b", "2", {"m7", "m6"} }, + }; + + calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false); + + assert_thread_paths (results, { + { "m1", "0:0"}, + { "m2", "0:1" }, + }); +} + int main (int argc, char *argv[]) try { g_test_init (&argc, &argv, NULL); - g_test_add_func ("/threader/basic", test_basic); - g_test_add_func ("/threader/prune-empty-containers", test_prune_empty_containers); + g_test_add_func ("/threader/sort/ascending", test_sort_ascending); + g_test_add_func ("/threader/sort/decending", test_sort_descending); + g_test_add_func ("/threader/id-table-inconsistent", test_id_table_inconsistent); + g_test_add_func ("/threader/dups/dup-last", test_dups_dup_last); + g_test_add_func ("/threader/dups/dup-first", test_dups_dup_first); + + g_test_add_func ("/threader/prune/prune-root-empty-with-children", + test_prune_empty_with_children); + g_test_add_func ("/threader/prune/do-not-prune-root-empty-with-children", + test_do_not_prune_root_empty_with_children); + g_test_add_func ("/threader/prune/prune-root-empty-with-child", + test_prune_root_empty_with_child); + g_test_add_func ("/threader/prune/prune-empty-with-child", + test_prune_empty_with_children); return g_test_run ();