mirror of https://github.com/djcb/mu.git
mu-threads: improve sorting, duplicate handling
And add some more tests.
This commit is contained in:
parent
1b3fd722ef
commit
e5a52e45fd
|
@ -109,10 +109,18 @@ struct QueryMatch {
|
|||
return date_key < rhs.date_key;
|
||||
}
|
||||
|
||||
bool has_flag (Flags flag) const;
|
||||
};
|
||||
|
||||
MU_ENABLE_BITOPS(QueryMatch::Flags);
|
||||
|
||||
inline bool
|
||||
QueryMatch::has_flag(QueryMatch::Flags flag) const
|
||||
{
|
||||
return any_of(flags & flag);
|
||||
}
|
||||
|
||||
|
||||
inline std::ostream&
|
||||
operator<<(std::ostream& os, QueryMatch::Flags mflags)
|
||||
{
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "mu-query-threads.hh"
|
||||
|
||||
#include <set>
|
||||
#include <unordered_set>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
@ -30,12 +31,11 @@
|
|||
using namespace Mu;
|
||||
|
||||
struct Container {
|
||||
using children_type = std::set<Container*, bool(*)(const Container*, const Container*)>;
|
||||
|
||||
Container(): children{&compare} {}
|
||||
Container(Option<QueryMatch&> msg): query_match{msg}, children{&compare} {}
|
||||
using children_type = std::unordered_set<Container*>;
|
||||
Container() = default;
|
||||
Container(Option<QueryMatch&> msg): query_match{msg} {}
|
||||
Container(const Container&) = delete;
|
||||
Container(Container&&) = delete;
|
||||
Container(Container&&) = default;
|
||||
|
||||
void set_parent (Container* new_parent) {
|
||||
assert(this != new_parent);
|
||||
|
@ -56,7 +56,6 @@ struct Container {
|
|||
new_child.parent = this;
|
||||
children.emplace(&new_child);
|
||||
}
|
||||
|
||||
void promote_children () {
|
||||
for_each_child([&](auto&& child){
|
||||
child->parent = {};
|
||||
|
@ -125,16 +124,6 @@ private:
|
|||
assert(this->parent != this);
|
||||
return parent ? parent->ur_parent() : this;
|
||||
}
|
||||
|
||||
static bool compare(const Container *c1, const Container *c2) {
|
||||
if (c1->query_match && c2->query_match) {
|
||||
const auto cmp{std::strcmp(c1->query_match->date_key.c_str(),
|
||||
c2->query_match->date_key.c_str())};
|
||||
if (cmp != 0)
|
||||
return cmp < 0;
|
||||
}
|
||||
return c1 < c2;
|
||||
}
|
||||
};
|
||||
|
||||
static std::ostream&
|
||||
|
@ -158,6 +147,27 @@ operator<<(std::ostream& os, const Container& container)
|
|||
|
||||
|
||||
using IdTable = std::unordered_map<std::string, Container>;
|
||||
using DupTable = std::multimap<std::string, Container>;
|
||||
//template <typename QueryResultsType> using DupsVec = std::vector<decltype(QueryResultsType::value_type)>;
|
||||
|
||||
static void
|
||||
handle_duplicates (IdTable& id_table, DupTable& dup_table)
|
||||
{
|
||||
size_t n{};
|
||||
|
||||
for (auto&& dup: dup_table) {
|
||||
const auto msgid{dup.first};
|
||||
auto it = id_table.find(msgid);
|
||||
if (it == id_table.end())
|
||||
continue;
|
||||
|
||||
// add duplicates as fake children
|
||||
char buf[32];
|
||||
::snprintf(buf, sizeof(buf), "bastard-%zu", ++n);
|
||||
it->second.add_child(
|
||||
id_table.emplace(buf, std::move(dup.second)).first->second);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename QueryResultsType>
|
||||
static IdTable
|
||||
|
@ -165,35 +175,33 @@ determine_id_table (QueryResultsType& qres, MuMsgFieldId sortfield_id)
|
|||
{
|
||||
// 1. For each query_match
|
||||
IdTable id_table;
|
||||
DupTable dups;
|
||||
for (auto&& mi: qres) {
|
||||
const auto msgid{mi.message_id().value_or(*mi.path())};
|
||||
// Step 0 (non-JWZ): filter out dups, handle those at the end
|
||||
if (mi.query_match().has_flag(QueryMatch::Flags::Duplicate)) {
|
||||
dups.emplace(msgid, mi.query_match());
|
||||
continue;
|
||||
}
|
||||
// 1.A If id_table contains an empty Container for this ID:
|
||||
// Store this query_match (query_match) in the Container's query_match (value) slot.
|
||||
// Else:
|
||||
// Create a new Container object holding this query_match (query-match);
|
||||
// Index the Container by Query_Match-ID
|
||||
auto c_it = id_table.find(msgid);
|
||||
if (c_it != id_table.end()) {
|
||||
if (!c_it->second.query_match) {
|
||||
auto& container = [&]()->Container& {
|
||||
if (c_it != id_table.end()) {
|
||||
assert(!c_it->second.query_match);
|
||||
c_it->second.query_match = mi.query_match();
|
||||
c_it->second.query_match->thread_path = "x";
|
||||
return c_it->second;
|
||||
} else {
|
||||
/* special case, not in the JWZ algorithm: the container
|
||||
* exists already and has a query_match (query-match); this
|
||||
* means that we are seeing *another query_match* with a
|
||||
* query_match-id we already saw... create this query_match, and
|
||||
* mark it as a duplicate; use its path as the fake
|
||||
* query_match-id */
|
||||
c_it = id_table.emplace(*mi.path(), mi.query_match()).first;
|
||||
c_it->second.query_match->flags |= QueryMatch::Flags::Duplicate;
|
||||
c_it->second.query_match->thread_path = "c";
|
||||
|
||||
// Else:
|
||||
// Create a new Container object holding this query_match (query-match);
|
||||
// Index the Container by Query_Match-ID
|
||||
return id_table.emplace(msgid, mi.query_match()).first->second;
|
||||
}
|
||||
} else { // Else:
|
||||
// Create a new Container object holding this query_match (query-match);
|
||||
// Index the Container by Query_Match-ID
|
||||
c_it = id_table.emplace(msgid, mi.query_match()).first;
|
||||
c_it->second.query_match->thread_path = "y";
|
||||
}
|
||||
}();
|
||||
|
||||
Container& container{c_it->second};
|
||||
// We sort by date (ascending), *except* for the root; we don't
|
||||
// know what query_matchs will be at the root level yet, so remember
|
||||
// both. Moreover, even when sorting the top-level in descending
|
||||
|
@ -241,9 +249,13 @@ determine_id_table (QueryResultsType& qres, MuMsgFieldId sortfield_id)
|
|||
}
|
||||
}
|
||||
|
||||
// non-JWZ: add duplicate messages.
|
||||
handle_duplicates (id_table, dups);
|
||||
|
||||
return id_table;
|
||||
}
|
||||
|
||||
|
||||
/// Recursively walk all containers under the root set.
|
||||
/// For each container:
|
||||
///
|
||||
|
@ -383,7 +395,6 @@ update_container_query_match (Container& container, ThreadPathVec& pvec,
|
|||
{
|
||||
if (container.is_empty())
|
||||
return false; // nothing to update.
|
||||
|
||||
auto& qmatch{*container.query_match};
|
||||
|
||||
if (!container.parent)
|
||||
|
@ -417,19 +428,28 @@ sort_siblings (Container::children_type& siblings,
|
|||
{
|
||||
if (siblings.empty())
|
||||
return;
|
||||
else {
|
||||
const auto first{*siblings.begin()};
|
||||
if (first->query_match)
|
||||
first->query_match->flags |= QueryMatch::Flags::First;
|
||||
const auto last{*(--siblings.end())};
|
||||
if (last->query_match)
|
||||
last->query_match->flags |= QueryMatch::Flags::Last;
|
||||
}
|
||||
|
||||
// sort by date.
|
||||
auto compare =[](const Container *c1, const Container *c2) {
|
||||
const auto cmp{std::strcmp(c1->query_match->date_key.c_str(),
|
||||
c2->query_match->date_key.c_str())};
|
||||
// sort by date, or, if the same, by addresss
|
||||
return cmp ? cmp < 0 : c1 < c2;
|
||||
};
|
||||
std::set<Container*, decltype(compare)> sorted_siblings{compare};
|
||||
for (auto&& container: siblings)
|
||||
sorted_siblings.emplace(std::move(container));
|
||||
|
||||
const auto first{*sorted_siblings.begin()};
|
||||
if (first->query_match)
|
||||
first->query_match->flags |= QueryMatch::Flags::First;
|
||||
const auto last{*(--sorted_siblings.end())};
|
||||
if (last->query_match)
|
||||
last->query_match->flags |= QueryMatch::Flags::Last;
|
||||
|
||||
size_t idx{0};
|
||||
ThreadPathVec thread_path_vec{parent_path_vec};
|
||||
|
||||
for (auto&& c: siblings) {
|
||||
for (auto&& c: sorted_siblings) {
|
||||
thread_path_vec.emplace_back(idx++);
|
||||
update_container_query_match (*c, thread_path_vec, segment_size, descending);
|
||||
if (!c->children.empty())
|
||||
|
@ -448,8 +468,6 @@ sort_siblings (IdTable& id_table, bool descending)
|
|||
|
||||
auto root_vec{determine_root_vec(id_table, descending)}; // sorted
|
||||
|
||||
//std::cerr << "rvs" << root_vec.size() << "\n";
|
||||
|
||||
const auto seg_size = static_cast<size_t>(
|
||||
std::ceil(std::log2(id_table.size())/4.0));
|
||||
/*note: 4 == std::log2(16)*/
|
||||
|
@ -533,11 +551,11 @@ struct MockQueryResult {
|
|||
MockQueryResult(const std::string& message_id_arg,
|
||||
const std::vector<std::string>& refs_arg={}):
|
||||
MockQueryResult(message_id_arg, "", "", refs_arg) {}
|
||||
Option<std::string> message_id() const { return message_id_;}
|
||||
Option<std::string> path() const { return path_;}
|
||||
QueryMatch& query_match() { return query_match_;}
|
||||
Option<std::string> message_id() const { return message_id_;}
|
||||
Option<std::string> path() const { return path_;}
|
||||
QueryMatch& query_match() { return query_match_;}
|
||||
const QueryMatch& query_match() const { return query_match_;}
|
||||
const std::vector<std::string>& references() const { return refs_;}
|
||||
const std::vector<std::string>& references() const { return refs_;}
|
||||
|
||||
Option<std::string> opt_string(MuMsgFieldId id) const {
|
||||
if (id == MU_MSG_FIELD_ID_DATE)
|
||||
|
@ -577,11 +595,12 @@ using Expected = std::vector<std::pair<std::string, std::string>>;
|
|||
|
||||
|
||||
static void
|
||||
assert_thread_paths (MockQueryResults& qrs, const Expected& expected)
|
||||
assert_thread_paths (const MockQueryResults& qrs, const Expected& expected)
|
||||
{
|
||||
for (auto&& exp: expected) {
|
||||
auto it = std::find_if(qrs.begin(), qrs.end(), [&](auto&& qr){
|
||||
return qr.message_id().value_or("") == exp.first;
|
||||
return qr.message_id().value_or("") == exp.first ||
|
||||
qr.path().value_or("") == exp.first;
|
||||
});
|
||||
g_assert_true (it != qrs.end());
|
||||
g_assert_cmpstr(exp.second.c_str(), ==, it->query_match().thread_path.c_str());
|
||||
|
@ -589,7 +608,7 @@ assert_thread_paths (MockQueryResults& qrs, const Expected& expected)
|
|||
}
|
||||
|
||||
static void
|
||||
test_basic()
|
||||
test_sort_ascending()
|
||||
{
|
||||
auto results = MockQueryResults {
|
||||
MockQueryResult{ "m1", "a", "1", {"m2"} },
|
||||
|
@ -606,6 +625,18 @@ test_basic()
|
|||
{ "m3", "0" },
|
||||
{ "m4", "1" }
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
test_sort_descending()
|
||||
{
|
||||
auto results = MockQueryResults {
|
||||
MockQueryResult{ "m1", "a", "1", {"m2"} },
|
||||
MockQueryResult{ "m2", "b", "2", {"m3"} },
|
||||
MockQueryResult{ "m3", "c", "3", {}},
|
||||
MockQueryResult{ "m4", "d", "4", {}}
|
||||
};
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, true);
|
||||
|
||||
|
@ -617,81 +648,6 @@ test_basic()
|
|||
});
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
test_prune_empty_containers()
|
||||
{
|
||||
{
|
||||
// m7 should not be nuked
|
||||
auto results = MockQueryResults {
|
||||
MockQueryResult{ "x1", "a", "1", {"m7"} },
|
||||
MockQueryResult{ "x2", "b", "2", {"m7"} },
|
||||
};
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false);
|
||||
|
||||
assert_thread_paths (results, {
|
||||
{ "x1", "0:0"},
|
||||
{ "x2", "0:1" },
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
// m7 should be nuked
|
||||
|
||||
auto results = MockQueryResults {
|
||||
MockQueryResult{ "m1", "a", "1", {"m7"} },
|
||||
};
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false);
|
||||
|
||||
assert_thread_paths (results, {
|
||||
{ "m1", "0"},
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
// m6 should be nuked
|
||||
|
||||
auto results = MockQueryResults {
|
||||
MockQueryResult{ "m1", "a", "1", {"m7", "m6"} },
|
||||
MockQueryResult{ "m2", "b", "2", {"m7", "m6"} },
|
||||
};
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false);
|
||||
|
||||
assert_thread_paths (results, {
|
||||
{ "m1", "0:0"},
|
||||
{ "m2", "0:1" },
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
// m6 should be nuked
|
||||
|
||||
auto results = MockQueryResults {
|
||||
MockQueryResult{ "m1",
|
||||
"a", "1",
|
||||
{"m28uszf59m.fsf@damtp.cam.ac.uk",
|
||||
"CAP8THHWFDR9fJynKJHiRLayBo8wNiOCK6ghbgOK6rHboQKjDqA@mail.gmail.com",
|
||||
"m2lhwxevpt.fsf@damtp.cam.ac.uk"} },
|
||||
MockQueryResult{ "m2",
|
||||
"b", "2",
|
||||
{"m28uszf59m.fsf@damtp.cam.ac.uk",
|
||||
"CAP8THHWFDR9fJynKJHiRLayBo8wNiOCK6ghbgOK6rHboQKjDqA@mail.gmail.com",
|
||||
"m2lhwxevpt.fsf@damtp.cam.ac.uk"} },
|
||||
};
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_DATE, false);
|
||||
|
||||
assert_thread_paths (results, {
|
||||
{ "m1", "0:0"},
|
||||
{ "m2", "0:1" },
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
test_id_table_inconsistent()
|
||||
{
|
||||
|
@ -704,7 +660,6 @@ test_id_table_inconsistent()
|
|||
};
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_DATE, false);
|
||||
|
||||
assert_thread_paths (results, {
|
||||
{ "m2", "0"},
|
||||
{ "m1", "0:0" },
|
||||
|
@ -714,14 +669,122 @@ test_id_table_inconsistent()
|
|||
});
|
||||
}
|
||||
|
||||
static void
|
||||
test_dups_dup_last()
|
||||
{
|
||||
MockQueryResult r1 { "m1", "a", "1", {} };
|
||||
r1.query_match().flags |= QueryMatch::Flags::Leader;
|
||||
r1.path_ = "/path1";
|
||||
|
||||
MockQueryResult r1_dup { "m1", "a", "1", {} };
|
||||
r1_dup.query_match().flags |= QueryMatch::Flags::Duplicate;
|
||||
r1_dup.path_ = "/path2";
|
||||
|
||||
auto results = MockQueryResults {r1, r1_dup };
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_DATE, false);
|
||||
|
||||
assert_thread_paths (results, {
|
||||
{ "/path1", "0"},
|
||||
{ "/path2", "0:0" },
|
||||
});
|
||||
}
|
||||
|
||||
static void
|
||||
test_dups_dup_first()
|
||||
{
|
||||
// now dup becomes the leader; this will _demote_
|
||||
// r1.
|
||||
|
||||
MockQueryResult r1_dup { "m1", "a", "1", {} };
|
||||
r1_dup.query_match().flags |= QueryMatch::Flags::Duplicate;
|
||||
r1_dup.path_ = "/path1";
|
||||
|
||||
MockQueryResult r1 { "m1", "a", "1", {} };
|
||||
r1.query_match().flags |= QueryMatch::Flags::Leader;
|
||||
r1.path_ = "/path2";
|
||||
|
||||
auto results = MockQueryResults { r1_dup, r1 };
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_DATE, false);
|
||||
|
||||
assert_thread_paths (results, {
|
||||
{ "/path2", "0"},
|
||||
{ "/path1", "0:0" },
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
test_do_not_prune_root_empty_with_children()
|
||||
{
|
||||
// m7 should not be nuked
|
||||
auto results = MockQueryResults {
|
||||
MockQueryResult{ "x1", "a", "1", {"m7"} },
|
||||
MockQueryResult{ "x2", "b", "2", {"m7"} },
|
||||
};
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false);
|
||||
|
||||
assert_thread_paths (results, {
|
||||
{ "x1", "0:0"},
|
||||
{ "x2", "0:1" },
|
||||
});
|
||||
}
|
||||
|
||||
static void
|
||||
test_prune_root_empty_with_child()
|
||||
{
|
||||
// m7 should be nuked
|
||||
auto results = MockQueryResults {
|
||||
MockQueryResult{ "m1", "a", "1", {"m7"} },
|
||||
};
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false);
|
||||
|
||||
assert_thread_paths (results, {
|
||||
{ "m1", "0"},
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
test_prune_empty_with_children()
|
||||
{
|
||||
// m6 should be nuked
|
||||
auto results = MockQueryResults {
|
||||
MockQueryResult{ "m1", "a", "1", {"m7", "m6"} },
|
||||
MockQueryResult{ "m2", "b", "2", {"m7", "m6"} },
|
||||
};
|
||||
|
||||
calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false);
|
||||
|
||||
assert_thread_paths (results, {
|
||||
{ "m1", "0:0"},
|
||||
{ "m2", "0:1" },
|
||||
});
|
||||
}
|
||||
|
||||
int
|
||||
main (int argc, char *argv[]) try
|
||||
{
|
||||
g_test_init (&argc, &argv, NULL);
|
||||
|
||||
g_test_add_func ("/threader/basic", test_basic);
|
||||
g_test_add_func ("/threader/prune-empty-containers", test_prune_empty_containers);
|
||||
g_test_add_func ("/threader/sort/ascending", test_sort_ascending);
|
||||
g_test_add_func ("/threader/sort/decending", test_sort_descending);
|
||||
|
||||
g_test_add_func ("/threader/id-table-inconsistent", test_id_table_inconsistent);
|
||||
g_test_add_func ("/threader/dups/dup-last", test_dups_dup_last);
|
||||
g_test_add_func ("/threader/dups/dup-first", test_dups_dup_first);
|
||||
|
||||
g_test_add_func ("/threader/prune/prune-root-empty-with-children",
|
||||
test_prune_empty_with_children);
|
||||
g_test_add_func ("/threader/prune/do-not-prune-root-empty-with-children",
|
||||
test_do_not_prune_root_empty_with_children);
|
||||
g_test_add_func ("/threader/prune/prune-root-empty-with-child",
|
||||
test_prune_root_empty_with_child);
|
||||
g_test_add_func ("/threader/prune/prune-empty-with-child",
|
||||
test_prune_empty_with_children);
|
||||
|
||||
return g_test_run ();
|
||||
|
||||
|
|
Loading…
Reference in New Issue