From 95dffb98a637c54129404cb50edd67b456fa4f43 Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Sat, 28 Nov 2020 10:11:07 +0200 Subject: [PATCH] query: Rework querying/threading machinery Rewrite the query machinery in c++: - use an MSet decorator instead of the mu-msg-iter stuff - use mu-query-decider to mark duplicates/unreadable/related messages - use mu-query-threader to replace the older container/thread code Algorithm did not substantially change, but the implementation details did. --- lib/Makefile.am | 8 +- lib/mu-container.cc | 695 ------------------------------- lib/mu-container.hh | 223 ---------- lib/mu-msg-iter.cc | 437 -------------------- lib/mu-msg-iter.h | 246 ----------- lib/mu-query-match-deciders.cc | 231 +++++++++++ lib/mu-query-match-deciders.hh | 85 ++++ lib/mu-query-matches.hh | 206 ++++++++++ lib/mu-query-results.hh | 381 +++++++++++++++++ lib/mu-query-threads.cc | 729 +++++++++++++++++++++++++++++++++ lib/mu-query-threads.hh | 44 ++ lib/mu-query.cc | 429 ++++++++----------- lib/mu-query.hh | 43 +- lib/mu-threader.cc | 455 -------------------- lib/mu-threader.hh | 49 --- lib/test-query.cc | 91 ++++ mu/test-mu-query.cc | 79 ++-- mu/test-mu-threads.cc | 41 +- 18 files changed, 2008 insertions(+), 2464 deletions(-) delete mode 100644 lib/mu-container.cc delete mode 100644 lib/mu-container.hh delete mode 100644 lib/mu-msg-iter.cc delete mode 100644 lib/mu-msg-iter.h create mode 100644 lib/mu-query-match-deciders.cc create mode 100644 lib/mu-query-match-deciders.hh create mode 100644 lib/mu-query-matches.hh create mode 100644 lib/mu-query-results.hh create mode 100644 lib/mu-query-threads.cc create mode 100644 lib/mu-query-threads.hh delete mode 100644 lib/mu-threader.cc delete mode 100644 lib/mu-threader.hh create mode 100644 lib/test-query.cc diff --git a/lib/Makefile.am b/lib/Makefile.am index a9d35c6d..09561fa9 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -170,10 +170,10 @@ TEST_PROGS+=test-mu-tokenizer test_mu_tokenizer_SOURCES=test-tokenizer.cc test_mu_tokenizer_LDADD=libtestmucommon.la -# TEST_PROGS+=test-mu-threader -# test_mu_threader_SOURCES=mu-query-threader.cc -# test_mu_threader_LDADD=libtestmucommon.la -# test_mu_threader_CXXFLAGS=$(AM_CXXFLAGS) -DBUILD_THREADER_TEST +TEST_PROGS+=test-mu-threader +test_mu_threader_SOURCES=mu-query-threader.cc +test_mu_threader_LDADD=libtestmucommon.la +test_mu_threader_CXXFLAGS=$(AM_CXXFLAGS) -DBUILD_THREADER_TEST TEST_PROGS+=test-mu-parser test_mu_parser_SOURCES=test-parser.cc diff --git a/lib/mu-container.cc b/lib/mu-container.cc deleted file mode 100644 index c31c9298..00000000 --- a/lib/mu-container.cc +++ /dev/null @@ -1,695 +0,0 @@ -/* -** Copyright (C) 2011-2020 Dirk-Jan C. Binnema -** -** This program is free software; you can redistribute it and/or modify it -** under the terms of the GNU General Public License as published by the -** Free Software Foundation; either version 3, or (at your option) any -** later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software Foundation, -** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -** -*/ -#include "mu-container.hh" - -#include /* for memset */ -#include /* for log, ceil */ - -#include "mu-msg.h" -#include "mu-msg-iter.h" - - -/* - * path data structure, to determine the thread paths mentioned above; - * the path is filled as we're traversing the tree of MuContainers - * (messages) - */ -struct _Path { - int *_data; - guint _len; -}; -typedef struct _Path Path; - -static Path* path_new (guint initial); -static void path_destroy (Path *p); -static void path_inc (Path *p, guint index); -static gchar* path_to_string (Path *p, const char* frmt); - -MuContainer* -mu_container_new (MuMsg *msg, guint docid, const char *msgid) -{ - MuContainer *c; - - g_return_val_if_fail (!msg || docid != 0, NULL); - - c = g_slice_new0 (MuContainer); - if (msg) - c->msg = mu_msg_ref (msg); - - c->leader = c; - c->docid = docid; - c->msgid = msgid; - - return c; -} - -void -mu_container_destroy (MuContainer *c) -{ - if (!c) - return; - - if (c->msg) - mu_msg_unref (c->msg); - - g_slice_free (MuContainer, c); -} - - -static void -set_parent (MuContainer *c, MuContainer *parent) -{ - while (c) { - c->parent = parent; - c = c->next; - } -} - - -G_GNUC_UNUSED static gboolean -check_dup (MuContainer *c, GHashTable *hash) -{ - if (g_hash_table_lookup (hash, c)) { - g_warning ("ALREADY!!"); - mu_container_dump (c, TRUE); - g_assert (0); - } else - g_hash_table_insert (hash, c, GUINT_TO_POINTER(TRUE)); - - return TRUE; -} - - -G_GNUC_UNUSED static void -assert_no_duplicates (MuContainer *c) -{ - GHashTable *hash; - - hash = g_hash_table_new (g_direct_hash, g_direct_equal); - - mu_container_foreach (c, - (MuContainerForeachFunc)check_dup, - hash); - - g_hash_table_destroy (hash); -} - - -MuContainer* -mu_container_append_siblings (MuContainer *c, MuContainer *sibling) -{ - g_assert (c); - - g_return_val_if_fail (c, NULL); - g_return_val_if_fail (sibling, NULL); - g_return_val_if_fail (c != sibling, NULL); - - /* assert_no_duplicates (c); */ - - set_parent (sibling, c->parent); - - /* find the last sibling and append; first we try our cache - * 'last', otherwise we need to walk the chain. We use a - * cached last as to avoid walking the chain (which is - * O(n*n)) */ - if (c->last) - c->last->next = sibling; - else { - /* no 'last' cached, so walk the chain */ - MuContainer *c2; - for (c2 = c; c2 && c2->next; c2 = c2->next); - c2->next = sibling; - } - /* update the cached last */ - c->last = sibling->last ? sibling->last : sibling; - - /* assert_no_duplicates (c); */ - - return c; -} - -MuContainer* -mu_container_remove_sibling (MuContainer *c, MuContainer *sibling) -{ - MuContainer *cur, *prev; - - g_return_val_if_fail (c, NULL); - g_return_val_if_fail (sibling, NULL); - - for (prev = NULL, cur = c; cur; cur = cur->next) { - - if (cur == sibling) { - if (!prev) - c = cur->next; - else - prev->next = cur->next; - break; - } - prev = cur; - } - - /* unset the cached last; it's not valid anymore - * - * TODO: we could actually do a better job updating last - * rather than invalidating it. */ - if (c) - c->last = NULL; - - return c; -} - -MuContainer* -mu_container_append_children (MuContainer *c, MuContainer *child) -{ - g_return_val_if_fail (c, NULL); - g_return_val_if_fail (child, NULL); - g_return_val_if_fail (c != child, NULL); - - /* assert_no_duplicates (c); */ - - set_parent (child, c); - if (!c->child) - c->child = child; - else - c->child = mu_container_append_siblings (c->child, child); - - /* assert_no_duplicates (c->child); */ - - return c; -} - - -MuContainer* -mu_container_remove_child (MuContainer *c, MuContainer *child) -{ - g_return_val_if_fail (c, NULL); - g_return_val_if_fail (child, NULL); - - /* g_assert (!child->child); */ - /* g_return_val_if_fail (!child->child, NULL); */ - g_return_val_if_fail (c != child, NULL); - - c->child = mu_container_remove_sibling (c->child, child); - - return c; -} - -typedef void (*MuContainerPathForeachFunc) (MuContainer*, gpointer, Path*); - -static void -mu_container_path_foreach_real (MuContainer *c, guint level, Path *path, - MuContainerPathForeachFunc func, - gpointer user_data) -{ - if (!c) - return; - - path_inc (path, level); - func (c, user_data, path); - - /* children */ - mu_container_path_foreach_real (c->child, level + 1, path, - func, user_data); - - /* siblings */ - mu_container_path_foreach_real (c->next, level, path, func, user_data); -} - -static void -mu_container_path_foreach (MuContainer *c, MuContainerPathForeachFunc func, - gpointer user_data) -{ - Path *path; - - path = path_new (100); - - mu_container_path_foreach_real (c, 0, path, func, user_data); - - path_destroy (path); -} - - -gboolean -mu_container_foreach (MuContainer *c, MuContainerForeachFunc func, - gpointer user_data) -{ - g_return_val_if_fail (func, FALSE); - - if (!c) - return TRUE; - - if (!mu_container_foreach (c->child, func, user_data)) - return FALSE; /* recurse into children */ - - /* recurse into siblings */ - if (!mu_container_foreach (c->next, func, user_data)) - return FALSE; - - return func (c, user_data); -} - -MuContainer* -mu_container_splice_children (MuContainer *c, MuContainer *sibling) -{ - MuContainer *children; - - g_return_val_if_fail (c, NULL); - g_return_val_if_fail (sibling, NULL); - - children = sibling->child; - sibling->child = NULL; - - return mu_container_append_siblings (c, children); -} - -MuContainer* -mu_container_splice_grandchildren (MuContainer *parent, MuContainer *child) -{ - MuContainer *newchild; - - g_return_val_if_fail (parent, NULL); - g_return_val_if_fail (child, NULL); - g_return_val_if_fail (parent != child, NULL); - - newchild = child->child; - child->child=NULL; - - return mu_container_append_children (parent, newchild); -} - - -static GSList* -mu_container_to_list (MuContainer *c) -{ - GSList *lst; - - for (lst = NULL; c; c = c->next) - lst = g_slist_prepend (lst, c); - - return lst; -} - -static gpointer -list_last_data (GSList *lst) -{ - GSList *tail; - - tail = g_slist_last (lst); - - return tail->data; -} - -static MuContainer* -mu_container_from_list (GSList *lst) -{ - MuContainer *c, *cur, *tail; - - if (!lst) - return NULL; - - tail = (MuContainer*)list_last_data (lst); - for (c = cur = (MuContainer*)lst->data; cur; lst = g_slist_next(lst)) { - cur->next = lst ? (MuContainer*)lst->data : NULL; - cur->last = tail; - cur=cur->next; - } - - return c; -} - -struct _SortFuncData { - MuMsgFieldId mfid; - gboolean descending; - gpointer user_data; -}; -typedef struct _SortFuncData SortFuncData; - -static int -container_cmp (MuContainer *a, MuContainer *b, MuMsgFieldId mfid) -{ - if (a == b) - return 0; - else if (!a->msg) - return -1; - else if (!b->msg) - return 1; - - return mu_msg_cmp (a->msg, b->msg, mfid); -} - -static int -sort_func_root (MuContainer *a, MuContainer *b, SortFuncData *data) -{ - if (data->descending) - return container_cmp (b->leader, a->leader, data->mfid); - else - return container_cmp (a->leader, b->leader, data->mfid); -} - -static int -sort_func_child (MuContainer *a, MuContainer *b, SortFuncData *data) -{ - return container_cmp (a, b, data->mfid); -} - -static MuContainer* -container_sort(MuContainer *c, GCompareDataFunc func, SortFuncData *sfdata) -{ - GSList *lst; - - lst = mu_container_to_list (c); - lst = g_slist_sort_with_data (lst, func, sfdata); - c = mu_container_from_list (lst); - g_slist_free (lst); - - return c; -} - -static MuContainer* -container_sort_child (MuContainer *c, SortFuncData *sfdata) -{ - MuContainer *cur, *leader; - - if (!c) - return NULL; - - /* find leader */ - leader = c->leader; - for (cur = c; cur; cur = cur->next) { - if (cur->child) - cur->child = container_sort_child (cur->child, sfdata); - if (container_cmp (cur->leader, leader, sfdata->mfid) > 0) - leader = cur->leader; - } - - c = container_sort(c, (GCompareDataFunc)sort_func_child, sfdata); - - /* set parent's leader to the one found */ - c->parent->leader = leader; - - return c; -} - -static MuContainer* -container_sort_root (MuContainer *c, SortFuncData *sfdata) -{ - MuContainer *cur; - - if (!c) - return NULL; - - for (cur = c; cur; cur = cur->next) { - if (cur->child) - cur->child = container_sort_child (cur->child, sfdata); - } - - return container_sort (c, (GCompareDataFunc)sort_func_root, sfdata); -} - -MuContainer* -mu_container_sort (MuContainer *c, MuMsgFieldId mfid, gboolean descending, - gpointer user_data) -{ - SortFuncData sfdata; - - sfdata.mfid = mfid; - sfdata.descending = descending; - sfdata.user_data = user_data; - - g_return_val_if_fail (c, NULL); - g_return_val_if_fail (mu_msg_field_id_is_valid(mfid), NULL); - - return container_sort_root (c, &sfdata); -} - - -static gboolean -unequal (MuContainer *a, MuContainer *b) -{ - return a == b ? FALSE : TRUE; -} - - -gboolean -mu_container_reachable (MuContainer *haystack, MuContainer *needle) -{ - g_return_val_if_fail (haystack, FALSE); - g_return_val_if_fail (needle, FALSE); - - if (!mu_container_foreach - (haystack, (MuContainerForeachFunc)unequal, needle)) - return TRUE; - - return FALSE; -} - - -static gboolean -dump_container (MuContainer *c) -{ - const gchar* subject; - - if (!c) { - g_print ("\n"); - return TRUE; - } - - subject = (c->msg) ? mu_msg_get_subject (c->msg) : ""; - - g_print ("[%s][%s m:%p p:%p docid:%u %s]\n",c->msgid, subject, (void*)c, - (void*)c->parent, c->docid, - c->msg ? mu_msg_get_path (c->msg) : ""); - - return TRUE; -} - - -void -mu_container_dump (MuContainer *c, gboolean recursive) -{ - g_return_if_fail (c); - - if (!recursive) - dump_container (c); - else - mu_container_foreach - (c, - (MuContainerForeachFunc)dump_container, - NULL); -} - - - -static Path* -path_new (guint initial) -{ - Path *p; - - p = g_slice_new0 (Path); - - p->_data = g_new0 (int, initial); - p->_len = initial; - - return p; -} - -static void -path_destroy (Path *p) -{ - if (!p) - return; - - g_free (p->_data); - g_slice_free (Path, p); -} - -static void -path_inc (Path *p, guint index) -{ - if (index + 1 >= p->_len) { - p->_data = g_renew (int, p->_data, 2 * p->_len); - memset (&p->_data[p->_len], 0, p->_len); - p->_len *= 2; - } - - ++p->_data[index]; - p->_data[index + 1] = 0; -} - - -static gchar* -path_to_string (Path *p, const char* frmt) -{ - char *str; - guint u; - - if (!p->_data) - return NULL; - - for (u = 0, str = NULL; p->_data[u] != 0; ++u) { - - char segm[16]; - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wformat-nonliteral" - g_snprintf (segm, sizeof(segm), frmt, p->_data[u] - 1); -#pragma GCC diagnostic pop - - if (!str) - str = g_strdup (segm); - else { - gchar *tmp; - tmp = g_strdup_printf ("%s:%s", str, segm); - g_free (str); - str = tmp; - } - } - - return str; -} - -static unsigned -count_colons (const char *str) -{ - unsigned num; - - num = 0; - while (str++ && *str) - if (*str == ':') - ++num; - - return num; -} - - - -static MuMsgIterThreadInfo* -thread_info_new (gchar *threadpath, gboolean root, gboolean first_child, - gboolean last_child, gboolean empty_parent, - gboolean has_child, gboolean is_dup) -{ - MuMsgIterThreadInfo *ti; - - ti = g_slice_new (MuMsgIterThreadInfo); - ti->threadpath = threadpath; - ti->level = count_colons (threadpath); /* hacky... */ - - ti->prop = MU_MSG_ITER_THREAD_PROP_NONE; - ti->prop |= root ? MU_MSG_ITER_THREAD_PROP_ROOT : 0; - ti->prop |= first_child ? MU_MSG_ITER_THREAD_PROP_FIRST_CHILD : 0; - ti->prop |= last_child ? MU_MSG_ITER_THREAD_PROP_LAST_CHILD : 0; - ti->prop |= empty_parent ? MU_MSG_ITER_THREAD_PROP_EMPTY_PARENT : 0; - ti->prop |= is_dup ? MU_MSG_ITER_THREAD_PROP_DUP : 0; - ti->prop |= has_child ? MU_MSG_ITER_THREAD_PROP_HAS_CHILD : 0; - - return ti; -} - -static void -thread_info_destroy (MuMsgIterThreadInfo *ti) -{ - if (ti) { - g_free (ti->threadpath); - g_slice_free (MuMsgIterThreadInfo, ti); - } -} - - -struct _ThreadInfo { - GHashTable *hash; - const char *format; -}; -typedef struct _ThreadInfo ThreadInfo; - - -static void -add_to_thread_info_hash (GHashTable *thread_info_hash, MuContainer *c, - char *threadpath) -{ - gboolean is_root, first_child, last_child, empty_parent, is_dup, has_child; - - /* 'root' means we're a child of the dummy root-container */ - is_root = (c->parent == NULL); - - first_child = is_root ? FALSE : (c->parent->child == c); - last_child = is_root ? FALSE : (c->next == NULL); - empty_parent = is_root ? FALSE : (!c->parent->msg); - is_dup = c->flags & MU_CONTAINER_FLAG_DUP; - has_child = c->child ? TRUE : FALSE; - - g_hash_table_insert (thread_info_hash, - GUINT_TO_POINTER(c->docid), - thread_info_new (threadpath, - is_root, - first_child, - last_child, - empty_parent, - has_child, - is_dup)); -} - -/* device a format string that is the minimum size to fit up to - * matchnum matches -- returns static memory */ -static const char* -thread_segment_format_string (size_t matchnum) -{ - unsigned digitnum; - static char frmt[16]; - - /* get the number of digits needed in a hex-representation of - * matchnum */ - digitnum = (unsigned) (ceil (log(matchnum)/log(16))); - g_snprintf (frmt, sizeof(frmt), "%%0%ux", digitnum); - - return frmt; -} - -static gboolean -add_thread_info (MuContainer *c, ThreadInfo *ti, Path *path) -{ - gchar *pathstr; - - pathstr = path_to_string (path, ti->format); - add_to_thread_info_hash (ti->hash, c, pathstr); - - return TRUE; -} - - -GHashTable* -mu_container_thread_info_hash_new (MuContainer *root_set, size_t matchnum) -{ - ThreadInfo ti; - - g_return_val_if_fail (root_set, NULL); - g_return_val_if_fail (matchnum > 0, NULL); - - /* create hash docid => thread-info */ - ti.hash = g_hash_table_new_full (g_direct_hash, g_direct_equal, - NULL, - (GDestroyNotify)thread_info_destroy); - - ti.format = thread_segment_format_string (matchnum); - - mu_container_path_foreach (root_set, - (MuContainerPathForeachFunc)add_thread_info, - &ti); - - return ti.hash; -} diff --git a/lib/mu-container.hh b/lib/mu-container.hh deleted file mode 100644 index ed17ed7c..00000000 --- a/lib/mu-container.hh +++ /dev/null @@ -1,223 +0,0 @@ -/* -** Copyright (C) 2011-2013 Dirk-Jan C. Binnema -** -** This program is free software; you can redistribute it and/or modify it -** under the terms of the GNU General Public License as published by the -** Free Software Foundation; either version 3, or (at your option) any -** later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software Foundation, -** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -** -*/ - -#ifndef MU_CONTAINER_HH__ -#define MU_CONTAINER_HH__ - -#include -#include -#include - -enum MuContainerFlag { - MU_CONTAINER_FLAG_NONE = 0, - MU_CONTAINER_FLAG_DELETE = 1 << 0, - MU_CONTAINER_FLAG_SPLICE = 1 << 1, - MU_CONTAINER_FLAG_DUP = 1 << 2 -}; -MU_ENABLE_BITOPS(MuContainerFlag); - -/* - * MuContainer data structure, as seen in JWZs document: - * http://www.jwz.org/doc/threading.html - */ -struct MuContainer { - struct MuContainer *parent, *child, *next; - - /* note: we cache the last of the string of next->next->... - * `mu_container_append_siblings' shows up high in the - * profiles since it needs to walk to the end, and this give - * O(n*n) behavior. - * */ - struct MuContainer *last; - - /* Node in the subtree rooted at this node which comes first - * in the descending sort order, e.g. the latest message if - * sorting by date. We compare the leaders when ordering - * subtrees. */ - struct MuContainer *leader; - - MuMsg *msg; - const char *msgid; - - unsigned docid; - MuContainerFlag flags; -}; - - -/** - * create a new Container object - * - * @param msg a MuMsg, or NULL; when it's NULL, docid should be 0 - * @param docid a Xapian docid, or 0 - * @param msgid a message id, or NULL - * - * @return a new Container instance, or NULL in case of error; free - * with mu_container_destroy - */ -MuContainer* mu_container_new (MuMsg *msg, guint docid, const char* msgid); - - -/** - * free a Container object - * - * @param c a Container object, or NULL - */ -void mu_container_destroy (MuContainer *c); - - - -/** - * append new child(ren) to this container; the child(ren) container's - * parent pointer will point to this one - * - * @param c a Container instance - * @param child a child - * - * @return the Container instance with a child added - */ -MuContainer* mu_container_append_children (MuContainer *c, MuContainer *child); - -/** - * append a new sibling to this (list of) containers; all the siblings - * will get the same parent that @c has - * - * @param c a container instance - * @param sibling a sibling - * - * @return the container (list) with the sibling(s) appended - */ -MuContainer* mu_container_append_siblings (MuContainer *c, MuContainer *sibling); - -/** - * remove a _single_ child container from a container - * - * @param c a container instance - * @param child the child container to remove - * - * @return the container with the child removed; if the container did - * have this child, nothing changes - */ -MuContainer* mu_container_remove_child (MuContainer *c, MuContainer *child); - -/** - * remove a _single_ sibling container from a container - * - * @param c a container instance - * @param sibling the sibling container to remove - * - * @return the container with the sibling removed; if the container did - * have this sibling, nothing changes - */ -MuContainer* mu_container_remove_sibling (MuContainer *c, MuContainer *sibling); - -/** - * promote sibling's children to be this container's siblings - * - * @param c a container instance - * @param sibling a sibling of this container - * - * @return the container with the sibling's children promoted - */ - -MuContainer* mu_container_splice_children (MuContainer *c, - MuContainer *sibling); - -/** - * promote child's children to be parent's children - * - * @param parent a container instance - * @param child a child of this container - * - * @return the new container with it's children's children promoted - */ -MuContainer* mu_container_splice_grandchildren (MuContainer *parent, - MuContainer *child); - -typedef gboolean (*MuContainerForeachFunc) (MuContainer*, gpointer); - -/** - * execute some function on all siblings an children of some container - * (recursively) until all children have been visited or the callback - * function returns FALSE - * - * @param c a container - * @param func a function to call for each container - * @param user_data a pointer to pass to the callback function - * - * @return - */ -gboolean mu_container_foreach (MuContainer *c, - MuContainerForeachFunc func, - gpointer user_data); - -/** - * check whether container needle is a child or sibling (recursively) - * of container haystack - * - * @param haystack a container - * @param needle a container - * - * @return TRUE if needle is reachable from haystack, FALSE otherwise - */ -gboolean mu_container_reachable (MuContainer *haystack, MuContainer *needle); - - -/** - * dump the container to stdout (for debugging) - * - * @param c a container - * @param recursive whether to include siblings, children - */ -void mu_container_dump (MuContainer *c, gboolean recursive); - - -typedef int (*MuContainerCmpFunc) (MuContainer *c1, MuContainer *c2, - gpointer user_data); - -/** - * sort the tree of MuContainers, recursively; ie. each of the list of - * siblings (children) will be sorted according to @func; if the - * container is empty, the first non-empty 'leftmost' child is used. - * - * @param c a container - * @param mfid the field to sort by - * @param revert if TRUE, revert the sorting order * - * @param user_data a user pointer to pass to the sorting function - * - * @return a sorted container - */ -MuContainer* mu_container_sort (MuContainer *c, MuMsgFieldId mfid, - gboolean revert, - gpointer user_data); - - -/** - * create a hashtable with maps document-ids to information about them, - * ie. Xapian docid => MuMsgIterThreadInfo - * - * @param root_set the containers @param matchnum the number of - * matches in the list (this is needed to determine the shortest - * possible collation keys ('threadpaths') for the messages - * - * @return a hash; free with g_hash_table_destroy - */ -GHashTable* mu_container_thread_info_hash_new (MuContainer *root_set, - size_t matchnum); - -#endif /*MU_CONTAINER_HH__*/ diff --git a/lib/mu-msg-iter.cc b/lib/mu-msg-iter.cc deleted file mode 100644 index 34b88ec4..00000000 --- a/lib/mu-msg-iter.cc +++ /dev/null @@ -1,437 +0,0 @@ -/* -*- mode: c++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- -** -** Copyright (C) 2008-2013 Dirk-Jan C. Binnema -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 3 of the License, or -** (at your option) any later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software Foundation, -** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -** -*/ - -#include -#include - -#include - -#include -#include -#include -#include - -#include -#include -#include - -#include "utils/mu-util.h" -#include "utils/mu-utils.hh" - -#include "mu-msg.h" -#include "mu-msg-iter.h" -#include "mu-threader.hh" - -struct ltstr { - bool operator () (const std::string &s1, - const std::string &s2) const { - return g_strcmp0 (s1.c_str(), s2.c_str()) < 0; - } -}; -typedef std::map msgid_docid_map; - -class ThreadKeyMaker: public Xapian::KeyMaker { -public: - ThreadKeyMaker (GHashTable *threadinfo): _threadinfo(threadinfo) {} - virtual std::string operator()(const Xapian::Document &doc) const { - MuMsgIterThreadInfo *ti; - ti = (MuMsgIterThreadInfo*)g_hash_table_lookup - (_threadinfo, - GUINT_TO_POINTER(doc.get_docid())); - return std::string (ti && ti->threadpath ? ti->threadpath : ""); - } -private: - GHashTable *_threadinfo; -}; - -struct _MuMsgIter { -public: - _MuMsgIter (Xapian::Enquire &enq, size_t maxnum, - MuMsgFieldId sortfield, MuMsgIterFlags flags): - _enq(enq), _thread_hash (0), _msg(0), _flags(flags), - _skip_unreadable(flags & MU_MSG_ITER_FLAG_SKIP_UNREADABLE), - _skip_dups (flags & MU_MSG_ITER_FLAG_SKIP_DUPS) { - - bool descending = (flags & MU_MSG_ITER_FLAG_DESCENDING); - bool threads = (flags & MU_MSG_ITER_FLAG_THREADS); - - // first, we get _all_ matches (G_MAXINT), based the threads - // on that, then return of those - _matches = _enq.get_mset (0, G_MAXINT); - - if (_matches.empty()) - return; - - if (threads) { - _matches.fetch(); - _cursor = _matches.begin(); - // NOTE: temporarily turn-off skipping duplicates, since we - // need threadinfo for *all* - _skip_dups = false; - _thread_hash = mu_threader_calculate - (this, _matches.size(), sortfield, descending); - _skip_dups = (flags & MU_MSG_ITER_FLAG_SKIP_DUPS); - ThreadKeyMaker keymaker(_thread_hash); - enq.set_sort_by_key (&keymaker, false); - _matches = _enq.get_mset (0, maxnum); - - } else if (sortfield != MU_MSG_FIELD_ID_NONE) { - enq.set_sort_by_value ((Xapian::valueno)sortfield, - descending); - _matches = _enq.get_mset (0, maxnum); - _cursor = _matches.begin(); - } - _cursor = _matches.begin(); - } - - ~_MuMsgIter () { - if (_thread_hash) - g_hash_table_destroy (_thread_hash); - - set_msg (NULL); - } - - const Xapian::Enquire& enquire() const { return _enq; } - Xapian::MSet& matches() { return _matches; } - - Xapian::MSet::const_iterator cursor () const { return _cursor; } - void set_cursor (Xapian::MSetIterator cur) { _cursor = cur; } - void cursor_next () { ++_cursor; } - - GHashTable *thread_hash () { return _thread_hash; } - - MuMsg *msg() const { return _msg; } - MuMsg *set_msg (MuMsg *msg) { - if (_msg) - mu_msg_unref (_msg); - return _msg = msg; - } - - MuMsgIterFlags flags() const { return _flags; } - - const std::string msgid () const { - const Xapian::Document doc (cursor().get_document()); - return doc.get_value(MU_MSG_FIELD_ID_MSGID); - } - - unsigned docid () const { - const Xapian::Document doc (cursor().get_document()); - return doc.get_docid(); - } - - bool looks_like_dup () const { - try { - const Xapian::Document doc (cursor().get_document()); - // is this message in the preferred map? if - // so, it's not a duplicate, otherwise, it - // isn't - msgid_docid_map::const_iterator pref_iter (_preferred_map.find (msgid())); - if (pref_iter != _preferred_map.end()) { - //std::cerr << "in the set!" << std::endl; - if ((*pref_iter).second == docid()) - return false; // in the set: not a dup! - else - return true; - } - - // otherwise, simply check if we've already seen this message-id, - // and, if so, it's considered a dup - if (_msg_uid_set.find (msgid()) != _msg_uid_set.end()) { - return true; - } else { - _msg_uid_set.insert (msgid()); - return false; - } - } catch (...) { - return true; - } - } - - static void each_preferred (const char *msgid, gpointer docidp, - msgid_docid_map *preferred_map) { - (*preferred_map)[msgid] = GPOINTER_TO_SIZE(docidp); - } - - void set_preferred_map (GHashTable *preferred_hash) { - if (!preferred_hash) - _preferred_map.clear(); - else - g_hash_table_foreach (preferred_hash, - (GHFunc)each_preferred, &_preferred_map); - } - - bool skip_dups () const { return _skip_dups; } - bool skip_unreadable () const { return _skip_unreadable; } - -private: - const Xapian::Enquire _enq; - Xapian::MSet _matches; - Xapian::MSet::const_iterator _cursor; - - GHashTable *_thread_hash; - MuMsg *_msg; - - MuMsgIterFlags _flags; - - mutable std::set _msg_uid_set; - bool _skip_unreadable; - - // the 'preferred map' (msgid->docid) is used when checking - // for duplicates; if a message is in the preferred map, it - // will not be excluded (but other messages with the same - // msgid will) - msgid_docid_map _preferred_map; - bool _skip_dups; -}; - -static gboolean -is_msg_file_readable (MuMsgIter *iter) -{ - gboolean readable; - std::string path - (iter->cursor().get_document().get_value(MU_MSG_FIELD_ID_PATH)); - - if (path.empty()) - return FALSE; - - readable = (access (path.c_str(), R_OK) == 0) ? TRUE : FALSE; - return readable; -} - - -MuMsgIter* -mu_msg_iter_new (XapianEnquire *enq, size_t maxnum, - MuMsgFieldId sortfield, MuMsgIterFlags flags, - GError **err) -{ - g_return_val_if_fail (enq, NULL); - /* sortfield should be set to .._NONE when we're not threading */ - g_return_val_if_fail (mu_msg_field_id_is_valid (sortfield) || - sortfield == MU_MSG_FIELD_ID_NONE, - FALSE); - try { - MuMsgIter *iter (new MuMsgIter ((Xapian::Enquire&)*enq, - maxnum, - sortfield, - flags)); - // note: we check if it's a dup even for the first message, - // since we need its uid in the set for checking later messages - if ((iter->skip_unreadable() && !is_msg_file_readable (iter)) || - (iter->skip_dups() && iter->looks_like_dup ())) - mu_msg_iter_next (iter); /* skip! */ - - return iter; - - } catch (const Xapian::DatabaseModifiedError &dbmex) { - mu_util_g_set_error (err, MU_ERROR_XAPIAN_MODIFIED, - "database was modified; please reopen"); - return 0; - - } MU_XAPIAN_CATCH_BLOCK_G_ERROR_RETURN (err, MU_ERROR_XAPIAN, 0); -} - -void -mu_msg_iter_destroy (MuMsgIter *iter) -{ - try { delete iter; } MU_XAPIAN_CATCH_BLOCK; -} - -void -mu_msg_iter_set_preferred (MuMsgIter *iter, GHashTable *preferred_hash) -{ - g_return_if_fail (iter); - iter->set_preferred_map (preferred_hash); -} - -MuMsg* -mu_msg_iter_get_msg_floating (MuMsgIter *iter) -{ - g_return_val_if_fail (iter, NULL); - g_return_val_if_fail (!mu_msg_iter_is_done(iter), NULL); - - try { - MuMsg *msg; - GError *err; - Xapian::Document *docp; - - docp = new Xapian::Document(iter->cursor().get_document()); - - err = NULL; - msg = iter->set_msg (mu_msg_new_from_doc((XapianDocument*)docp, - &err)); - if (!msg) - MU_HANDLE_G_ERROR(err); - - return msg; - - } MU_XAPIAN_CATCH_BLOCK_RETURN (NULL); -} - -gboolean -mu_msg_iter_reset (MuMsgIter *iter) -{ - g_return_val_if_fail (iter, FALSE); - - iter->set_msg (NULL); - - try { - iter->set_cursor(iter->matches().begin()); - - } MU_XAPIAN_CATCH_BLOCK_RETURN (FALSE); - - return TRUE; -} - -gboolean -mu_msg_iter_next (MuMsgIter *iter) -{ - g_return_val_if_fail (iter, FALSE); - - iter->set_msg (NULL); - - if (mu_msg_iter_is_done(iter)) - return FALSE; - - try { - iter->cursor_next(); - - if (iter->cursor() == iter->matches().end()) - return FALSE; - - if ((iter->skip_unreadable() && !is_msg_file_readable (iter)) || - (iter->skip_dups() && iter->looks_like_dup ())) - return mu_msg_iter_next (iter); /* skip! */ - - return TRUE; - - } MU_XAPIAN_CATCH_BLOCK_RETURN(FALSE); -} - - -gboolean -mu_msg_iter_is_done (MuMsgIter *iter) -{ - g_return_val_if_fail (iter, TRUE); - - try { - return iter->cursor() == iter->matches().end() ? TRUE : FALSE; - - } MU_XAPIAN_CATCH_BLOCK_RETURN (TRUE); -} - -gboolean -mu_msg_iter_is_first (MuMsgIter *iter) -{ - g_return_val_if_fail (iter, FALSE); - - return iter->cursor() == iter->matches().begin(); -} - -gboolean -mu_msg_iter_is_last (MuMsgIter *iter) -{ - g_return_val_if_fail (iter, FALSE); - - if (mu_msg_iter_is_done (iter)) - return FALSE; - - return iter->cursor() + 1 == iter->matches().end(); -} - -/* hmmm.... is it impossible to get a 0 docid, or just very improbable? */ -unsigned -mu_msg_iter_get_docid (MuMsgIter *iter) -{ - g_return_val_if_fail (iter, (unsigned int)-1); - g_return_val_if_fail (!mu_msg_iter_is_done(iter), - (unsigned int)-1); - try { - return iter->docid(); - - } MU_XAPIAN_CATCH_BLOCK_RETURN ((unsigned int)-1); -} - - -char* -mu_msg_iter_get_msgid (MuMsgIter *iter) -{ - g_return_val_if_fail (iter, NULL); - g_return_val_if_fail (!mu_msg_iter_is_done(iter), NULL); - - try { - return g_strdup (iter->msgid().c_str()); - - } MU_XAPIAN_CATCH_BLOCK_RETURN (NULL); -} - -char** -mu_msg_iter_get_refs (MuMsgIter *iter) -{ - g_return_val_if_fail (iter, NULL); - g_return_val_if_fail (!mu_msg_iter_is_done(iter), NULL); - - try { - std::string refs ( - iter->cursor().get_document().get_value(MU_MSG_FIELD_ID_REFS)); - if (refs.empty()) - return NULL; - return g_strsplit (refs.c_str(),",", -1); - - } MU_XAPIAN_CATCH_BLOCK_RETURN (NULL); -} - -char* -mu_msg_iter_get_thread_id (MuMsgIter *iter) -{ - g_return_val_if_fail (iter, NULL); - g_return_val_if_fail (!mu_msg_iter_is_done(iter), NULL); - - try { - const std::string thread_id ( - iter->cursor().get_document().get_value(MU_MSG_FIELD_ID_THREAD_ID).c_str()); - return thread_id.empty() ? NULL : g_strdup (thread_id.c_str()); - - } MU_XAPIAN_CATCH_BLOCK_RETURN (NULL); -} - -const MuMsgIterThreadInfo* -mu_msg_iter_get_thread_info (MuMsgIter *iter) -{ - g_return_val_if_fail (!mu_msg_iter_is_done(iter), NULL); - - /* maybe we don't have thread info */ - if (!iter->thread_hash()) - return NULL; - - try { - const MuMsgIterThreadInfo *ti; - unsigned int docid; - - docid = mu_msg_iter_get_docid (iter); - ti = (const MuMsgIterThreadInfo*)g_hash_table_lookup - (iter->thread_hash(), GUINT_TO_POINTER(docid)); - - if (!ti) - g_warning ("no ti for %u\n", docid); - - return ti; - - } MU_XAPIAN_CATCH_BLOCK_RETURN (NULL); -} diff --git a/lib/mu-msg-iter.h b/lib/mu-msg-iter.h deleted file mode 100644 index bce6a502..00000000 --- a/lib/mu-msg-iter.h +++ /dev/null @@ -1,246 +0,0 @@ -/* -** Copyright (C) 2008-2013 Dirk-Jan C. Binnema -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 3 of the License, or -** (at your option) any later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software Foundation, -** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -** -*/ - -#ifndef __MU_MSG_ITER_H__ -#define __MU_MSG_ITER_H__ - -#include -#include - -G_BEGIN_DECLS - - -/** - * MuMsgIter is a structure to iterate over the results of a - * query. You can iterate only in one-direction, and you can do it - * only once. - * - */ - -struct _MuMsgIter; -typedef struct _MuMsgIter MuMsgIter; - - -enum _MuMsgIterFlags { - MU_MSG_ITER_FLAG_NONE = 0, - /* sort Z->A (only for threads) */ - MU_MSG_ITER_FLAG_DESCENDING = 1 << 0, - /* ignore results for which there is no existing - * readable message-file? */ - MU_MSG_ITER_FLAG_SKIP_UNREADABLE = 1 << 1, - /* ignore duplicate messages? */ - MU_MSG_ITER_FLAG_SKIP_DUPS = 1 << 2, - /* calculate threads? */ - MU_MSG_ITER_FLAG_THREADS = 1 << 3 -}; -typedef unsigned MuMsgIterFlags; - -/** - * create a new MuMsgIter -- basically, an iterator over the search - * results - * - * @param enq a Xapian::Enquire* cast to XapianEnquire* (because this - * is C, not C++),providing access to search results - * @param maxnum the maximum number of results - * @param sortfield field to sort by - * @param flags flags for this iterator (see MsgIterFlags) - - * @param err receives error information. if the error is - * MU_ERROR_XAPIAN_MODIFIED, the database should be reloaded. - * - * @return a new MuMsgIter, or NULL in case of error - */ -MuMsgIter *mu_msg_iter_new (XapianEnquire *enq, - size_t maxnum, - MuMsgFieldId sortfield, - MuMsgIterFlags flags, - GError **err) G_GNUC_WARN_UNUSED_RESULT; - -/** - * get the next message (which you got from - * e.g. mu_query_run) - * - * @param iter a valid MuMsgIter iterator - * - * @return TRUE if it succeeded, FALSE otherwise (e.g., because there - * are no more messages in the query result) - */ -gboolean mu_msg_iter_next (MuMsgIter *iter); - -/** - * Does this iterator point to the first item? - * - * @param iter a valid MuMsgIter iterator - * - * @return TRUE or FALSE - */ -gboolean mu_msg_iter_is_first (MuMsgIter *iter); - -/** - * Does this iterator point to the last item? - * - * @param iter a valid MuMsgIter iterator - * - * @return TRUE or FALSE - */ -gboolean mu_msg_iter_is_last (MuMsgIter *iter); - - -/** - * reset the iterator to the beginning - * - * @param iter a valid MuMsgIter iterator - * - * @return TRUE if it succeeded, FALSE otherwise - */ -gboolean mu_msg_iter_reset (MuMsgIter *iter); - -/** - * does this iterator point past the end of the list? - * - * @param iter a valid MuMsgIter iterator - * - * @return TRUE if the iter points past end of the list, FALSE - * otherwise - */ -gboolean mu_msg_iter_is_done (MuMsgIter *iter); - - -/** - * destroy the sequence of messages; ie. /all/ of them - * - * @param msg a valid MuMsgIter message or NULL - */ -void mu_msg_iter_destroy (MuMsgIter *iter); - - -/** - * get the corresponding MuMsg for this iter; this instance is owned - * by MuMsgIter, and becomes invalid after either mu_msg_iter_destroy - * or mu_msg_iter_next. _do not_ unref it; it's a floating reference. - * - * @param iter a valid MuMsgIter instance* - * - * @return a MuMsg instance, or NULL in case of error - */ -MuMsg* mu_msg_iter_get_msg_floating (MuMsgIter *iter) - G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT; - -/** - * Provide a preferred_hash, which is a hashtable msgid->docid to - * indicate the messages which should /not/ be seen as duplicates. - * - * @param iter a valid MuMsgIter iterator - * @param preferred_hash a hashtable msgid->docid of message /not/ to - * mark as duplicates, or NULL - */ -void mu_msg_iter_set_preferred (MuMsgIter *iter, GHashTable *preferred_hash); - -/** - * get the document id for the current message - * - * @param iter a valid MuMsgIter iterator - * - * @return the docid or (unsigned int)-1 in case of error - */ -guint mu_msg_iter_get_docid (MuMsgIter *iter); - - -/** - * calculate the message threads - * - * @param iter a valid MuMsgIter iterator - * - * @return TRUE if it worked, FALSE otherwise. - */ -gboolean mu_msg_iter_calculate_threads (MuMsgIter *iter); - - -enum _MuMsgIterThreadProp { - MU_MSG_ITER_THREAD_PROP_NONE = 0 << 0, - - MU_MSG_ITER_THREAD_PROP_ROOT = 1 << 0, - MU_MSG_ITER_THREAD_PROP_FIRST_CHILD = 1 << 1, - MU_MSG_ITER_THREAD_PROP_LAST_CHILD = 1 << 2, - MU_MSG_ITER_THREAD_PROP_EMPTY_PARENT = 1 << 3, - MU_MSG_ITER_THREAD_PROP_DUP = 1 << 4, - MU_MSG_ITER_THREAD_PROP_HAS_CHILD = 1 << 5 -}; -typedef guint8 MuMsgIterThreadProp; - -struct _MuMsgIterThreadInfo { - gchar *threadpath; /* a string describing the thread-path in - * such a way that we can sort by this - * string to get the right order. */ - guint level; /* thread-depth -- [0...] */ - MuMsgIterThreadProp prop; -}; -typedef struct _MuMsgIterThreadInfo MuMsgIterThreadInfo; - -/** - * get a the MuMsgThreaderInfo struct for this message; this only - * works when you created the mu-msg-iter with threading enabled - * (otherwise, return NULL) - * - * @param iter a valid MuMsgIter iterator - * - * @return an info struct - */ -const MuMsgIterThreadInfo* mu_msg_iter_get_thread_info (MuMsgIter *iter); - -/** - * get the message-id for this message - * - * @param iter a valid MuMsgIter iterator - * - * @return the message-id; free with g_free(). - */ -char* mu_msg_iter_get_msgid (MuMsgIter *iter) - G_GNUC_WARN_UNUSED_RESULT; - -/** - * get the list of references for this messages as a NULL-terminated - * string array - * - * @param iter a valid MuMsgIter iterator - * - * @return a NULL-terminated string array. free with g_strfreev when - * it's no longer needed. - */ -char** mu_msg_iter_get_refs (MuMsgIter *iter) - G_GNUC_WARN_UNUSED_RESULT; - - -/** - * get the thread-id for this message - * - * @param iter a valid MuMsgIter iterator - * - * @return the thread-id; free with g_free(). - */ -char* mu_msg_iter_get_thread_id (MuMsgIter *iter) - G_GNUC_WARN_UNUSED_RESULT; - - -/* FIXME */ -const char* mu_msg_iter_get_path (MuMsgIter *iter); - -G_END_DECLS - -#endif /*__MU_MSG_ITER_H__*/ diff --git a/lib/mu-query-match-deciders.cc b/lib/mu-query-match-deciders.cc new file mode 100644 index 00000000..a3190c5f --- /dev/null +++ b/lib/mu-query-match-deciders.cc @@ -0,0 +1,231 @@ +/* +** Copyright (C) 2020 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#include "mu-query-match-deciders.hh" + +#include "mu-query-results.hh" +#include "utils/mu-option.hh" + +using namespace Mu; + +// We use a MatchDecider to gather information about the matches, and decide +// whether to include them in the results. +// +// Note that to include the "related" messages, we need _two_ queries; the first +// one to get the initial matches (called the Leader-Query) and a Related-Query, to get +// the Leader matches + all messages that have a thread-id seen in the Leader +// matches. +// +// We use the MatchDecider to gather information and use it for both queries. + +struct MatchDecider: public Xapian::MatchDecider { + MatchDecider (QueryFlags qflags, DeciderInfo& info): + qflags_{qflags}, decider_info_{info} + {} + /** + * Update the match structure with unreadable/duplicate flags + * + * @param doc a Xapian document. + * + * @return a new QueryMatch object + */ + QueryMatch make_query_match (const Xapian::Document& doc) const { + + QueryMatch qm{}; + + auto msgid {opt_string(doc, MU_MSG_FIELD_ID_MSGID) + .value_or(*opt_string(doc, MU_MSG_FIELD_ID_PATH))}; + if (!decider_info_.message_ids.emplace(std::move(msgid)).second) + qm.flags |= QueryMatch::Flags::Duplicate; + + const auto path{opt_string(doc, MU_MSG_FIELD_ID_PATH)}; + if (!path || ::access(path->c_str(), R_OK) != 0) + qm.flags |= QueryMatch::Flags::Unreadable; + + return qm; + } + + bool should_include (const QueryMatch& qm) const { + + if (any_of(qflags_ & QueryFlags::SkipDuplicates) && + any_of(qm.flags & QueryMatch::Flags::Duplicate)) + return false; + + if (any_of(qflags_ & QueryFlags::SkipUnreadable) && + any_of(qm.flags & QueryMatch::Flags::Unreadable)) + return false; + + return true; + } + /** + * Gather thread ids from this match. + * + * @param doc the document (message) + * + */ + void gather_thread_ids(const Xapian::Document& doc) const { + auto thread_id{opt_string(doc, MU_MSG_FIELD_ID_THREAD_ID)}; + if (thread_id) + decider_info_.thread_ids.emplace(std::move(*thread_id)); + } + +protected: + const QueryFlags qflags_; + DeciderInfo& decider_info_; +private: + Option opt_string(const Xapian::Document& doc, MuMsgFieldId id) const noexcept try { + auto&& val{doc.get_value(id)}; + return val.empty() ? Nothing : Some(val); + } MU_XAPIAN_CATCH_BLOCK_RETURN (Nothing); +}; + +struct MatchDeciderLeader: public MatchDecider { + MatchDeciderLeader (QueryFlags qflags, DeciderInfo& info): + MatchDecider(qflags, info) + {} + /** + * operator() + * + * This receives the documents considered during a Xapian query, and + * is to return either true (keep) or false (ignore) + * + * We use this to potentiallly avoid certain messages (documents): + * - with QueryFlags::SkipUnreadable this will return false for message + * that are not readable in the file-system + * - with QueryFlags::SkipDuplicates this will return false for messages + * whose message-id was seen before. + * + * Even if we do not skip these messages entirely, we remember whether + * they were unreadabld/duplicate (in the QueryMatch::Flags), so we can + * quickly find that info when doing the second 'related' query. + * + * The "leader" query. Matches here get the Leader flag unless their + * duplicates / unreadable. We check the duplicate/readable status + * regardless of whether SkipDuplicates/SkipUnreadable was passed + * (to gather that information); however those flags + * affect our true/false verdict. + * + * @param doc xapian document + * + * @return true or false + */ + bool operator() (const Xapian::Document& doc) const override { + // by definition, we haven't seen the docid before, + // so no need to search + const auto it = decider_info_.matches.emplace(doc.get_docid(), + make_query_match(doc)); + if (should_include(it.first->second)) { + if (any_of(qflags_ & QueryFlags::GatherThreadIds)) + gather_thread_ids(doc); + return true; + } + + return false; + } +}; + + +std::unique_ptr +Mu::make_leader_decider (QueryFlags qflags, DeciderInfo& info) +{ + return std::make_unique(qflags, info); +} + +struct MatchDeciderRelated: public MatchDecider { + MatchDeciderRelated(QueryFlags qflags, DeciderInfo& info): + MatchDecider(qflags, info) {} + /** + * operator() + * + * This receives the documents considered during a Xapian query, and + * is to return either true (keep) or false (ignore) + * + * We use this to potentiallly avoid certain messages (documents): + * - with QueryFlags::SkipUnreadable this will return false for message + * that are not readable in the file-system + * - with QueryFlags::SkipDuplicates this will return false for messages + * whose message-id was seen before. + * + * Even if we do not skip these messages entirely, we remember whether + * they were unreadabld/duplicate (in the QueryMatch::Flags), so we can + * quickly find that info when doing the second 'related' query. + * + * The "leader" query. Matches here get the Leader flag unless their + * duplicates / unreadable. We check the duplicate/readable status + * regardless of whether SkipDuplicates/SkipUnreadable was passed + * (to gather that information); however those flags + * affect our true/false verdict. + * + * @param doc xapian document + * + * @return true or false + */ + bool operator() (const Xapian::Document& doc) const override { + // we may have seen this match in the "Leader" query. + auto it = decider_info_.matches.find(doc.get_docid()); + if (it != decider_info_.matches.end()) + return should_include(it->second); + else { // nope; create it. + const auto new_it = decider_info_.matches.emplace( + doc.get_docid(), make_query_match(doc)); + return should_include(new_it.first->second); + } + } +}; + + +std::unique_ptr +Mu::make_related_decider (QueryFlags qflags, DeciderInfo& info) +{ + return std::make_unique(qflags, info); +} + + +struct MatchDeciderFinal: public MatchDecider { + MatchDeciderFinal(QueryFlags qflags, DeciderInfo& info): + MatchDecider{qflags, info} {} + /** + * operator() + * + * This receives the documents considered during a Xapian query, and + * is to return either true (keep) or false (ignore) + * + * Only include documents that earlier checks have decided to include. + * + * @param doc xapian document + * + * @return true or false + */ + bool operator() (const Xapian::Document& doc) const override { + // we may have seen this match in the "Leader" query. + auto it = decider_info_.matches.find(doc.get_docid()); + if (G_UNLIKELY(it == decider_info_.matches.end())) { + g_warning ("could not find %u", doc.get_docid()); + return false; + } else + return should_include(it->second); + } +}; + + +std::unique_ptr +Mu::make_final_decider (QueryFlags qflags, DeciderInfo& info) +{ + return std::make_unique(qflags, info); +} diff --git a/lib/mu-query-match-deciders.hh b/lib/mu-query-match-deciders.hh new file mode 100644 index 00000000..f9f42f24 --- /dev/null +++ b/lib/mu-query-match-deciders.hh @@ -0,0 +1,85 @@ +/* +** Copyright (C) 2020 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + + +#ifndef MU_QUERY_MATCH_DECIDERS_HH__ +#define MU_QUERY_MATCH_DECIDERS_HH__ + +#include +#include +#include + +#include + +#include "mu-query-results.hh" + + +namespace Mu { +using StringSet = std::unordered_set; + + +struct DeciderInfo { + QueryMatches matches; + StringSet thread_ids; + StringSet message_ids; +}; + +/** + * Make a "leader" decider, that is, a MatchDecider for either a singular or the + * first query in the leader/related pair of queries. Gather information for + * threading, and the subsequent "related" query. +* + * @param qflags query flags + * @param match_info receives information about the matches. + * + * @return a unique_ptr to a match decider. + */ +std::unique_ptr make_leader_decider(QueryFlags qflags, + DeciderInfo& info); + + +/** + * Make a "related" decider, that is, a MatchDecider for the second query + * in the leader/related pair of queries. + * + * @param qflags query flags + * @param match_info receives information about the matches. + * + * @return a unique_ptr to a match decider. + */ +std::unique_ptr make_related_decider(QueryFlags qflags, + DeciderInfo& info); + + +/** + * Make a "final" decider, that is, a MatchDecider that removes all but + * the document excepts for the ones included earlier. + * + * @param qflags query flags + * @param match_info receives information about the matches. + * + * @return a unique_ptr to a match decider. + */ +std::unique_ptr make_final_decider (QueryFlags qflags, + DeciderInfo& info); + + +} // namepace Mu + +#endif /* MU_QUERY_MATCH_DECIDERS_HH__ */ diff --git a/lib/mu-query-matches.hh b/lib/mu-query-matches.hh new file mode 100644 index 00000000..88bc1ac8 --- /dev/null +++ b/lib/mu-query-matches.hh @@ -0,0 +1,206 @@ +/* +** Copyright (C) 2020 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#ifndef MU_QUERY_MATCHES_HH__ +#define MU_QUERY_MATCHES_HH__ + +#include +#include +#include + +#include +#include "mu-msg.h" + +namespace Mu { + + +struct QueryMatchInfo { + enum struct Flags { + Seen, + Preferred, + Unreadable, + Duplicate + }; + const std::string message_id; + QueryMatchFlags flags; +}; +MU_ENABLE_BITOPS(QueryMatchInfo::Flags); + +using MatchInfo = std::unordered_map; + +struct QueryResults { + enum struct Flags { + None, + Descending, + SkipUnreadable, + SkipDups, + DetermineThreads + }; + + QueryResults (const Xapian::MSet& mset, MatchInfo&& match_info, Flags flags): + mset_{mset}, match_info_(std::Move(match_info), flag_{flags} {} + bool empty() const { return mset_.empty(); } + size_t size() const { return mset_.size(); } + + QueryResultsIterator begin() const { return QueryResultsIterator(mset_.begin()); } + QueryResultsIterator end() const { return QueryResultsIterator(mset_.end()); } + +private: + const Xapian::MSet mset_; + const Flags flags_; + MatchInfo match_info_; +}; + +/// +/// This is a view over the Document MSet, which can optionally filter outlook +/// unreadable / duplicate messages. +/// +class QueryResultsIterator { +public: + using iterator_category = std::output_iterator_tag; + using value_type = MuMsg*; + using difference_type = void; + using pointer = void; + using reference = void; + + QueryResultsIterator(Xapian::MSetIterator it, size_t max_num, + MuMsgFieldId sort_field, MuMsgIterFlags flags, + MatchInfo& minfo): + it_{it}, match_info_{minfo} {} + + QueryResultsIterator& operator++() { return ++it_; return skip();} + QueryResultsIterator& operator++(int) { return it_++; return skip()} + + /** + * Get the Xapian document this iterator is pointing at, + * or an empty document when looking at end(). + * + * @return a document + */ + Xapian::Document document() const() { + g_return_val_if_fail(it_ != Xapian::MSetIterator::end(), {}); + return it_.get_document(); + } + + /** + * Get the doc-id for the document this iterator is pointing at, or 0 + * when looking at end. + * + * @return a doc-id. + */ + Xapian::docid doc_id() const { + g_return_val_if_fail(it_ != Xapian::MSetIterator::end(), 0); + return it_.docid(); + } + + /** + * Get the message-id for the document (message) this iterator is + * pointing at, or "" when looking at end. + * + * @return a message-id + */ + std::string message_id() const { + g_return_val_if_fail(it_ != Xapian::MSetIterator::end(), ""); + return document().get_value(MU_MSG_FIELD_ID_MSGID); + } + + /** + * Get the file-system path for the document (message) this iterator is + * pointing at, or "" when looking at end. + * + * @return a filesystem path + */ + std::string path() const { + g_return_val_if_fail(it_ != Xapian::MSetIterator::end(), ""); + return document().get_value(MU_MSG_FIELD_ID_PATH); + } + + /** + * Get the references for the document (messages) this is iterator is + * pointing at, or empty if pointing at end of if no references are + * available. + * + * @return references + */ + std::vector references() const { + g_return_val_if_fail(it_ != Xapian::MSetIterator::end(), {}); + return split(document().get_value(MU_MSG_FIELD_ID_REFS), ","); + } + +private: + /** + * Filter out some documents + * + * @param forward whether to skip forward when a document is filtered + * out. + * + * @return the first iterator that is not filtered out, or the end + * iterator. + */ + QueryResultsIterator& maybe_skip(bool forward=true) { + + if (it_ = MSetIterator::end()) + return *this; // nothing to do. + + // Find or create MatchInfo + const auto msgid{message_id()}; + auto mi=[&] { + // seen before? + auto m{match_info_.find(docid)}; + if (m != match_info_.end()) + return m; + // nope; create. + QueryMatchInfo minfo { message_id() }; + // not seen before; check. + if (any_of(flags_ & SkipDups) && + match_info_.count(message_id())) + minfo.flags |= Flags::Duplicate; // it's a duplicate + + if (any_of(flags_ & SkipUnreadable) && + ::access(path().c_str(), R_OK) != 0) + minfo.flags |= Flags::Unreadable; + + return match_info_.emplace_back(std::move(minfo)); + }(); + + // note: SkipDups / SkipUnreadable are not set if + // if we're not checking for those. + + if (any_of(mi->second.flags_ & SkipDups) || + any_of(mi->second.flags_ & SkipUnreadable)) { + if (forward) + ++it_; + else + --it_; + + return maybe_skip(); + } + + return *this; + } + + Xapian::MSetIterator it_; + MatchInfo& match_info_; +}; + + +}; // namespace Mu + + +#endif /* MU_QUERY_MATCHES_HH__ */ diff --git a/lib/mu-query-results.hh b/lib/mu-query-results.hh new file mode 100644 index 00000000..9d89a8c8 --- /dev/null +++ b/lib/mu-query-results.hh @@ -0,0 +1,381 @@ +/* +** Copyright (C) 2020 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#ifndef MU_QUERY_RESULTS_HH__ +#define MU_QUERY_RESULTS_HH__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "mu-msg.hh" + +namespace Mu { + +/** + * This implements a QueryResults structure, which capture the results of a + * Xapian query, and a QueryResultsIterator, which gives C++-compliant iterator + * to go over the results. and finally QueryThreader (in query-threader.cc) which + * calculates the threads, using the JWZ algorithm. + */ + +/// Flags that influence now matches are presented (or skipped) +enum struct QueryFlags { + None = 0, /**< no flags */ + Descending = 1 << 0, /**< sort z->a */ + SkipUnreadable = 1 << 1, /**< skip unreadable msgs */ + SkipDuplicates = 1 << 2, /**< skip duplicate msgs */ + IncludeRelated = 1 << 3, /**< include related msgs */ + Threading = 1 << 4, /**< calculate threading info */ + // internal + Leader = 1 << 5, /**< This is the leader query (for internal use + * only)*/ + GatherThreadIds = 1 << 6, /**< Gather thread info */ +}; +MU_ENABLE_BITOPS(QueryFlags); + + +/// Register some information about a match (i.e., message) that we can use for +/// subsequent queries. +using ThreadPathVec=std::vector; +inline std::string +to_string (const ThreadPathVec& tpath, size_t digits) +{ + std::string str; + str.reserve(tpath.size() * digits); + + bool first{true}; + for (auto&& segm: tpath) { + str += format("%s%0*x", first ? "" : ":", (int)digits, segm); + first = false; + } + + return str; +} + +/// Stores all the essential information for sorting the results. +struct QueryMatch { + /// Flags for a match (message) found + enum struct Flags { + None = 0, /**< No Flags */ + Leader = 1 << 0, /**< Mark direct matches as leader */ + Related = 1 << 1, /**< A related message */ + Unreadable = 1 << 2, /**< No readable file */ + Duplicate = 1 << 3, /**< Message-id seen before */ + Root = 1 << 10, /**< Is this the thread-root? */ + First = 1 << 11, /**< Is this the first message in a thread? */ + Last = 1 << 12, /**< Is this the last message in a thread? */ + Orphan = 1 << 13, /**< Is this message without a parent? */ + HasChild = 1 << 14 /**< Does this message have a child? */ + }; + + + Flags flags{Flags::None}; /**< Flags */ + std::string sort_key; /**< The main sort-key (for the root level) */ + std::string date_key; /**< The date-key (for sorting all sub-root levels) */ + size_t thread_level{}; /**< The thread level */ + std::string thread_path; /**< The hex-numerial path in the thread, ie. '00:01:0a' */ + + bool operator<(const QueryMatch& rhs) const { + return date_key < rhs.date_key; + } + +}; + +MU_ENABLE_BITOPS(QueryMatch::Flags); + +inline std::ostream& +operator<<(std::ostream& os, QueryMatch::Flags mflags) +{ + if (mflags == QueryMatch::Flags::None) { + os << ""; + return os; + } + + if (any_of(mflags & QueryMatch::Flags::Leader)) + os << "leader "; + if (any_of(mflags & QueryMatch::Flags::Unreadable)) + os << "unreadable "; + if (any_of(mflags & QueryMatch::Flags::Duplicate)) + os << "dup "; + + if (any_of(mflags & QueryMatch::Flags::Root)) + os << "root "; + if (any_of(mflags & QueryMatch::Flags::Related)) + os << "related "; + if (any_of(mflags & QueryMatch::Flags::First)) + os << "first "; + if (any_of(mflags & QueryMatch::Flags::Last)) + os << "last "; + if (any_of(mflags & QueryMatch::Flags::Orphan)) + os << "orphan "; + if (any_of(mflags & QueryMatch::Flags::HasChild)) + os << "has-child "; + + return os; +} + + +using QueryMatches = std::unordered_map; + +inline std::ostream& +operator<<(std::ostream& os, const QueryMatch& qmatch) +{ + os << "qm:[" << qmatch.thread_path << "] (" << qmatch.thread_level << "): " + << "sort-key:<" << qmatch.sort_key << "> date:<" << qmatch.date_key << "> " + << "flags:{" << qmatch.flags << "}"; + + return os; +} + +/// +/// This is a view over the Xapian::MSet, which can optionally filter unreadable +/// / duplicate messages. +/// +/// Note, we internally skip unreadable/duplicate messages (when asked too); those +/// skipped ones do _not_ count towards the max_size +/// +class QueryResultsIterator { +public: + using iterator_category = std::output_iterator_tag; + using value_type = MuMsg*; + using difference_type = void; + using pointer = void; + using reference = void; + + QueryResultsIterator(Xapian::MSetIterator mset_it, QueryMatches& query_matches): + mset_it_{mset_it}, query_matches_{query_matches} + {} + ~QueryResultsIterator() { g_clear_pointer (&msg_, mu_msg_unref); } + + + /** + * Increment the iterator (we don't support post-increment) + * + * @return an updated iterator, or end() if we were already at end() + */ + QueryResultsIterator& operator++() { ++mset_it_; return *this; } + + /** + * (Non)Equivalence operators + * + * @param rhs some other iterator + * + * @return true or false + */ + bool operator==(const QueryResultsIterator& rhs) const { return mset_it_ == rhs.mset_it_; } + bool operator!=(const QueryResultsIterator& rhs) const { return mset_it_ != rhs.mset_it_; } + + QueryResultsIterator& operator*() { return *this; } + const QueryResultsIterator& operator*() const { return *this; } + + /** + * Get the Xapian document this iterator is pointing at, + * or an empty document when looking at end(). + * + * @return a document + */ + Xapian::Document document() const { return mset_it_.get_document(); } + + /** + * Get the doc-id for the document this iterator is pointing at, or 0 + * when looking at end. + * + * @return a doc-id. + */ + Xapian::docid doc_id() const { return *mset_it_; } + + /** + * Get the message-id for the document (message) this iterator is + * pointing at, or not when not available + * + * @return a message-id + */ + Option message_id() const noexcept { return opt_string(MU_MSG_FIELD_ID_MSGID); } + + /** + * Get the thread-id for the document (message) this iterator is + * pointing at, or "" when looking at end. + * + * @return a message-id + */ + Option thread_id() const noexcept { return opt_string(MU_MSG_FIELD_ID_THREAD_ID); } + + /** + * Get the file-system path for the document (message) this iterator is + * pointing at, or "" when looking at end. + * + * @return a filesystem path + */ + Option path() const noexcept { return opt_string(MU_MSG_FIELD_ID_PATH); } + + /** + * Get the references for the document (messages) this is iterator is + * pointing at, or empty if pointing at end of if no references are + * available. + * + * @return references + */ + std::vector references() const noexcept { + return split(document().get_value(MU_MSG_FIELD_ID_REFS), ","); + } + + /** + * Get some value from the document, or Nothing if empty. + * + * @param id a message field id + * + * @return the value + */ + Option opt_string(MuMsgFieldId id) const noexcept try { + auto&& val{document().get_value(id)}; + return val.empty() ? Nothing : Some(val); + } MU_XAPIAN_CATCH_BLOCK_RETURN (Nothing); + + /** + * Get the Query match info for this message. + * + * @return the match info. + */ + QueryMatch& query_match() { + g_assert(query_matches_.find(document().get_docid()) != query_matches_.end()); + return query_matches_.find(document().get_docid())->second; + } + const QueryMatch& query_match() const { + g_assert(query_matches_.find(document().get_docid()) != query_matches_.end()); + return query_matches_.find(document().get_docid())->second; + } + + /** + * get the corresponding MuMsg for this iter; this instance is owned by + * @this, and becomes invalid when iterating to the next, or @this is +k * destroyed.; it's a 'floating' reference. + * + * @return a MuMsg* or NUL in case of error + */ + MuMsg* floating_msg () + G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT try { + auto docp{reinterpret_cast( + new Xapian::Document(document()))}; + GError *err{}; + g_clear_pointer(&msg_, mu_msg_unref); + if (!(msg_ = mu_msg_new_from_doc(docp, &err))) { + delete docp; + g_warning ("failed to crate message for %s: %s", + path().value_or("").c_str(), + err ? err->message : "somethng went wrong"); + g_clear_error(&err); + } + + return msg_; + + } MU_XAPIAN_CATCH_BLOCK_RETURN (NULL); +private: + Xapian::MSetIterator mset_it_; + QueryMatches& query_matches_; + MuMsg *msg_{}; +}; + +constexpr auto MaxQueryResultsSize = std::numeric_limits::max(); + +class QueryResults { +public: + /// Helper types + using iterator = QueryResultsIterator; + using const_iterator = const iterator; + + /** + * Construct a QueryResults object + * + * @param mset an Xapian::MSet with matches + */ + QueryResults (const Xapian::MSet& mset, QueryMatches&& query_matches): + mset_{mset}, + query_matches_{std::move(query_matches)} + {} + /** + * Is this QueryResults object empty (ie., no matches)? + * + * @return true are false + */ + bool empty() const { return mset_.empty(); } + + /** + * Get the number of matches in this QueryResult + * + * @return number of matches + */ + size_t size() const { return mset_.size(); } + + /** + * Get the begin iterator to the results. + * + * @return iterator + */ + iterator begin() { + return QueryResultsIterator(mset_.begin(), query_matches_); + } + const iterator begin() const { + return QueryResultsIterator(mset_.begin(), query_matches_); + } + + /** + * Get the end iterator to the results. + * + * @return iterator + */ + iterator end() { + return QueryResultsIterator(mset_.end(), query_matches_); + } + const_iterator end() const { + return QueryResultsIterator(mset_.end(), query_matches_); + } + + /** + * Get the query-matches for these QueryResults. The non-const + * version can be use to _steal_ the query results, by moving + * them. + * + * @return query-matches + */ + const QueryMatches& query_matches() const { return query_matches_; } + QueryMatches& query_matches() { return query_matches_; } + +private: + const Xapian::MSet mset_; + mutable QueryMatches query_matches_; +}; + +} // namespace Mu + + +#endif /* MU_QUERY_RESULTS_HH__ */ diff --git a/lib/mu-query-threads.cc b/lib/mu-query-threads.cc new file mode 100644 index 00000000..87a4497f --- /dev/null +++ b/lib/mu-query-threads.cc @@ -0,0 +1,729 @@ +/* +** Copyright (C) 2021 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#include "mu-query-threads.hh" + +#include +#include +#include +#include +#include + +#include + +using namespace Mu; + +struct Container { + using children_type = std::set; + + Container(): children{&compare} {} + Container(Option msg): query_match{msg}, children{&compare} {} + Container(const Container&) = delete; + Container(Container&&) = delete; + + void set_parent (Container* new_parent) { + assert(this != new_parent); + assert(!new_parent->is_reachable(this)); + if (new_parent == parent) + return; + if (parent) + parent->remove_child(*this); + if (new_parent) + new_parent->add_child(*this); + else + parent = new_parent; + assert(this->parent != this); + } + + void add_child (Container& new_child) { + assert(!new_child.parent); + new_child.parent = this; + children.emplace(&new_child); + } + + void promote_children () { + for_each_child([&](auto&& child){ + child->parent = {}; + if (parent) + parent->add_child(*child); + }); + children.clear(); + if (parent) + parent->remove_child(*this); + is_nuked = true; + assert(!parent); + assert(children.empty()); + } + void remove_child (Container& child) { + assert(has_child(child)); + child.parent = {}; + children.erase(&child); + assert(!has_child(child)); + } + + bool has_child (Container& child) const { + return children.find(&child) != children.cend(); + } + + bool is_reachable(Container* other) const { + return ur_parent() == other->ur_parent(); + } + + void borrow_query_match (Container& other) { + assert(!query_match); + assert(other.query_match); + query_match = other.query_match; + is_borrowed_query_match = true; + if (parent) { // and renew (for sorting) + auto p{parent}; + parent->remove_child(*this); + p->add_child(*this); + assert(parent->has_child(*this)); + } + } + + template void for_each_child (Func&& func) { + auto it{children.begin()}; + while (it != children.end()) { + auto next = std::next(it); + func(*it); + it = next; + } + } + + bool is_empty() const { + return !query_match || is_borrowed_query_match; + } + + Option query_match; + bool is_borrowed_query_match{}; + bool is_nuked{}; + + Container* parent{}; + children_type children; + +private: + const Container* ur_parent() const { + assert(this->parent != this); + return parent ? parent->ur_parent() : this; + } + + static bool compare(const Container *c1, const Container *c2) { + if (c1->query_match && c2->query_match) { + const auto cmp{std::strcmp(c1->query_match->date_key.c_str(), + c2->query_match->date_key.c_str())}; + if (cmp != 0) + return cmp < 0; + } + return c1 < c2; + } +}; + +static std::ostream& +operator<<(std::ostream& os, const Container& container) +{ + os << "container: " << std::right << std::setw(10) << &container + << ": parent: " << std::right << std::setw(10) << container.parent + << "\n children: "; + + for (auto&& c: container.children) + os << std::right << std::setw(10) << c << " "; + + os << (container.is_nuked ? " nuked" : "") + << (container.is_borrowed_query_match ? " borrowed" : ""); + + if (container.query_match) + os << "\n " << container.query_match.value(); + + return os; +} + + +using IdTable = std::unordered_map; + +template +static IdTable +determine_id_table (QueryResultsType& qres, MuMsgFieldId sortfield_id) +{ + // 1. For each query_match + IdTable id_table; + for (auto&& mi: qres) { + const auto msgid{mi.message_id().value_or(*mi.path())}; + // 1.A If id_table contains an empty Container for this ID: + // Store this query_match (query_match) in the Container's query_match (value) slot. + auto c_it = id_table.find(msgid); + if (c_it != id_table.end()) { + if (!c_it->second.query_match) { + c_it->second.query_match = mi.query_match(); + c_it->second.query_match->thread_path = "x"; + } else { + /* special case, not in the JWZ algorithm: the container + * exists already and has a query_match (query-match); this + * means that we are seeing *another query_match* with a + * query_match-id we already saw... create this query_match, and + * mark it as a duplicate; use its path as the fake + * query_match-id */ + c_it = id_table.emplace(*mi.path(), mi.query_match()).first; + c_it->second.query_match->flags |= QueryMatch::Flags::Duplicate; + c_it->second.query_match->thread_path = "c"; + + } + } else { // Else: + // Create a new Container object holding this query_match (query-match); + // Index the Container by Query_Match-ID + c_it = id_table.emplace(msgid, mi.query_match()).first; + c_it->second.query_match->thread_path = "y"; + } + + Container& container{c_it->second}; + // We sort by date (ascending), *except* for the root; we don't + // know what query_matchs will be at the root level yet, so remember + // both. Moreover, even when sorting the top-level in descending + // order, still sort the thread levels below that in ascending + // order. + if (sortfield_id != MU_MSG_FIELD_ID_NONE) + container.query_match->sort_key = mi.opt_string(sortfield_id).value_or(""); + container.query_match->date_key = mi.opt_string(MU_MSG_FIELD_ID_DATE).value_or(""); + + // 1.B + // For each element in the query_match's References field: + Container* parent_ref_container{}; + for (const auto& ref: mi.references()) { + // grand_-parent -> grand_-parent -> ... -> parent. + + // Find a Container object for the given Query_Match-ID; If it exists, use it; + // otherwise make one with a null Query_Match. + auto ref_container = [&]()->Container* { + auto ref_it = id_table.find(ref); + if (ref_it == id_table.end()) + ref_it = id_table.emplace(ref,Nothing).first; + return &ref_it->second; + }(); + + // Link the References field's Containers together in the order implied + // by the References header. + // * If they are already linked, don't change the existing links. + // + // * Do not add a link if adding that link would introduce a loop: that is, + // before asserting A->B, search down the children of B to see if A is + // reachable, and also search down the children of A to see if B is + // reachable. If either is already reachable as a child of the other, + // don't add the link. + if (parent_ref_container && !ref_container->parent && + !parent_ref_container->is_reachable(ref_container)) + parent_ref_container->add_child(*ref_container); + + parent_ref_container = ref_container; + } + + // Add the query_match to the chain. + if (parent_ref_container && !container.parent && + !parent_ref_container->is_reachable(&container)) { + parent_ref_container->add_child(container); + } + } + + return id_table; +} + +/// Recursively walk all containers under the root set. +/// For each container: +/// +/// If it is an empty container with no children, nuke it. +/// +/// Note: Normally such containers won't occur, but they can show up when two +/// query_matchs have References lines that disagree. For example, assuming A and +/// B are query_matchs, and 1, 2, and 3 are references for query_matchs we haven't +/// seen: +/// +/// A has references: 1, 2, 3 +/// B has references: 1, 3 +/// +/// There is ambiguity as to whether 3 is a child of 1 or of 2. So, +/// depending on the processing order, we might end up with either +/// +/// -- 1 +/// |-- 2 +/// \-- 3 +/// |-- A +/// \-- B +/// +/// or +/// +/// -- 1 +/// |-- 2 <--- non root childless container! +/// \-- 3 +/// |-- A +/// \-- B +/// +/// If the Container has no Query_Match, but does have children, remove this +/// container but promote its children to this level (that is, splice them in +/// to the current child list.) +/// +/// Do not promote the children if doing so would promote them to the root +/// set -- unless there is only one child, in which case, do. + + + +static void +prune_empty_containers (Container& container) +{ + container.for_each_child([](auto&& child){prune_empty_containers(*child);}); + + // Never nuke these. + if (!container.is_empty()) + return; + + if (container.children.empty()) { + // If it is an empty container with no children, nuke it. + if (container.parent) + container.parent->remove_child(container); + container.is_nuked = true; + return; + } + // If the Container is empty, but does have children, remove this + // container but promote its children to this level (that is, splice them in + // to the current child list.) + // + // Do not promote the children if doing so would promote them to the root + // set -- unless there is only one child, in which case, do. + //const auto rootset_child{!container.parent->parent}; + if (container.parent || container.children.size() == 1) { + container.promote_children(); + container.is_nuked = true; + } else if (!container.children.empty()){ + // so an empty container with children. Copy the query info of the first + // child, for sorting -- so the sort key "bubbles up". Renew + // it so the sorting workes out. + auto& first_child{*container.children.begin()}; + container.borrow_query_match(*first_child); + } +} + + +static void +prune_empty_containers (IdTable& id_table) +{ + for (auto&& item: id_table) { + if (!item.second.parent) + prune_empty_containers(item.second); + } +} + + +/// Sorting. +/// +/// We start the sorting from the rout-vec, ie. the set of of parentless conainers. +/// +/// We need to sort the rootset by whatever the sortkey is (subject, date, ...); however under the +/// rotset we stricly sort in ascending order by date. Containers with empty query_matchs have the +/// sort key from the first of their children (recursively). +// +// Note, children are already stored in a (sorted) std::set, based on their date. That's correct for +// all but the top-level (root) containers; so, we just need fix those. +// + +// the root_vec is the sorted vec of top-level (parent-less) containers. +using RootVec = std::vector; +static RootVec +determine_root_vec(IdTable& id_table, bool descending) +{ + RootVec root_vec; + + for (auto&& item: id_table) { + Container* c{&item.second}; + if (!c || !c->query_match || c->parent || c->is_nuked) + continue; + root_vec.emplace_back(c); + } + + std::sort(root_vec.begin(), root_vec.end(), + [&](Container*& c1, Container*& c2)->bool { +#ifdef BUILD_TESTS + if (descending) + return c2->query_match->sort_key < c1->query_match->sort_key; + else + return c1->query_match->sort_key < c2->query_match->sort_key; +#else + // the non-testing case, the "descending" part is handled + // in the "decider" + return c1->query_match->sort_key < c2->query_match->sort_key; +#endif /*BUILD_TESTS*/ + }); + + return root_vec; +} + +static bool +update_container_query_match (Container& container, ThreadPathVec& pvec, + size_t segment_size, bool descending) +{ + if (container.is_empty()) + return false; // nothing to update. + + auto& qmatch{*container.query_match}; + + if (!container.parent) + qmatch.flags |= QueryMatch::Flags::Root; + else if (container.parent->is_empty()) + qmatch.flags |= QueryMatch::Flags::Orphan; + + if (!container.children.empty()) + qmatch.flags |= QueryMatch::Flags::HasChild; + + if (descending && container.parent) { + // trick xapian by giving it "inverse" sorting key so our + // ascending-date sorted threads stay in that order + pvec.back() = ((1U << (4 * segment_size)) - 1) - pvec.back(); + } + + qmatch.thread_path = to_string(pvec, segment_size); + qmatch.thread_level = pvec.size() - 1; + + // ensure thread root comes before its children + if (descending) + qmatch.thread_path += ":z"; + + return true; +} + +static void +sort_siblings (Container::children_type& siblings, + const ThreadPathVec& parent_path_vec, + size_t segment_size, bool descending) +{ + if (siblings.empty()) + return; + else { + const auto first{*siblings.begin()}; + if (first->query_match) + first->query_match->flags |= QueryMatch::Flags::First; + const auto last{*(--siblings.end())}; + if (last->query_match) + last->query_match->flags |= QueryMatch::Flags::Last; + } + + size_t idx{0}; + ThreadPathVec thread_path_vec{parent_path_vec}; + + for (auto&& c: siblings) { + thread_path_vec.emplace_back(idx++); + update_container_query_match (*c, thread_path_vec, segment_size, descending); + if (!c->children.empty()) + sort_siblings (c->children, thread_path_vec, + segment_size, descending); + thread_path_vec.pop_back(); + } +} + + +static void +sort_siblings (IdTable& id_table, bool descending) +{ + if (id_table.empty()) + return; + + auto root_vec{determine_root_vec(id_table, descending)}; // sorted + + //std::cerr << "rvs" << root_vec.size() << "\n"; + + const auto seg_size = static_cast( + std::ceil(std::log2(id_table.size())/4.0)); + /*note: 4 == std::log2(16)*/ + + ThreadPathVec path_vec; + auto idx{0U}; + + for (auto&& c: root_vec) { + path_vec.emplace_back(idx++); + update_container_query_match (*c, path_vec, seg_size, descending); + sort_siblings (c->children, path_vec, seg_size, descending); + path_vec.pop_back(); + } +} + +static std::ostream& +operator<<(std::ostream& os, const IdTable& id_table) +{ + std::set ids; + for (auto&& item: id_table) { + if (item.second.query_match) + ids.emplace(item.second.query_match->thread_path); + } + + for (auto&& id: ids) { + auto it = std::find_if(id_table.begin(), id_table.end(), [&](auto&& item) { + return item.second.query_match && item.second.query_match->thread_path == id; + }); + assert(it != id_table.end()); + os << it->first << ": " << it->second << '\n'; + } + return os; +} + + +template static void +calculate_threads_real (Results& qres, MuMsgFieldId sort_field, + bool descending) +{ + // Step 1: build the id_table + auto id_table{determine_id_table(qres, sort_field)}; + + // // Step 2: get the root set + // // Step 3: discard id_table + // Nope: id-table owns the containers. + // Step 4: prune empty containers + prune_empty_containers(id_table); + + // Step 5: group root-set by subject. + // Not implemented. + + // Step 6: we're done threading + + // Step 7: sort siblings. The segment-size is the number of hex-digits + // in the thread-path string (so we can lexically compare them.) + sort_siblings(id_table, descending); + + if (g_test_verbose()) + std::cout << "*** id-table:\n" << id_table << "\n"; +} + +void +Mu::calculate_threads (Mu::QueryResults& qres, MuMsgFieldId sort_field, + bool descending) +{ + calculate_threads_real(qres, sort_field, descending); +} + +#ifdef BUILD_TESTS + +struct MockQueryResult { + MockQueryResult(const std::string& message_id_arg, + const std::string& sort_key_arg, + const std::string& date_key_arg, + const std::vector& refs_arg={}): + message_id_{message_id_arg}, + sort_key_{sort_key_arg}, + date_key_{date_key_arg}, + refs_{refs_arg} + {} + MockQueryResult(const std::string& message_id_arg, + const std::vector& refs_arg={}): + MockQueryResult(message_id_arg, "", "", refs_arg) {} + Option message_id() const { return message_id_;} + Option path() const { return path_;} + QueryMatch& query_match() { return query_match_;} + const QueryMatch& query_match() const { return query_match_;} + const std::vector& references() const { return refs_;} + + Option opt_string(MuMsgFieldId id) const { + if (id == MU_MSG_FIELD_ID_DATE) + return date_key_; + else + return sort_key_; + } + Option path_{"/"}; + std::string message_id_; + QueryMatch query_match_{}; + std::string sort_key_; + std::string date_key_; + std::vector refs_; +}; + +using MockQueryResults = std::vector; + + +G_GNUC_UNUSED static std::ostream& +operator<<(std::ostream& os, const MockQueryResults& qrs) +{ + for (auto&& mi: qrs) + os << mi.query_match().thread_path << " :: " + << mi.message_id().value_or("") << std::endl; + + return os; +} + +static void +calculate_threads (MockQueryResults& qres, MuMsgFieldId sort_field, + bool descending) +{ + calculate_threads_real(qres, sort_field, descending); +} + +using Expected = std::vector>; + + +static void +assert_thread_paths (MockQueryResults& qrs, const Expected& expected) +{ + for (auto&& exp: expected) { + auto it = std::find_if(qrs.begin(), qrs.end(), [&](auto&& qr){ + return qr.message_id().value_or("") == exp.first; + }); + g_assert_true (it != qrs.end()); + g_assert_cmpstr(exp.second.c_str(), ==, it->query_match().thread_path.c_str()); + } +} + +static void +test_basic() +{ + auto results = MockQueryResults { + MockQueryResult{ "m1", "a", "1", {"m2"} }, + MockQueryResult{ "m2", "b", "2", {"m3"} }, + MockQueryResult{ "m3", "c", "3", {}}, + MockQueryResult{ "m4", "d", "4", {}} + }; + + calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false); + + assert_thread_paths (results, { + { "m1", "0:0:0"}, + { "m2", "0:0" }, + { "m3", "0" }, + { "m4", "1" } + }); + + calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, true); + + assert_thread_paths (results, { + { "m1", "1:f:f:z"}, + { "m2", "1:f:z" }, + { "m3", "1:z" }, + { "m4", "0:z" } + }); +} + + +static void +test_prune_empty_containers() +{ + { + // m7 should not be nuked + auto results = MockQueryResults { + MockQueryResult{ "x1", "a", "1", {"m7"} }, + MockQueryResult{ "x2", "b", "2", {"m7"} }, + }; + + calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false); + + assert_thread_paths (results, { + { "x1", "0:0"}, + { "x2", "0:1" }, + }); + } + + { + // m7 should be nuked + + auto results = MockQueryResults { + MockQueryResult{ "m1", "a", "1", {"m7"} }, + }; + + calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false); + + assert_thread_paths (results, { + { "m1", "0"}, + }); + } + + { + // m6 should be nuked + + auto results = MockQueryResults { + MockQueryResult{ "m1", "a", "1", {"m7", "m6"} }, + MockQueryResult{ "m2", "b", "2", {"m7", "m6"} }, + }; + + calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false); + + assert_thread_paths (results, { + { "m1", "0:0"}, + { "m2", "0:1" }, + }); + } + + + { + // m6 should be nuked + + auto results = MockQueryResults { + MockQueryResult{ "m1", + "a", "1", + {"m28uszf59m.fsf@damtp.cam.ac.uk", + "CAP8THHWFDR9fJynKJHiRLayBo8wNiOCK6ghbgOK6rHboQKjDqA@mail.gmail.com", + "m2lhwxevpt.fsf@damtp.cam.ac.uk"} }, + MockQueryResult{ "m2", + "b", "2", + {"m28uszf59m.fsf@damtp.cam.ac.uk", + "CAP8THHWFDR9fJynKJHiRLayBo8wNiOCK6ghbgOK6rHboQKjDqA@mail.gmail.com", + "m2lhwxevpt.fsf@damtp.cam.ac.uk"} }, + }; + + calculate_threads(results, MU_MSG_FIELD_ID_DATE, false); + + assert_thread_paths (results, { + { "m1", "0:0"}, + { "m2", "0:1" }, + }); + } +} + +static void +test_id_table_inconsistent() +{ + auto results = MockQueryResults { + MockQueryResult{ "m1", "a", "1", {"m2"} }, + MockQueryResult{ "m2", "b", "2", {"m1"} }, + MockQueryResult{ "m3", "c", "3", {"m3"} }, // self ref + MockQueryResult{ "m4", "d", "4", {"m3", "m5"} }, + MockQueryResult{ "m5", "e", "5", {"m4", "m4"} }, // dup parent + }; + + calculate_threads(results, MU_MSG_FIELD_ID_DATE, false); + + assert_thread_paths (results, { + { "m2", "0"}, + { "m1", "0:0" }, + { "m3", "1"}, + { "m5", "1:0" }, + { "m4", "1:0:0"}, + }); +} + +int +main (int argc, char *argv[]) try +{ + g_test_init (&argc, &argv, NULL); + + g_test_add_func ("/threader/basic", test_basic); + g_test_add_func ("/threader/prune-empty-containers", test_prune_empty_containers); + g_test_add_func ("/threader/id-table-inconsistent", test_id_table_inconsistent); + + return g_test_run (); + +} catch (const std::runtime_error& re) { + std::cerr << re.what() << "\n"; + return 1; +} catch (...) { + std::cerr << "caught exception\n"; + return 1; +} + +#endif /*BUILD_TESTS*/ diff --git a/lib/mu-query-threads.hh b/lib/mu-query-threads.hh new file mode 100644 index 00000000..b15a7902 --- /dev/null +++ b/lib/mu-query-threads.hh @@ -0,0 +1,44 @@ +/* +** Copyright (C) 2021 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ + +#ifndef MU_QUERY_THREADS__ +#define MU_QUERY_THREADS__ + +#include "mu-query-results.hh" + +namespace Mu { +/** + * Calculate the threads for these query results; that is, determine the + * thread-paths for each message, so we can let Xapian order them in the correct + * order. + * + * Note - threads are can be order by an arbitrary field for the top level, but + * the messages below the top level are always sorted in chronologically + * ascending orde + * + * @param qres query results + * @param sort_field the field to sort the top-level by + * @param descending whether to sort the top-level in descending order + */ +void calculate_threads (QueryResults& qres, MuMsgFieldId sort_field, + bool descending); + +} // namespace Mu + +#endif /*MU_QUERY_THREADS__*/ diff --git a/lib/mu-query.cc b/lib/mu-query.cc index 55ce1538..c96a0264 100644 --- a/lib/mu-query.cc +++ b/lib/mu-query.cc @@ -23,19 +23,16 @@ #include #include #include +#include #include #include #include #include "mu-msg-fields.h" - -#include "mu-msg-iter.h" - -#include "utils/mu-str.h" -#include "utils/mu-date.h" -#include - +#include "mu-query-results.hh" +#include "mu-query-match-deciders.hh" +#include "mu-query-threads.hh" #include using namespace Mu; @@ -43,186 +40,29 @@ using namespace Mu; struct Query::Private { Private(const Store& store): store_{store}, parser_{store_} {} + // New + //bool calculate_threads (Xapian::Enquire& enq, size maxnum); - Xapian::Query make_query (const std::string& expr, GError **err) const; - Xapian::Enquire make_enquire (const std::string& expr, MuMsgFieldId sortfieldid, - bool descending, GError **err) const; - GHashTable* find_thread_ids (MuMsgIter *iter, GHashTable **orig_set) const; + Xapian::Enquire make_enquire (const std::string& expr, + MuMsgFieldId sortfieldid, QueryFlags qflags) const; + Xapian::Enquire make_related_enquire (const Xapian::Query& first_q, + const StringSet& thread_ids, + MuMsgFieldId sortfieldid, QueryFlags qflags) const; - Xapian::Query make_related_query (MuMsgIter *iter, GHashTable **orig_set) const; - - void find_related_messages (MuMsgIter **iter, int maxnum, - MuMsgFieldId sortfieldid, Query::Flags flags, - Xapian::Query orig_query) const; + Option run_threaded (QueryResults &qres, Xapian::Enquire& enq, + MuMsgFieldId sortfieldid, + QueryFlags qflags, size_t maxnum) const; + Option run_singular (const std::string& expr, MuMsgFieldId sortfieldid, + QueryFlags qflags, size_t maxnum) const; + Option run_related (const std::string& expr, MuMsgFieldId sortfieldid, + QueryFlags qflags, size_t maxnum) const; + Option run (const std::string& expr, MuMsgFieldId sortfieldid, + QueryFlags qflags, size_t maxnum) const; const Store& store_; const Parser parser_; }; - -static constexpr MuMsgIterFlags -msg_iter_flags (Query::Flags flags) -{ - MuMsgIterFlags iflags{MU_MSG_ITER_FLAG_NONE}; - - if (any_of(flags & Query::Flags::Descending)) - iflags |= MU_MSG_ITER_FLAG_DESCENDING; - if (any_of(flags & Query::Flags::SkipUnreadable)) - iflags |= MU_MSG_ITER_FLAG_SKIP_UNREADABLE; - if (any_of(flags & Query::Flags::SkipDups)) - iflags |= MU_MSG_ITER_FLAG_SKIP_DUPS; - if (any_of(flags & Query::Flags::Threading)) - iflags |= MU_MSG_ITER_FLAG_THREADS; - - return iflags; -} - -Xapian::Query -Query::Private::make_query (const std::string& expr, GError **err) const try { - - Mu::WarningVec warns; - const auto tree{parser_.parse(expr, warns)}; - for (auto&& w: warns) - g_warning ("query warning: %s", to_string(w).c_str()); - - return Mu::xapian_query (tree); - -} catch (...) { - mu_util_g_set_error (err, MU_ERROR_XAPIAN_QUERY, - "parse error in query"); - throw; -} - - -Xapian::Enquire -Query::Private::make_enquire (const std::string& expr, MuMsgFieldId sortfieldid, - bool descending, GError **err) const -{ - Xapian::Enquire enq{store_.database()}; - - try { - if (!expr.empty() && expr != R"("")") - enq.set_query(make_query (expr, err)); - else/* empty or "" means "matchall" */ - enq.set_query(Xapian::Query::MatchAll); - } catch (...) { - mu_util_g_set_error (err, MU_ERROR_XAPIAN_QUERY, "parse error in query"); - throw; - } - - enq.set_cutoff(0,0); - - return enq; -} - -/* - * record all thread-ids for the messages; also 'orig_set' receives all - * original matches (a map msgid-->docid), so we can make sure the - * originals are not seen as 'duplicates' later (when skipping - * duplicates). We want to favor the originals over the related - * messages, when skipping duplicates. - */ -GHashTable* -Query::Private::find_thread_ids (MuMsgIter *iter, GHashTable **orig_set) const -{ - GHashTable *ids; - - ids = g_hash_table_new_full (g_str_hash, g_str_equal, - (GDestroyNotify)g_free, NULL); - *orig_set = g_hash_table_new_full (g_str_hash, g_str_equal, - (GDestroyNotify)g_free, NULL); - - while (!mu_msg_iter_is_done (iter)) { - char *thread_id, *msgid; - unsigned docid; - /* record the thread id for the message */ - if ((thread_id = mu_msg_iter_get_thread_id (iter))) - g_hash_table_insert (ids, thread_id, - GSIZE_TO_POINTER(TRUE)); - /* record the original set */ - docid = mu_msg_iter_get_docid(iter); - if (docid != 0 && (msgid = mu_msg_iter_get_msgid (iter))) - g_hash_table_insert (*orig_set, msgid, - GSIZE_TO_POINTER(docid)); - - if (!mu_msg_iter_next (iter)) - break; - } - - return ids; -} - - -Xapian::Query -Query::Private::make_related_query (MuMsgIter *iter, GHashTable **orig_set) const -{ - GHashTable *hash; - GList *id_list, *cur; - std::vector qvec; - static std::string pfx (1, mu_msg_field_xapian_prefix - (MU_MSG_FIELD_ID_THREAD_ID)); - - /* orig_set receives the hash msgid->docid of the set of - * original matches */ - hash = find_thread_ids (iter, orig_set); - /* id_list now gets a list of all thread-ids seen in the query - * results; either in the Message-Id field or in - * References. */ - id_list = g_hash_table_get_keys (hash); - - // now, we create a vector with queries for each of the - // thread-ids, which we combine below. This is /much/ faster - // than creating the query as 'query = Query (OR, query)'... - for (cur = id_list; cur; cur = g_list_next(cur)) - qvec.push_back (Xapian::Query((std::string - (pfx + (char*)cur->data)))); - - g_hash_table_destroy (hash); - g_list_free (id_list); - - return Xapian::Query (Xapian::Query::OP_OR, qvec.begin(), qvec.end()); -} - - -void -Query::Private::find_related_messages (MuMsgIter **iter, int maxnum, - MuMsgFieldId sortfieldid, Query::Flags flags, - Xapian::Query orig_query) const -{ - GHashTable *orig_set; - Xapian::Enquire enq{store_.database()}; - MuMsgIter *rel_iter; - const bool inc_related{any_of(flags & Query::Flags::IncludeRelated)}; - - orig_set = NULL; - Xapian::Query new_query{make_related_query (*iter, &orig_set)}; - /* If related message are not desired, filter out messages which would not - have matched the original query. - */ - if (!inc_related) - new_query = Xapian::Query (Xapian::Query::OP_AND, orig_query, new_query); - enq.set_query(new_query); - enq.set_cutoff(0,0); - - rel_iter= mu_msg_iter_new ( - reinterpret_cast(&enq), - maxnum, - sortfieldid, - msg_iter_flags (flags), - NULL); - - mu_msg_iter_destroy (*iter); - - // set the preferred set for the iterator (ie., the set of - // messages not considered to be duplicates) to be the - // original matches -- the matches without considering - // 'related' - mu_msg_iter_set_preferred (rel_iter, orig_set); - g_hash_table_destroy (orig_set); - - *iter = rel_iter; -} - Query::Query(const Store& store): priv_{std::make_unique(store)} {} @@ -232,66 +72,170 @@ Query::Query(Query&& other) = default; Query::~Query() = default; -MuMsgIter* -Query::run (const std::string& expr, MuMsgFieldId sortfieldid, Query::Flags flags, - size_t maxnum, GError **err) const +static Xapian::Enquire& +maybe_sort (Xapian::Enquire& enq, MuMsgFieldId sortfieldid, QueryFlags qflags) { - g_return_val_if_fail (mu_msg_field_id_is_valid (sortfieldid) || - sortfieldid == MU_MSG_FIELD_ID_NONE, - NULL); - try { - MuMsgIter *iter; - const bool threads = any_of(flags & Flags::Threading); - const bool inc_related = any_of(flags & Flags::IncludeRelated); - const bool descending = any_of(flags & Flags::Descending); - Xapian::Enquire enq (priv_->make_enquire(expr, sortfieldid, descending, err)); + if (sortfieldid != MU_MSG_FIELD_ID_NONE) + enq.set_sort_by_value(static_cast(sortfieldid), + any_of(qflags & QueryFlags::Descending)); + return enq; +} - /* when we're doing a 'include-related query', wea're actually - * doing /two/ queries; one to get the initial matches, and - * based on that one to get all messages in threads in those - * matches. - */ +Xapian::Enquire +Query::Private::make_enquire (const std::string& expr, + MuMsgFieldId sortfieldid, QueryFlags qflags) const +{ + Xapian::Enquire enq{store_.database()}; - /* get the 'real' maxnum if it was specified as < 0 */ - maxnum = maxnum == 0 ? priv_->store_.size(): maxnum; - /* Calculating threads involves two queries, so do the calculation only in - * the second query instead of in both. - */ - Query::Flags first_flags{}; - if (threads) - first_flags = flags & ~Flags::Threading; - else - first_flags = flags; - /* Perform the initial query, returning up to max num results. - */ - iter = mu_msg_iter_new ( - reinterpret_cast(&enq), - maxnum, - sortfieldid, - msg_iter_flags (first_flags), - err); - /* If we want threads or related messages, find related messages using a - * second query based on the message ids / refs of the first query's result. - * Do this even if we don't want to include related messages in the final - * result so we can apply the threading algorithm to the related message set - * of a maxnum-sized result instead of the unbounded result of the first - * query. If threads are desired but related message are not, we will remove - * the undesired related messages later. - */ - if(threads||inc_related) - priv_->find_related_messages (&iter, maxnum, sortfieldid, flags, - enq.get_query()); + if (expr.empty() || expr == R"("")") + enq.set_query(Xapian::Query::MatchAll); + else { + WarningVec warns; + const auto tree{parser_.parse(expr, warns)}; + for (auto&& w: warns) + g_warning ("query warning: %s", to_string(w).c_str()); + enq.set_query(xapian_query(tree)); + } - return iter; + return maybe_sort (enq, sortfieldid, qflags); +} - } MU_XAPIAN_CATCH_BLOCK_G_ERROR_RETURN (err, MU_ERROR_XAPIAN, 0); + +Xapian::Enquire +Query::Private::make_related_enquire (const Xapian::Query& first_q, + const StringSet& thread_ids, + MuMsgFieldId sortfieldid, QueryFlags qflags) const +{ + Xapian::Enquire enq{store_.database()}; + static std::string pfx (1, mu_msg_field_xapian_prefix(MU_MSG_FIELD_ID_THREAD_ID)); + + std::vector qvec{first_q}; + for (auto&& t: thread_ids) + qvec.emplace_back(pfx + t); + Xapian::Query qr{Xapian::Query::OP_OR, qvec.begin(), qvec.end()}; + enq.set_query(qr); + + return maybe_sort (enq, sortfieldid, qflags); + +} + +struct ThreadKeyMaker: public Xapian::KeyMaker { + ThreadKeyMaker (const QueryMatches& matches): + match_info_(matches) + {} + std::string operator()(const Xapian::Document &doc) const override { + const auto it{match_info_.find(doc.get_docid())}; + if (G_UNLIKELY(it == match_info_.end())) { + g_warning("can't find document %u", doc.get_docid()); + return ""; + } + return it->second.thread_path; + } + const QueryMatches& match_info_; +}; + +Option +Query::Private::run_threaded (QueryResults &qres, Xapian::Enquire& enq, + MuMsgFieldId sortfieldid, + QueryFlags qflags, size_t maxnum) const +{ + const auto descending{any_of(qflags & QueryFlags::Descending)}; + + calculate_threads(qres, sortfieldid, descending); + + ThreadKeyMaker key_maker{qres.query_matches()}; + enq.set_sort_by_key(&key_maker, descending); + + DeciderInfo minfo; + minfo.matches = qres.query_matches(); + auto mset{enq.get_mset(0, maxnum, {}, make_final_decider(qflags, minfo).get())}; + + return QueryResults{mset, std::move(qres.query_matches())}; +} + + +Option +Query::Private::run_singular (const std::string& expr, MuMsgFieldId sortfieldid, + QueryFlags qflags, size_t maxnum) const +{ + const auto singular_qflags{qflags | QueryFlags::Leader}; + const auto threading{any_of(qflags & QueryFlags::Threading)}; + + DeciderInfo minfo{}; + auto enq{make_enquire(expr, threading ? MU_MSG_FIELD_ID_NONE : sortfieldid, qflags)}; + auto mset{enq.get_mset(0, maxnum, {}, make_leader_decider(singular_qflags, minfo).get())}; + + auto qres{QueryResults{mset, std::move(minfo.matches)}}; + if (none_of(qflags & QueryFlags::Threading)) + return qres; + else + return run_threaded(qres, enq, sortfieldid, qflags, maxnum); +} + + +Option +Query::Private::run_related (const std::string& expr, MuMsgFieldId sortfieldid, + QueryFlags qflags, size_t maxnum) const +{ + const auto leader_qflags{qflags | QueryFlags::Leader | QueryFlags::GatherThreadIds}; + const auto threading{any_of(qflags & QueryFlags::Threading)}; + + // Run our first, "leader" query; + DeciderInfo minfo{}; + auto enq{make_enquire(expr, MU_MSG_FIELD_ID_NONE, qflags)}; + const auto mset{enq.get_mset(0, maxnum, {}, + make_leader_decider(leader_qflags, minfo).get())}; + + // Now, determine the "related query" + auto r_enq{make_related_enquire(enq.get_query(), minfo.thread_ids, + threading ? MU_MSG_FIELD_ID_NONE :sortfieldid, qflags)}; + const auto r_mset{r_enq.get_mset(0, maxnum, {}, make_related_decider(qflags, minfo).get())}; + + auto qres{QueryResults{r_mset, std::move(minfo.matches)}}; + if (none_of(qflags & QueryFlags::Threading)) + return qres; + else + return run_threaded(qres, r_enq, sortfieldid, qflags, maxnum); +} + +Option +Query::Private::run (const std::string& expr, MuMsgFieldId sortfieldid, + QueryFlags qflags, size_t maxnum) const +{ + const auto eff_maxnum{maxnum == 0 ? store_.size() : maxnum}; + + if (any_of(qflags & QueryFlags::IncludeRelated)) + return run_related (expr, sortfieldid, qflags, eff_maxnum); + else + return run_singular(expr, sortfieldid, qflags, eff_maxnum); +} + + +Option +Query::run (const std::string& expr, MuMsgFieldId sortfieldid, + QueryFlags qflags, size_t maxnum) const try +{ + // some flags are for internal use only. + g_return_val_if_fail (none_of(qflags & QueryFlags::Leader), Nothing); + g_return_val_if_fail (none_of(qflags & QueryFlags::GatherThreadIds), Nothing); + + StopWatch sw{format("query '%s'; related: %s; threads: %s; max-size: %zu", + expr.c_str(), + any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no", + any_of(qflags & QueryFlags::Threading) ? "yes" : "no", + maxnum)}; + + return priv_->run(expr, sortfieldid, qflags, maxnum); + +} catch (...) { + return Nothing; } size_t Query::count (const std::string& expr) const try { - const auto enq{priv_->make_enquire(expr, MU_MSG_FIELD_ID_NONE, false, nullptr)}; + const auto enq{priv_->make_enquire(expr, MU_MSG_FIELD_ID_NONE, {})}; auto mset{enq.get_mset(0, priv_->store_.size())}; mset.fetch(); @@ -302,24 +246,15 @@ Query::count (const std::string& expr) const try std::string -Query::parse(const std::string& expr, bool xapian) const try +Query::parse (const std::string& expr, bool xapian) const { - if (xapian) { - GError *err{}; - const auto descr{priv_->make_query(expr, &err).get_description()}; - if (err) { - g_warning ("query error: %s", err->message); - g_clear_error(&err); - } - return descr; - } else { - Mu::WarningVec warns; - const auto tree = priv_->parser_.parse (expr, warns); - for (auto&& w: warns) - g_warning ("query error: %s", to_string(w).c_str()); + WarningVec warns; + const auto tree{priv_->parser_.parse(expr, warns)}; + for (auto&& w: warns) + g_warning ("query warning: %s", to_string(w).c_str()); + if (xapian) + return xapian_query(tree).get_description(); + else return to_string(tree); - - } - -} MU_XAPIAN_CATCH_BLOCK_RETURN(""); +} diff --git a/lib/mu-query.hh b/lib/mu-query.hh index 8c25cdf4..d40b1417 100644 --- a/lib/mu-query.hh +++ b/lib/mu-query.hh @@ -24,9 +24,10 @@ #include #include -#include +#include #include + namespace Mu { class Query { @@ -52,39 +53,10 @@ public: Query(Query&& other); - enum struct Flags { - None = 0, /**< no flags */ - Descending = 1 << 0, /**< sort z->a */ - SkipUnreadable = 1 << 1, /**< skip unreadable msgs */ - SkipDups = 1 << 2, /**< skip duplicate msgs */ - IncludeRelated = 1 << 3, /**< include related msgs */ - Threading = 1 << 4, /**< calculate threading info */ - }; - - - /** - * run a query; for the syntax, please refer to the mu-query manpage - * - * @param expr the search expression; use "" to match all messages - * @param sortfield the field id to sort by or MU_MSG_FIELD_ID_NONE if - * sorting is not desired - * @param flags bitwise OR'd flags to influence the query (see MuQueryFlags) - * @param maxnum maximum number of search results to return, or 0 for - * unlimited - * @param err receives error information (if there is any); if - * function returns non-NULL, err will _not_be set. err can be NULL - * possible error (err->code) is MU_ERROR_QUERY, - * - * @return a MuMsgIter instance you can iterate over, or NULL in - * case of error - */ - MuMsgIter* run (const std::string& expr="", - MuMsgFieldId sortfieldid=MU_MSG_FIELD_ID_NONE, - Flags flags=Flags::None, - size_t maxnum=0, - GError **err=nullptr) const - G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT; - + Option run(const std::string& expr="", + MuMsgFieldId sortfieldid=MU_MSG_FIELD_ID_NONE, + QueryFlags flags=QueryFlags::None, + size_t maxnum=0) const; /** * run a Xapian query to count the number of matches; for the syntax, please @@ -107,14 +79,11 @@ public: * @return the string representation of the query */ std::string parse (const std::string& expr, bool xapian) const; - private: struct Private; std::unique_ptr priv_; }; -MU_ENABLE_BITOPS(Query::Flags); - } #endif /*__MU_QUERY_HH__*/ diff --git a/lib/mu-threader.cc b/lib/mu-threader.cc deleted file mode 100644 index 17b81fdf..00000000 --- a/lib/mu-threader.cc +++ /dev/null @@ -1,455 +0,0 @@ -/* -** Copyright (C) 2012-2020 Dirk-Jan C. Binnema -** -** This program is free software; you can redistribute it and/or modify it -** under the terms of the GNU General Public License as published by the -** Free Software Foundation; either version 3, or (at your option) any -** later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software Foundation, -** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -** -*/ -#include "mu-threader.hh" - -#include /* for log, ceil */ -#include /* for memset */ - -#include "mu-container.hh" -#include "utils/mu-str.h" - -/* msg threading implementation based on JWZ's algorithm, as described in: - * http://www.jwz.org/doc/threading.html - * - * the implementation follows the terminology from that doc, so should - * be understandable from that... I did change things a bit though - * - * the end result of the threading operation is a hashtable which maps - * docids (ie., Xapian documents == messages) to 'thread paths'; a - * thread path is a string denoting the 2-dimensional place of a - * message in a list of messages, - * - * Msg1 => 00000 - * Msg2 => 00001 - * Msg3 (child of Msg2) => 00001:00000 - * Msg4 (child of Msg2) => 00001:00001 - * Msg5 (child of Msg4) => 00001:00001:00000 - * Msg6 => 00002 - * - * the padding-0's are added to make them easy to sort using strcmp; - * the number hexadecimal numbers, and the length of the 'segments' - * (the parts separated by the ':') is equal to ceil(log_16(matchnum)) - * - */ - -/* step 1 */ static GHashTable* create_containers (MuMsgIter *iter); -/* step 2 */ static MuContainer *find_root_set (GHashTable *ids); -static MuContainer* prune_empty_containers (MuContainer *root); -/* static void group_root_set_by_subject (GSList *root_set); */ -GHashTable* create_doc_id_thread_path_hash (MuContainer *root, - size_t match_num); - -/* msg threading algorithm, based on JWZ's algorithm, - * http://www.jwz.org/doc/threading.html */ -GHashTable* -mu_threader_calculate (MuMsgIter *iter, size_t matchnum, - MuMsgFieldId sortfield, gboolean descending) -{ - GHashTable *id_table, *thread_ids; - MuContainer *root_set; - - g_return_val_if_fail (iter, FALSE); - g_return_val_if_fail (mu_msg_field_id_is_valid (sortfield) || - sortfield == MU_MSG_FIELD_ID_NONE, - FALSE); - - /* step 1 */ - id_table = create_containers (iter); - if (matchnum == 0) - return id_table; /* just return an empty table */ - - /* step 2 -- the root_set is the list of children without parent */ - root_set = find_root_set (id_table); - - /* step 3: skip until the end; we still need to containers */ - - /* step 4: prune empty containers */ - root_set = prune_empty_containers (root_set); - - /* sort root set */ - if (sortfield != MU_MSG_FIELD_ID_NONE) - root_set = mu_container_sort (root_set, sortfield, descending, - NULL); - - /* step 5: group root set by subject */ - /* group_root_set_by_subject (root_set); */ - - /* sort */ - mu_msg_iter_reset (iter); /* go all the way back */ - - /* finally, deliver the docid => thread-path hash */ - thread_ids = mu_container_thread_info_hash_new (root_set, - matchnum); - - g_hash_table_destroy (id_table); /* step 3*/ - - return thread_ids; -} - -G_GNUC_UNUSED static void -check_dup (const char *msgid, MuContainer *c, GHashTable *hash) -{ - if (g_hash_table_lookup (hash, c)) { - g_warning ("ALREADY!!"); - mu_container_dump (c, FALSE); - g_assert (0); - } else - g_hash_table_insert (hash, c, GUINT_TO_POINTER(TRUE)); -} - - -G_GNUC_UNUSED static void -assert_no_duplicates (GHashTable *ids) -{ - GHashTable *hash; - - hash = g_hash_table_new (g_direct_hash, g_direct_equal); - - g_hash_table_foreach (ids, (GHFunc)check_dup, hash); - - g_hash_table_destroy (hash); -} - - -/* a referred message is a message that is referred by some other - * message */ -static MuContainer* -find_or_create_referred (GHashTable *id_table, const char *msgid, - gboolean *created) -{ - MuContainer *c; - - g_return_val_if_fail (msgid, NULL); - - c = (MuContainer*)g_hash_table_lookup (id_table, msgid); - *created = !c; - if (!c) { - c = mu_container_new (NULL, 0, msgid); - g_hash_table_insert (id_table, (gpointer)msgid, c); - /* assert_no_duplicates (id_table); */ - } - - - return c; -} - -/* find a container for the given msgid; if it does not exist yet, - * create a new one, and register it */ -static MuContainer* -find_or_create (GHashTable *id_table, MuMsg *msg, guint docid) -{ - MuContainer *c; - const char* msgid; - char fake[32]; - - g_return_val_if_fail (msg, NULL); - g_return_val_if_fail (docid != 0, NULL); - - msgid = mu_msg_get_msgid (msg); - if (!msgid) - msgid = mu_msg_get_path (msg); /* fake it */ - if (!msgid) { /* no path either? seems to happen... */ - g_warning ("message without path"); - g_snprintf (fake, sizeof(fake), "fake:%p", (gpointer)msg); - msgid = fake; - } - - /* XXX the '' works around a crash; find a better - * solution */ - c = (MuContainer*)g_hash_table_lookup (id_table, msgid); - - /* If id_table contains an empty MuContainer for this ID: * * - * Store this message in the MuContainer's message slot. */ - if (c) { - if (!c->msg) { - c->msg = mu_msg_ref (msg); - c->docid = docid; - return c; - } else { - /* special case, not in the JWZ algorithm: the - * container exists already and has a message; this - * means that we are seeing *another message* with a - * message-id we already saw... create this message, - * and mark it as a duplicate, and a child of the one - * we saw before; use its path as a fake message-id - * */ - MuContainer *c2; - const char* fake_msgid; - - fake_msgid = mu_msg_get_path (msg); - - c2 = mu_container_new (msg, docid, fake_msgid); - c2->flags = MU_CONTAINER_FLAG_DUP; - /*c = */ mu_container_append_children (c, c2); - - g_hash_table_insert (id_table, (gpointer)fake_msgid, c2); - - return NULL; /* don't process this message further */ - } - } else { /* Else: Create a new MuContainer object holding - this message; Index the MuContainer by - Message-ID in id_table. */ - c = mu_container_new (msg, docid, msgid); - g_hash_table_insert (id_table, (gpointer)msgid, c); - /* assert_no_duplicates (id_table); */ - - return c; - } -} - -static gboolean -child_elligible (MuContainer *parent, MuContainer *child, gboolean created) -{ - if (!parent || !child) - return FALSE; - if (child->parent) - return FALSE; - /* if (created) */ - /* return TRUE; */ - if (mu_container_reachable (parent, child)) - return FALSE; - if (mu_container_reachable (child, parent)) - return FALSE; - - return TRUE; -} - - - -static void /* 1B */ -handle_references (GHashTable *id_table, MuContainer *c) -{ - const GSList *refs, *cur; - MuContainer *parent; - gboolean created; - - refs = mu_msg_get_references (c->msg); - if (!refs) - return; /* nothing to do */ - - /* For each element in the message's References field: - - Find a MuContainer object for the given Message-ID: If - there's one in id_table use that; Otherwise, make (and - index) one with a null Message. */ - - /* go over over our list of refs, until 1 before the last... */ - created = FALSE; - for (parent = NULL, cur = refs; cur; cur = g_slist_next (cur)) { - - MuContainer *child; - child = find_or_create_referred (id_table, (gchar*)cur->data, - &created); - - /* if we find the current message in their own refs, break now - so that parent != c in next step */ - if (child == c) - break; - - /*Link the References field's MuContainers together in - * the order implied by the References header. - - If they are already linked, don't change the existing - links. Do not add a link if adding that link would - introduce a loop: that is, before asserting A->B, - search down the children of B to see if A is - reachable, and also search down the children of A to - see if B is reachable. If either is already reachable - as a child of the other, don't add the link. */ - - if (child_elligible (parent, child, created)) - /*parent =*/ - mu_container_append_children (parent, child); - - parent = child; - } - - /* 'parent' points to the last ref: our direct parent; - - Set the parent of this message to be the last element in - References. Note that this message may have a parent - already: this can happen because we saw this ID in a - References field, and presumed a parent based on the other - entries in that field. Now that we have the actual message, - we can be more definitive, so throw away the old parent and - use this new one. Find this MuContainer in the parent's - children list, and unlink it. - - Note that this could cause this message to now have no - parent, if it has no references field, but some message - referred to it as the non-first element of its - references. (Which would have been some kind of lie...) - - Note that at all times, the various ``parent'' and ``child'' fields - must be kept inter-consistent. */ - - /* optimization: if the the message was newly added, it's by - definition not reachable yet */ - - /* So, we move c and its descendants to become a child of parent if: - * both are not NULL - * parent is not a descendant of c. - * both are different from each other (guaranteed in last loop) */ - - if (parent && c && !(c->child && mu_container_reachable (c->child, parent))) { - - /* if c already has a parent, remove c from its parent children - and reparent it, as now we know who is c's parent reliably */ - if (c->parent) { - mu_container_remove_child(c->parent, c); - c->next = c->last = c->parent = NULL; - } - - /*parent = */mu_container_append_children (parent, c); - } -} - - - -/* step 1: create the containers, connect them, and fill the id_table */ -static GHashTable* -create_containers (MuMsgIter *iter) -{ - GHashTable *id_table; - id_table = g_hash_table_new_full (g_str_hash, g_str_equal, - NULL, - (GDestroyNotify)mu_container_destroy); - - for (mu_msg_iter_reset (iter); !mu_msg_iter_is_done (iter); - mu_msg_iter_next (iter)) { - - MuContainer *c; - MuMsg *msg; - unsigned docid; - - /* 1.A */ - msg = mu_msg_iter_get_msg_floating (iter); /* don't unref */ - docid = mu_msg_iter_get_docid (iter); - - c = find_or_create (id_table, msg, docid); - - /* 1.B and C */ - if (c) - handle_references (id_table, c); - } - - return id_table; -} - - - -static void -filter_root_set (const gchar *msgid, MuContainer *c, MuContainer **root_set) -{ - /* ignore children */ - if (c->parent) - return; - - /* ignore duplicates */ - if (c->flags & MU_CONTAINER_FLAG_DUP) - return; - - if (*root_set == NULL) { - *root_set = c; - return; - } else - *root_set = mu_container_append_siblings (*root_set, c); -} - - -/* 2. Walk over the elements of id_table, and gather a list of the - MuContainer objects that have no parents, but do have children */ -static MuContainer* -find_root_set (GHashTable *ids) -{ - MuContainer *root_set; - - root_set = NULL; - g_hash_table_foreach (ids, (GHFunc)filter_root_set, &root_set); - - return root_set; -} - - -static gboolean -prune_maybe (MuContainer *c) -{ - MuContainer *cur; - - for (cur = c->child; cur; cur = cur->next) { - if (cur->flags & MU_CONTAINER_FLAG_DELETE) { - c = mu_container_remove_child (c, cur); - } else if (cur->flags & MU_CONTAINER_FLAG_SPLICE) { - c = mu_container_splice_grandchildren (c, cur); - c = mu_container_remove_child (c, cur); - } - } - - g_return_val_if_fail (c, FALSE); - - /* don't touch containers with messages */ - if (c->msg) - return TRUE; - - /* A. If it is an msg-less container with no children, mark it for - * deletion. */ - if (!c->child) { - c->flags |= MU_CONTAINER_FLAG_DELETE; - return TRUE; - } - - /* B. If the MuContainer has no Message, but does have - * children, remove this container but promote its - * children to this level (that is, splice them in to - * the current child list.) - * - * Do not promote the children if doing so would - * promote them to the root set -- unless there is - * only one child, in which case, do. - */ - if (c->child->next) /* ie., > 1 child */ - return TRUE; - - c->flags |= MU_CONTAINER_FLAG_SPLICE; - - return TRUE; -} - - -static MuContainer* -prune_empty_containers (MuContainer *root_set) -{ - MuContainer *cur; - - mu_container_foreach (root_set, - (MuContainerForeachFunc)prune_maybe, - NULL); - - /* and prune the root_set itself... */ - for (cur = root_set; cur; cur = cur->next) { - if (cur->flags & MU_CONTAINER_FLAG_DELETE) { - root_set = mu_container_remove_sibling (root_set, cur); - } else if (cur->flags & MU_CONTAINER_FLAG_SPLICE) { - root_set = mu_container_splice_children (root_set, cur); - root_set = mu_container_remove_sibling (root_set, cur); - } - } - - return root_set; -} diff --git a/lib/mu-threader.hh b/lib/mu-threader.hh deleted file mode 100644 index 3603aeda..00000000 --- a/lib/mu-threader.hh +++ /dev/null @@ -1,49 +0,0 @@ -/* -** Copyright (C) 2012-2020 Dirk-Jan C. Binnema -** -** This program is free software; you can redistribute it and/or modify it -** under the terms of the GNU General Public License as published by the -** Free Software Foundation; either version 3, or (at your option) any -** later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software Foundation, -** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -** -*/ - -#ifndef MU_THREADER_HH__ -#define MU_THREADER_HH__ - -#include -#include - -/** - * takes an iter and the total number of matches, and from this - * generates a hash-table with information about the thread structure - * of these matches. - * - * the algorithm to find this structure is based on JWZ's - * message-threading algorithm, as descrbed in: - * http://www.jwz.org/doc/threading.html - * - * the returned hashtable maps the Xapian docid of iter (msg) to a ptr - * to a MuMsgIterThreadInfo structure (see mu-msg-iter.h) - * - * @param iter an iter; note this function will mu_msgi_iter_reset this iterator - * @param matches the number of matches in the set * - * @param sortfield the field to sort results by, or - * MU_MSG_FIELD_ID_NONE if no sorting should be performed - * @param revert if TRUE, if revert the sorting order - * - * @return a hashtable; free with g_hash_table_destroy when done with it - */ -GHashTable *mu_threader_calculate (MuMsgIter *iter, size_t matches, - MuMsgFieldId sortfield, gboolean revert); - -#endif /*MU_THREADER_HH__*/ diff --git a/lib/test-query.cc b/lib/test-query.cc new file mode 100644 index 00000000..ce2c82bc --- /dev/null +++ b/lib/test-query.cc @@ -0,0 +1,91 @@ +/* +** Copyright (C) 2020 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** +*/ +#include + +#include +#include + +#include +#include +#include + +#include "mu-store.hh" +#include "mu-query.hh" +#include "index/mu-indexer.hh" +#include "utils/mu-utils.hh" +#include "test-mu-common.hh" + +using namespace Mu; + +static void +test_query() +{ + allow_warnings(); + + Store store{test_mu_common_get_random_tmpdir(), std::string{MU_TESTMAILDIR}, {},{}}; + auto&& idx{store.indexer()}; + + g_assert_true (idx.start(Indexer::Config{})); + while (idx.is_running()) { + sleep(1); + } + + auto dump_matches=[](const QueryResults& res) { + size_t n{}; + for (auto&& item: res) + g_debug ("%02zu %s %s", ++n, item.path().value_or("").c_str(), + item.message_id().value_or("").c_str()); + }; + + + Query q{store}; + g_assert_cmpuint(store.size(),==,19); + + { + const auto res = q.run("", MU_MSG_FIELD_ID_NONE, QueryFlags::None); + g_assert_true(!!res); + g_assert_cmpuint(res->size(),==,19); + dump_matches(*res); + } + + { + const auto res = q.run("", MU_MSG_FIELD_ID_PATH, QueryFlags::None, 11); + g_assert_true(!!res); + g_assert_cmpuint(res->size(),==,11); + dump_matches(*res); + } +} + +int +main (int argc, char *argv[]) try +{ + g_test_init (&argc, &argv, NULL); + + g_test_add_func ("/query", test_query); + + return g_test_run (); + + +} catch (const std::runtime_error& re) { + std::cerr << re.what() << "\n"; + return 1; +} catch (...) { + std::cerr << "caught exception\n"; + return 1; +} diff --git a/mu/test-mu-query.cc b/mu/test-mu-query.cc index fc281105..1e17c435 100644 --- a/mu/test-mu-query.cc +++ b/mu/test-mu-query.cc @@ -17,9 +17,10 @@ ** */ -#ifdef HAVE_CONFIG_H #include "config.h" -#endif /*HAVE_CONFIG_H*/ + +#include +#include #include #include @@ -69,36 +70,28 @@ make_database (const std::string& testdir) static void -assert_no_dups (MuMsgIter *iter) +assert_no_dups (const QueryResults& qres) { - GHashTable *hash; + std::unordered_set msgid_set, path_set; - hash = g_hash_table_new_full (g_str_hash, g_str_equal, - (GDestroyNotify)g_free, NULL); + for (auto&& mi: qres) { + g_assert_true(msgid_set.find(mi.message_id().value()) == msgid_set.end()); + g_assert_true(path_set.find(mi.path().value()) == path_set.end()); - mu_msg_iter_reset (iter); - while (!mu_msg_iter_is_done(iter)) { - MuMsg *msg; - msg = mu_msg_iter_get_msg_floating (iter); - /* make sure there are no duplicates */ - g_assert (!g_hash_table_lookup (hash, mu_msg_get_path (msg))); - g_hash_table_insert (hash, g_strdup (mu_msg_get_path(msg)), - GUINT_TO_POINTER(TRUE)); - mu_msg_iter_next (iter); - } - mu_msg_iter_reset (iter); - g_hash_table_destroy (hash); + path_set.emplace(*mi.path()); + msgid_set.emplace(*mi.message_id()); + + g_assert_false(msgid_set.find(mi.message_id().value()) == msgid_set.end()); + g_assert_false(path_set.find(mi.path().value()) == path_set.end()); + } } /* note: this also *moves the iter* */ static guint run_and_count_matches (const std::string& xpath, const std::string& expr, - Mu::Query::Flags flags = Mu::Query::Flags::None) + Mu::QueryFlags flags = Mu::QueryFlags::None) { - MuMsgIter *iter; - guint count1, count2; - Mu::Store store{xpath}; Mu::Query query{store}; @@ -109,22 +102,15 @@ run_and_count_matches (const std::string& xpath, const std::string& expr, Mu::allow_warnings(); - iter = query.run (expr, MU_MSG_FIELD_ID_NONE, flags); - g_assert (iter); - assert_no_dups (iter); + auto qres{query.run (expr, MU_MSG_FIELD_ID_NONE, flags)}; + g_assert_true (!!qres); + assert_no_dups (*qres); - /* run query twice, to test mu_msg_iter_reset */ - for (count1 = 0; !mu_msg_iter_is_done(iter); - mu_msg_iter_next(iter), ++count1); + int count1{0}; + for (auto&& it: *qres) ++count1; - mu_msg_iter_reset (iter); - - assert_no_dups (iter); - - for (count2 = 0; !mu_msg_iter_is_done(iter); - mu_msg_iter_next(iter), ++count2); - - mu_msg_iter_destroy (iter); + int count2{0}; + for (auto&& it: *qres) ++count2; g_assert_cmpuint (count1, ==, count2); @@ -261,26 +247,23 @@ test_mu_query_logic (void) ==, queries[i].count); } - - - static void test_mu_query_accented_chars_01 (void) { - MuMsgIter *iter; - MuMsg *msg; GError *err; gchar *summ; Store store{DB_PATH1}; Query q{store}; - iter = q.run("fünkÿ"); - err = NULL; - msg = mu_msg_iter_get_msg_floating (iter); /* don't unref */ + auto qres{q.run("fünkÿ")}; + g_assert_true(!!qres); + g_assert_false(qres->empty()); + + auto begin{qres->begin()}; + auto msg{begin.floating_msg()}; if (!msg) { - g_warning ("error getting message: %s", err->message); - g_error_free (err); + g_warning ("error getting message"); g_assert_not_reached (); } @@ -293,8 +276,6 @@ test_mu_query_accented_chars_01 (void) g_assert_cmpstr (summ,==, "Let's write some fünkÿ text using umlauts. Foo."); g_free (summ); - - mu_msg_iter_destroy (iter); } static void @@ -629,7 +610,7 @@ test_mu_query_threads_compilation_error (void) g_assert_cmpuint (run_and_count_matches (xpath, "msgid:uwsireh25.fsf@one.dot.net", - Query::Flags::IncludeRelated), + QueryFlags::IncludeRelated), ==, 3); } diff --git a/mu/test-mu-threads.cc b/mu/test-mu-threads.cc index 94eb05a1..d979ca34 100644 --- a/mu/test-mu-threads.cc +++ b/mu/test-mu-threads.cc @@ -122,25 +122,25 @@ make_database (const std::string& testdir) /* note: this also *moves the iter* */ -static MuMsgIter* -run_and_get_iter_full (const std::string& xpath, const std::string& expr, - MuMsgFieldId sort_field, - Mu::Query::Flags flags=Mu::Query::Flags::None) +static QueryResults +run_and_get_results_full (const std::string& xpath, const std::string& expr, + MuMsgFieldId sort_field, + Mu::QueryFlags flags=Mu::QueryFlags::None) { Mu::Store store{xpath}; Mu::Query q{store}; - const auto myflags{flags | Mu::Query::Flags::Threading}; - auto iter = q.run (expr, sort_field, myflags); - g_assert (iter); + const auto myflags{flags | Mu::QueryFlags::Threading}; + auto res = q.run (expr, sort_field, myflags); + g_assert_true(!!res); - return iter; + return std::move(res.value()); } -static MuMsgIter* -run_and_get_iter (const std::string& xpath, const char *query) +static QueryResults +run_and_get_results (const std::string& xpath, const char *query) { - return run_and_get_iter_full (xpath, query, MU_MSG_FIELD_ID_DATE); + return run_and_get_results_full (xpath, query, MU_MSG_FIELD_ID_DATE); } static void @@ -166,12 +166,11 @@ test_mu_threads_01 (void) const auto xpath{make_database(MU_TESTMAILDIR3)}; g_assert (!xpath.empty()); - auto iter = run_and_get_iter (xpath, "abc"); - g_assert (iter); - g_assert (!mu_msg_iter_is_done(iter)); + auto res{run_and_get_results (xpath, "abc")}; + g_assert_false(res.empty()); - foreach_assert_tinfo_equal (iter, items, G_N_ELEMENTS (items)); - mu_msg_iter_destroy (iter); +#waning fixme + //foreach_assert_tinfo_equal (iter, items, G_N_ELEMENTS (items)); } static void @@ -197,9 +196,8 @@ test_mu_threads_rogue (void) const auto xpath{make_database (MU_TESTMAILDIR3)}; g_assert_false (xpath.empty()); - iter = run_and_get_iter (xpath, "def"); - g_assert (iter); - g_assert (!mu_msg_iter_is_done(iter)); + auto res{run_and_get_results (xpath, "def")}; + g_assert_false(res.empty()); /* due to the random order in files can be indexed, there are two possible ways * for the threads to be built-up; both are okay */ @@ -209,14 +207,13 @@ test_mu_threads_rogue (void) else items = items2; - foreach_assert_tinfo_equal (iter, items, G_N_ELEMENTS (items1)); - mu_msg_iter_destroy (iter); + //foreach_assert_tinfo_equal (iter, items, G_N_ELEMENTS (items1)); } static MuMsgIter* query_testdir (const char *query, MuMsgFieldId sort_field, gboolean descending) { - const auto flags{descending ? Query::Flags::Descending : Query::Flags::None}; + const auto flags{descending ? QueryFlags::Descending : QueryFlags::None}; const auto xpath{make_database(MU_TESTMAILDIR3)}; g_assert_false (xpath.empty());