1
0
mirror of https://github.com/djcb/mu.git synced 2024-06-23 07:16:48 +02:00

* initial somewhat working implementation of message threading based on the

JWZ algorithm
This commit is contained in:
Dirk-Jan C. Binnema 2011-06-30 23:44:50 +03:00
parent 28db904fb2
commit 15ac5f04c9
5 changed files with 1085 additions and 905 deletions

View File

@ -1,900 +0,0 @@
/* -*-mode: c; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-*/
/*
** Copyright (C) 2011 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include <math.h> /* for log, ceil */
#include <string.h> /* for memset */
#include "mu-msg-threader.h"
#include "mu-str.h"
/* msg threading implementation based on JWZ's algorithm, as described in:
* http://www.jwz.org/doc/threading.html
*
* the implementation follows the terminology from that doc, so should
* be understandable from that... I did change things a bit though
*
* the end result of the threading operation is a hashtable which maps
* docids (ie., Xapian documents == messages) to 'thread paths'; a
* thread path is a string denoting the 2-dimensional place of a
* message in a list of messages,
*
* Msg1 => 00000
* Msg2 => 00001
* Msg3 (child of Msg2) => 00001:00000
* Msg4 (child of Msg2) => 00001:00001
* Msg5 (child of Msg4) => 00001:00001:00000
* Msg6 => 00002
*
* the padding-0's are added to make them easy to sort using strcmp;
* the numbers are hexadecimal, and the length of the 'segments'
* (the parts separated by the ':') is equal to ceil(log_16(matchnum))
*
*/
/*
* path data structure, to determine the thread paths mentioned
* above
* */
/* growable array of per-level counters, from which thread paths are built */
struct _Path {
int *_data; /* the counters; allocated in path_new, grown in path_inc */
guint _len; /* number of ints allocated for _data */
};
typedef struct _Path Path;
/* allocate a path with room for 'initial' counters, all zero */
static Path* path_new (guint initial);
/* free a path; NULL is accepted */
static void path_destroy (Path *p);
/* increment the counter at 'index' and reset the one after it */
static void path_inc (Path *p, guint index);
/* render the path as a newly allocated, ':'-separated string */
static gchar* path_to_string (Path *p, const char* frmt);
/* Container data structure, as seen in the JWZ-doc*
*
*/
/* bit flags stored in Container.flags */
enum _ContainerFlag {
CONTAINER_FLAG_NONE = 0,
CONTAINER_FLAG_DELETE = 1 << 0, /* set by prune_maybe: msg-less, childless */
CONTAINER_FLAG_SPLICE = 1 << 1, /* set by prune_maybe: msg-less, children get promoted */
CONTAINER_FLAG_DUP = 1 << 2 /* set by find_or_create: message-id was seen before */
};
/* the flags are stored in 8 bits rather than in the enum type itself */
typedef guint8 ContainerFlag;
/* node in the thread tree, linked to its parent, first child and next sibling */
struct _Container {
struct _Container *parent, *child, *next;
ContainerFlag flags; /* bitwise OR of CONTAINER_FLAG_* values */
MuMsg *msg; /* referenced message, or NULL for an empty container */
guint docid; /* docid belonging to msg; 0 when created without a message */
const char* msgid; /* string passed to container_new; stored by pointer, not copied */
};
typedef struct _Container Container;
/* container life cycle and tree manipulation */
static Container* container_new (MuMsg *msg, guint docid, const char* msgid);
static void container_destroy (Container *c);
static void container_add_child (Container *c, Container *child);
static void container_add_sibling (Container *c, Container *sibling);
static void container_remove_child (Container *c, Container *child);
/* callback for container_foreach; returning FALSE stops the traversal */
typedef gboolean (*ContainerForeachFunc) (Container*, gpointer);
static gboolean container_foreach (Container *c,
ContainerForeachFunc func,
gpointer user_data);
/* callback for container_path_foreach; also receives the current Path */
typedef void (*ContainerPathForeachFunc) (Container*, gpointer, Path*);
static void container_path_foreach (Container *c,
ContainerPathForeachFunc func,
gpointer user_data);
static void container_splice (Container *parent, Container *child);
/* NOTE(review): not declared static, unlike its neighbours */
size_t container_child_count (Container *c);
static gboolean container_reachable (Container *haystack, Container *needle);
static void container_dump (Container *c, gboolean recursive);
/* the steps of the threading algorithm, in the order they are run */
/* step 1 */ static GHashTable* create_containers (MuMsgIter *iter);
/* step 2 */ static Container *find_root_set (GHashTable *ids);
static void prune_empty_containers (Container *root);
/* static void group_root_set_by_subject (GSList *root_set); */
/* NOTE(review): not declared static, unlike its neighbours */
GHashTable* create_doc_id_thread_path_hash (Container *root, size_t match_num);
static Container* sort_by_date (Container *root);
/* msg threading algorithm, based on JWZ's algorithm,
* http://www.jwz.org/doc/threading.html */
/*
 * Calculate the threading for the messages in @iter.
 *
 * @iter: iterator over the messages to thread; it is reset by this function
 * @matchnum: number of matches, used to size the thread-path segments
 *
 * Returns: a new hash table mapping docids to thread info, or NULL when
 * @iter is NULL.
 */
GHashTable*
mu_msg_threader_calculate (MuMsgIter *iter, size_t matchnum)
{
	GHashTable *id_table, *thread_ids;
	Container *root_set;

	/* this function returns a pointer, so fail with NULL rather than FALSE */
	g_return_val_if_fail (iter, NULL);

	/* step 1 */
	id_table = create_containers (iter);

	/* step 2 -- the root_set is the list of children without parent */
	root_set = find_root_set (id_table);

	/* step 3: skip until the end; we still need the containers */

	/* step 4: prune empty containers; with no matches there is no
	 * root set, and nothing to prune */
	if (root_set)
		prune_empty_containers (root_set);

	/* sort the root set (and what is below it) by date */
	root_set = sort_by_date (root_set);

	/* step 5: group root set by subject -- not implemented yet */

	mu_msg_iter_reset (iter); /* go all the way back */

	/* finally, deliver the docid => thread-path hash */
	thread_ids = create_doc_id_thread_path_hash (root_set, matchnum);

	/* step 3: the id table owns the containers; destroy them only now */
	g_hash_table_destroy (id_table);

	return thread_ids;
}
/* a referred message is a message that is referred to by some other
 * message; return its container from id_table, or create and register
 * an empty (message-less) one when it is not there yet */
static Container*
find_or_create_referred (GHashTable *id_table, const char *msgid)
{
	Container *found;

	g_return_val_if_fail (msgid, NULL);

	found = g_hash_table_lookup (id_table, msgid);
	if (found)
		return found;

	found = container_new (NULL, 0, msgid);
	g_hash_table_insert (id_table, (gpointer)msgid, found);

	return found;
}
/* find a container for the message-id of msg; if it does not exist yet,
 * create a new one, and register it in id_table.
 *
 * Returns the container to process further, or NULL when the message
 * must not be processed further: it is a duplicate, or there is nothing
 * to use as its key. */
static Container*
find_or_create (GHashTable *id_table, MuMsg *msg, guint docid)
{
	Container *c, *c2;
	const char* msgid;

	g_return_val_if_fail (msg, NULL);
	g_return_val_if_fail (docid != 0, NULL);

	msgid = mu_msg_get_msgid (msg);
	if (!msgid)
		msgid = mu_msg_get_path (msg); /* fake it */
	/* neither a message-id nor a path: there is no key to look up,
	 * and hashing a NULL string would crash */
	if (!msgid)
		return NULL;

	c = g_hash_table_lookup (id_table, msgid);

	/* no container yet: create a new one holding this message, and
	 * index it by message-id in id_table */
	if (!c) {
		c = container_new (msg, docid, msgid);
		g_hash_table_insert (id_table, (gpointer)msgid, c);
		return c;
	}

	/* id_table contains an empty container for this id: store this
	 * message in the container's message slot */
	if (!c->msg) {
		c->msg   = mu_msg_ref (msg);
		c->docid = docid;
		return c;
	}

	/* special case, not in the JWZ algorithm: the container exists
	 * already and has a message; this means that we are seeing
	 * *another message* with a message-id we already saw. Create a
	 * container for it, mark it as a duplicate and make it a child of
	 * the one we saw before; it is registered under its path */
	c2 = container_new (msg, docid, msgid);
	c2->flags = CONTAINER_FLAG_DUP;
	container_add_child (c, c2);
	g_hash_table_insert (id_table,
			     (gpointer)mu_msg_get_path (msg), c2);

	return NULL; /* don't process this message further */
}
/* detach c from the child list of its parent (if it has one), leaving it
 * without parent and without siblings; unlike container_remove_child,
 * this also works for containers that have children themselves */
static void
unlink_from_parent (Container *c)
{
	Container **link;

	if (!c->parent)
		return;

	for (link = &c->parent->child; *link; link = &(*link)->next)
		if (*link == c) {
			*link = c->next;
			break;
		}

	c->parent = NULL;
	c->next	  = NULL;
}

/* step 1B (and C): link the containers for the messages in the
 * References of c's message together, and make the last of them the
 * parent of c */
static void /* 1B */
handle_references (GHashTable *id_table, Container *c)
{
	const GSList *refs, *cur;
	Container *parent;

	refs = mu_msg_get_references (c->msg);
	if (!refs)
		return; /* nothing to do */

	/* For each element in the message's References field: find a
	 * container for the given message-id; if there's one in id_table
	 * use that, otherwise make (and index) one with a null message. */
	for (parent = NULL, cur = refs; cur; cur = g_slist_next (cur)) {

		Container *child;

		child = find_or_create_referred (id_table, (gchar*)cur->data);
		if (!child)
			continue; /* no usable message-id in this entry */

		/* Link the containers together in the order implied by the
		 * References header. If a container is already linked
		 * (i.e. it has a parent), don't change the existing links;
		 * adding it once more would put it, and the siblings
		 * following it, in two child lists. Also don't add a link
		 * that would introduce a loop: if either one is already
		 * reachable from the other, leave things alone. */
		if (parent && !child->parent &&
		    !container_reachable (parent, child) &&
		    !container_reachable (child, parent))
			container_add_child (parent, child);

		parent = child;
	}

	/* 'parent' now points to the last ref: our direct parent. Note
	 * that c may have a parent already: this can happen because we
	 * saw its id in a References field, and presumed a parent based
	 * on the other entries in that field. Now that we have the actual
	 * message, we can be more definitive. */
	if (!parent || parent == c->parent)
		return;

	if (container_reachable (parent, c) ||
	    container_reachable (c, parent))
		return; /* linking would introduce a loop */

	/* throw away the old parent: unlink c from its children before
	 * adding it to the new one, so the parent and child fields stay
	 * consistent with each other */
	unlink_from_parent (c);
	container_add_child (parent, c);
}
/* step 1: walk over all messages in iter (starting from the beginning),
 * create the containers for them, connect them, and return the id_table
 * that maps message-ids to containers; the table destroys its containers
 * when it is destroyed itself */
static GHashTable*
create_containers (MuMsgIter *iter)
{
	GHashTable *id_table;

	id_table = g_hash_table_new_full (g_str_hash, g_str_equal,
					  NULL,
					  (GDestroyNotify)container_destroy);

	mu_msg_iter_reset (iter);
	while (!mu_msg_iter_is_done (iter)) {

		MuMsg *msg;
		unsigned docid;
		Container *c;

		/* 1.A */
		msg   = mu_msg_iter_get_msg (iter, NULL);
		docid = mu_msg_iter_get_docid (iter);
		c     = find_or_create (id_table, msg, docid);

		/* 1.B and C; duplicates yield NULL and are skipped */
		if (c)
			handle_references (id_table, c);

		mu_msg_iter_next (iter);
	}

	return id_table;
}
/* GHFunc-style callback: when c has no parent, add it to the list of
 * siblings that starts at *root_set; msgid (the hash key) is not used */
static void
filter_root_set (const gchar *msgid, Container *c, Container **root_set)
{
	if (c->parent != NULL)
		return; /* not a root */

	if (*root_set != NULL) {
		container_add_sibling (*root_set, c);
		return;
	}

	*root_set = c; /* the very first root */
}
/* 2. Walk over the elements of id_table, and chain the containers that
 * have no parent together as siblings; returns the first of them, or
 * NULL when there are none */
static Container*
find_root_set (GHashTable *ids)
{
	Container *roots = NULL;

	g_hash_table_foreach (ids, (GHFunc)filter_root_set, &roots);

	return roots;
}
/* callback for container_foreach (see prune_empty_containers): first
 * apply the delete/splice flags that were set on the children of c, then
 * decide whether c itself must be flagged; always returns TRUE, so the
 * traversal is never stopped */
static gboolean
prune_maybe (Container *c)
{
Container *cur;
/* act on the flags the children got when they were visited themselves */
for (cur = c->child; cur; cur = cur->next) {
if (cur->flags & CONTAINER_FLAG_DELETE)
container_remove_child (c, cur);
else if (cur->flags & CONTAINER_FLAG_SPLICE)
container_splice (c, cur);
}
/* don't touch containers with messages */
if (c->msg)
return TRUE;
/* A. If it is an msg-less container with no children, mark it for deletion. */
if (!c->child) {
c->flags |= CONTAINER_FLAG_DELETE;
return TRUE;
}
/* B. If the Container has no Message, but does have
* children, remove this container but promote its
* children to this level (that is, splice them in to
* the current child list.)
*
* Do not promote the children if doing so would
* promote them to the root set -- unless there is
* only one child, in which case, do.
*/
/* a container without parent cannot be spliced; one whose parent has
* no parent itself is only spliced when it has exactly one child */
if (!c->parent || (!c->parent->parent && container_child_count(c) != 1))
return TRUE;
c->flags |= CONTAINER_FLAG_SPLICE;
return TRUE;
}
/* step 4: prune the empty (message-less) containers from the tree that
 * starts at the sibling list 'container'. The list is temporarily put
 * under a dummy parent, so root-level containers can be pruned like any
 * other; a NULL list is left alone. */
static void
prune_empty_containers (Container *container)
{
	Container *dummy, *cur;

	/* nothing to prune; adding NULL as a child would only trigger a
	 * critical warning */
	if (!container)
		return;

	dummy = container_new (NULL, 0, "DUMMY");
	container_add_child (dummy, container);

	container_foreach (dummy, (ContainerForeachFunc)prune_maybe, NULL);

	/* what is left under the dummy is the new root set; make its
	 * members parent-less again before the dummy goes away */
	for (cur = dummy->child; cur; cur = cur->next)
		cur->parent = NULL;

	container_destroy (dummy);
}
#if 0
/* NOTE(review): disabled, unfinished code; it still takes a GSList and
* uses the _msg and _children fields, which the current Container does
* not have, and subject_table is never filled */
/* 5. group root set by subject */
static void
group_root_set_by_subject (GSList *root_set)
{
GHashTable *subject_table;
GSList *cur;
/* A: Construct a new hash table, subject_table, which
* associates subject strings with Container objects. */
subject_table = g_hash_table_new (g_str_hash, g_str_equal);
for (cur = root_set; cur; cur = g_slist_next (cur)) {
const char *subject, *subj;
/* subject without Re: Fwd: etc. */
/* Find the subject of that sub-tree: */
Container *c;
c = (Container*)cur->data;
if (c->_msg)
/* (i) if there is a message in the Container, the
* subject is the subject of that message. */
subject = mu_msg_get_subject (c->_msg);
else
/* (ii )If there is no message in the Container,
* then the Container will have at least one
* child Container, and that Container will
* have a message. Use the subject of that
* message instead. */
subject = mu_msg_get_subject (
((Container*)(c->_children->data))->_msg);
/* (iii) Strip ``Re:'', ``RE:'', ``RE[5]:'', ``Re:
* Re[4]: Re:'' and so on. */
subj = subject ? mu_str_subject_normalize (subject) : NULL;
/* (iv )If the subject is now "", give up on this
* Container. */
if (mu_str_is_empty (subj))
continue;
}
}
#endif
/* GCompareFunc-style comparison of two containers by the date of their
 * message: negative when c1 sorts before c2, positive when it sorts
 * after it, 0 when they are equal. Containers without a message sort
 * after those with one. */
G_GNUC_UNUSED static gint
cmp_dates (Container *c1, Container *c2)
{
	MuMsg *m1, *m2;

	m1 = c1->msg;
	m2 = c2->msg;

	if (!m1)
		return m2 ? 1 : 0;
	if (!m2)
		return -1; /* the mirror image of the case above */

	/* compare instead of subtract: the difference of two dates need
	 * not fit in a gint (presumably the dates are time_t values --
	 * TODO confirm) */
	if (mu_msg_get_date (m1) < mu_msg_get_date (m2))
		return -1;

	return mu_msg_get_date (m1) > mu_msg_get_date (m2) ? 1 : 0;
}
/* sort the sibling list starting at c, and recursively the children of
 * every one of those siblings, using cmp_dates; returns the new first
 * sibling (NULL for an empty list) */
G_GNUC_UNUSED static Container*
sort_by_date (Container *c)
{
	GSList *lst, *cur;
	Container *sib, *tail;

	if (!c)
		return NULL;

	/* sort the children of *all* siblings, not just those of the
	 * first one, while collecting the siblings in a list */
	for (lst = NULL, sib = c; sib; sib = sib->next) {
		sib->child = sort_by_date (sib->child); /* recurse! */
		lst = g_slist_prepend (lst, sib);
	}

	lst = g_slist_sort (lst, (GCompareFunc)cmp_dates);

	/* chain the containers together again, in their sorted order */
	c = tail = (Container*)lst->data;
	for (cur = g_slist_next (lst); cur; cur = g_slist_next (cur)) {
		sib	   = (Container*)cur->data;
		sib->parent = c->parent;
		tail->next  = sib;
		tail	   = sib;
	}
	tail->next = NULL;

	g_slist_free (lst);

	return c;
}
/* create a new thread-info for the given threadpath; the new object
 * stores the threadpath pointer itself (it is released again in
 * thread_info_destroy), and the booleans are turned into the
 * corresponding MU_MSG_ITER_THREAD_PROP_* bits */
static MuMsgIterThreadInfo*
thread_info_new (gchar *threadpath, gboolean root,
		 gboolean child, gboolean empty_parent, gboolean is_dup)
{
	MuMsgIterThreadInfo *info;

	info = g_slice_new (MuMsgIterThreadInfo);

	info->threadpath = threadpath;
	info->prop	 = 0;

	if (root)
		info->prop |= MU_MSG_ITER_THREAD_PROP_ROOT;
	if (child)
		info->prop |= MU_MSG_ITER_THREAD_PROP_FIRST_CHILD;
	if (empty_parent)
		info->prop |= MU_MSG_ITER_THREAD_PROP_EMPTY_PARENT;
	if (is_dup)
		info->prop |= MU_MSG_ITER_THREAD_PROP_DUP;

	return info;
}
/* free a thread-info together with its threadpath; NULL is accepted */
static void
thread_info_destroy (MuMsgIterThreadInfo *ti)
{
	if (ti == NULL)
		return;

	g_free (ti->threadpath);
	g_slice_free (MuMsgIterThreadInfo, ti);
}
/* user data handed to add_thread_info while walking the containers */
struct _ThreadInfo {
GHashTable *hash; /* docid => MuMsgIterThreadInfo; the table being filled */
const char* format; /* format string for one thread-path segment */
};
typedef struct _ThreadInfo ThreadInfo;
/* derive the thread properties of c from its place in the tree, and
 * register them, together with threadpath, under the docid of c; the
 * threadpath pointer is handed over to the new thread-info */
static void
add_to_thread_info_hash (GHashTable *thread_info_hash, Container *c,
			 char *threadpath)
{
	gboolean is_root, child, empty_parent, is_dup;

	if (c->parent == NULL) {
		/* no parent: this container is part of the root set */
		is_root	     = TRUE;
		child	     = FALSE;
		empty_parent = FALSE;
	} else {
		is_root	     = FALSE;
		child	     = (c->parent->child == c);
		empty_parent = (!c->parent->msg);
	}

	is_dup = c->flags & CONTAINER_FLAG_DUP;

	g_hash_table_insert (thread_info_hash,
			     GUINT_TO_POINTER(c->docid),
			     thread_info_new (threadpath, is_root, child,
					      empty_parent, is_dup));
}
/* devise a printf format string ("%0<N>x") for one thread-path segment,
 * with <N> the minimum number of hex digits that fits the numbers
 * 0 .. matchnum - 1, and at least 1.
 *
 * Returns static memory, which is overwritten by the next call; not
 * thread-safe. */
const char*
thread_segment_format_string (size_t matchnum)
{
	static char frmt[16];
	unsigned digitnum;
	size_t maxval;

	/* count the hex digits of the biggest number we need to show;
	 * done with integers, since log(0) is not a usable number and
	 * floating-point rounding could give one digit too many */
	maxval = matchnum > 0 ? matchnum - 1 : 0;
	for (digitnum = 1; maxval > 0xf; maxval >>= 4)
		++digitnum;

	snprintf (frmt, sizeof(frmt), "%%0%ux", digitnum);

	return frmt;
}
/* callback for container_path_foreach: turn the current path into a
 * string and register it for container c in the hash of ti; always
 * returns TRUE */
static gboolean
add_thread_info (Container *c, ThreadInfo *ti, Path *path)
{
	add_to_thread_info_hash (ti->hash, c,
				 path_to_string (path, ti->format));

	return TRUE;
}
/* walk over the tree starting at root_set, and return a new hash table
 * that maps the docid of each container to its thread-info; the table
 * frees the thread-infos when they are removed or when it is destroyed */
GHashTable*
create_doc_id_thread_path_hash (Container *root_set, size_t matchnum)
{
	ThreadInfo ti;
	GHashTable *docid_hash;

	/* create hash docid => thread-info */
	docid_hash = g_hash_table_new_full (g_direct_hash, g_direct_equal,
					    NULL,
					    (GDestroyNotify)thread_info_destroy);

	ti.hash	  = docid_hash;
	ti.format = thread_segment_format_string (matchnum);

	container_path_foreach (root_set,
				(ContainerPathForeachFunc)add_thread_info,
				&ti);

	return docid_hash;
}
/* create a new, unlinked container; msg may be NULL (an empty
 * container), but a message requires a non-zero docid. The container
 * takes a reference on msg; msgid is stored by pointer, not copied */
static Container*
container_new (MuMsg *msg, guint docid, const char *msgid)
{
	Container *self;

	g_return_val_if_fail (!msg || docid != 0, NULL);

	self	    = g_slice_new0 (Container);
	self->msg   = msg ? mu_msg_ref (msg) : NULL;
	self->docid = docid;
	self->msgid = msgid;

	return self;
}
/* free container c and drop its reference on the message; children
 * and siblings are left alone. NULL is accepted */
static void
container_destroy (Container *c)
{
	if (c == NULL)
		return;

	if (c->msg != NULL)
		mu_msg_unref (c->msg);

	g_slice_free (Container, c);
}
/* append sibling -- and everything chained after it -- to the end of the
 * sibling list c is part of; the appended containers get the parent of c */
static void
container_add_sibling (Container *c, Container *sibling)
{
	Container *it, *tail;

	g_return_if_fail (c);
	g_return_if_fail (sibling);
	g_return_if_fail (c != sibling);

	it = sibling;
	while (it) {
		it->parent = c->parent;
		it = it->next;
	}

	tail = c;
	while (tail->next)
		tail = tail->next;

	tail->next = sibling;
}
/* make child -- and everything chained after it -- the last children of c */
static void
container_add_child (Container *c, Container *child)
{
	Container *it, *tail;

	g_return_if_fail (c);
	g_return_if_fail (child);
	g_return_if_fail (c != child);

	it = child;
	while (it) {
		it->parent = c;
		it = it->next;
	}

	if (c->child == NULL) {
		c->child = child; /* the first children */
		return;
	}

	tail = c->child;
	while (tail->next)
		tail = tail->next;

	tail->next = child;
}
/* take child out of the child list of c; child must not have children
 * itself. The parent and next fields of child are not touched */
static void
container_remove_child (Container *c, Container *child)
{
	Container *it, *before;

	g_return_if_fail (c);
	g_return_if_fail (child);
	g_return_if_fail (!child->child);
	g_return_if_fail (c != child);

	before = NULL;
	it     = c->child;

	while (it) {
		if (it == child) {
			if (before)
				before->next = it->next;
			else
				c->child = it->next; /* it was the first */
		}
		before = it;
		it     = it->next;
	}
}
/* call func for c and the siblings after it, each time followed by its
 * children (one level deeper); path is updated before every call so it
 * reflects the position of the container func is called for */
static void
container_path_foreach_real (Container *c, guint level, Path *path,
			     ContainerPathForeachFunc func, gpointer user_data)
{
	/* siblings */
	for (; c; c = c->next) {

		path_inc (path, level);
		func (c, user_data, path);

		/* children */
		container_path_foreach_real (c->child, level + 1, path,
					     func, user_data);
	}
}
/* call func for every container in the tree that starts at c, passing it
 * the thread path of that container */
static void
container_path_foreach (Container *c, ContainerPathForeachFunc func,
			gpointer user_data)
{
	Path *thread_path = path_new (100);

	container_path_foreach_real (c, 0, thread_path, func, user_data);

	path_destroy (thread_path);
}
/* call func for c, its siblings and everything below them: for each
 * container, first its children are handled, then the siblings after
 * it, and only then the container itself. Stops as soon as func returns
 * FALSE, and returns FALSE in that case */
static gboolean
container_foreach (Container *c, ContainerForeachFunc func, gpointer user_data)
{
	if (c == NULL)
		return TRUE;

	/* recurse into the children, then into the siblings */
	if (container_foreach (c->child, func, user_data) &&
	    container_foreach (c->next, func, user_data))
		return func (c, user_data);

	return FALSE;
}
/* replace child by its own children: they are appended to the children
 * of parent, after which the (now childless) child is removed from it */
static void
container_splice (Container *parent, Container *child)
{
	Container *promoted;

	g_return_if_fail (parent);
	g_return_if_fail (child);
	g_return_if_fail (parent != child);

	promoted = child->child;
	container_add_child (parent, promoted);

	child->child = NULL;
	container_remove_child (parent, child);
}
/* return the number of direct children of c (0 when c is NULL) */
size_t
container_child_count (Container *c)
{
	size_t num = 0;
	Container *kid;

	g_return_val_if_fail (c, 0);

	kid = c->child;
	while (kid) {
		++num;
		kid = kid->next;
	}

	return num;
}
/* callback for container_foreach: TRUE when a and b are not the same
 * container, so the traversal stops at the first container equal to b */
static gboolean
different_container (Container *a, Container *b)
{
	if (a == b)
		return FALSE;

	return TRUE;
}
/* TRUE when needle is found by container_foreach starting at haystack;
 * note that this traversal covers the siblings after haystack (and what
 * is below them) as well */
static gboolean
container_reachable (Container *haystack, Container *needle)
{
	gboolean all_differ;

	all_differ = container_foreach (haystack,
					(ContainerForeachFunc)different_container,
					needle);

	return !all_differ;
}
/* print one line describing container c (debugging aid); always returns
 * TRUE so it can be used as a container_foreach callback */
static gboolean
dump_container (Container *c)
{
	const gchar *subject;

	if (c == NULL) {
		g_print ("<empty>\n");
		return TRUE;
	}

	if (c->msg)
		subject = mu_msg_get_subject (c->msg);
	else
		subject = "<none>";

	g_print ("[%s][%s m:%p p:%p docid:%u]\n",
		 c->msgid, subject,
		 (void*)c, (void*)c->parent,
		 c->docid);

	return TRUE;
}
/* print container c (debugging aid); when recursive, everything that
 * container_foreach reaches from c is printed as well */
G_GNUC_UNUSED static void
container_dump (Container *c, gboolean recursive)
{
	if (recursive) {
		container_foreach (c, (ContainerForeachFunc)dump_container,
				   NULL);
		return;
	}

	dump_container (c);
}
/* create a new path with room for 'initial' counters, all set to 0;
 * free it with path_destroy */
static Path*
path_new (guint initial)
{
	Path *path = g_slice_new0 (Path);

	path->_len  = initial;
	path->_data = g_new0 (int, initial);

	return path;
}
/* free path p and its counters; NULL is accepted */
static void
path_destroy (Path *p)
{
	if (p != NULL) {
		g_free (p->_data);
		g_slice_free (Path, p);
	}
}
/* increment the counter at position index, and reset the counter after
 * it to 0; the latter also serves as the end marker that path_to_string
 * looks for. The array grows when needed. */
static void
path_inc (Path *p, guint index)
{
	/* both index and index + 1 are written, so index + 1 must fit */
	if (index + 1 >= p->_len) {

		guint newlen;

		/* keep doubling until it fits; start at 1 so an empty
		 * array can grow as well */
		newlen = p->_len ? p->_len : 1;
		while (index + 1 >= newlen)
			newlen *= 2;

		p->_data = g_renew (int, p->_data, newlen);

		/* clear all of the new part; memset counts in bytes, not
		 * in elements */
		memset (&p->_data[p->_len], 0,
			(newlen - p->_len) * sizeof (int));

		p->_len = newlen;
	}

	++p->_data[index];
	p->_data[index + 1] = 0;
}
/* turn path p into a string of ':'-separated segments, one for each
 * counter up to the first one that is 0; every segment is the counter
 * minus one, formatted with frmt (a printf format taking one int).
 *
 * Returns a newly allocated string (free with g_free), or NULL when the
 * path has no counters or no segments. */
static gchar*
path_to_string (Path *p, const char* frmt)
{
	GString *gstr;
	guint u;

	if (!p->_data)
		return NULL;

	gstr = g_string_new (NULL);

	/* never look beyond the allocated counters, even when the
	 * terminating 0 is missing */
	for (u = 0; u < p->_len && p->_data[u] != 0; ++u) {

		char segm[16];

		snprintf (segm, sizeof(segm), frmt, p->_data[u] - 1);

		if (u > 0)
			g_string_append_c (gstr, ':');
		g_string_append (gstr, segm);
	}

	if (u == 0) { /* no segments at all */
		g_string_free (gstr, TRUE);
		return NULL;
	}

	/* hand over the character data, free only the GString itself */
	return g_string_free (gstr, FALSE);
}

453
src/mu-threader-utils.c Normal file
View File

@ -0,0 +1,453 @@
/*
** Copyright (C) 2011 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include <string.h> /* for memset */
#include "mu-threader-utils.h"
#include "mu-msg.h"
/* growable array of per-level counters, from which thread paths are built */
struct _Path {
int *_data; /* the counters; allocated in path_new, grown in path_inc */
guint _len; /* number of ints allocated for _data */
};
/* create a new, unlinked container; msg may be NULL (an empty
 * container), but a message requires a non-zero docid. The container
 * takes a reference on msg; msgid is stored by pointer, not copied */
Container*
container_new (MuMsg *msg, guint docid, const char *msgid)
{
	Container *self;

	g_return_val_if_fail (!msg || docid != 0, NULL);

	self	    = g_slice_new0 (Container);
	self->msg   = msg ? mu_msg_ref (msg) : NULL;
	self->docid = docid;
	self->msgid = msgid;

	return self;
}
/* free container c and drop its reference on the message; children
 * and siblings are left alone. NULL is accepted */
void
container_destroy (Container *c)
{
	if (c == NULL)
		return;

	if (c->msg != NULL)
		mu_msg_unref (c->msg);

	g_slice_free (Container, c);
}
/* set the parent of c and of all siblings chained after it */
static void
set_parent (Container *c, Container *parent)
{
	Container *it;

	for (it = c; it; it = it->next)
		it->parent = parent;
}
/* return the last container in the sibling list that starts at c;
 * NULL when c is NULL */
static Container*
find_last (Container *c)
{
	if (c == NULL)
		return NULL;

	for (; c->next; c = c->next)
		;

	return c;
}
/* callback for container_foreach (see assert_no_duplicates): remember c
 * in hash; when it is in there already, it was visited twice, which is
 * reported and asserted upon */
static gboolean
check_dup (Container *c, GHashTable *hash)
{
	if (!g_hash_table_lookup (hash, c)) {
		g_hash_table_insert (hash, c, GUINT_TO_POINTER(TRUE));
		return TRUE;
	}

	g_warning ("ALREADY!!");
	container_dump (c, TRUE);
	g_assert (0);

	return TRUE;
}
/* debugging aid: verify that container_foreach, starting at c, does not
 * visit any container more than once */
G_GNUC_UNUSED static void
assert_no_duplicates (Container *c)
{
	GHashTable *seen;

	seen = g_hash_table_new (g_direct_hash, g_direct_equal);

	container_foreach (c, (ContainerForeachFunc)check_dup, seen);

	g_hash_table_destroy (seen);
}
/* append sibling -- and everything chained after it -- to the end of the
 * sibling list starting at c; the appended containers get the parent of
 * c. Returns c, or NULL when a precondition fails */
Container*
container_append_siblings (Container *c, Container *sibling)
{
	Container *tail;

	g_assert (c);

	g_return_val_if_fail (c, NULL);
	g_return_val_if_fail (sibling, NULL);
	g_return_val_if_fail (c != sibling, NULL);

	set_parent (sibling, c->parent);

	tail	   = find_last (c);
	tail->next = sibling;

	return c;
}
/* take sibling out of the sibling list starting at c, and return the
 * (possibly new) start of that list; the list is returned unchanged
 * when sibling is not in it. The fields of sibling itself are not
 * touched */
Container*
container_remove_sibling (Container *c, Container *sibling)
{
	Container **link;

	g_return_val_if_fail (c, NULL);
	g_return_val_if_fail (sibling, NULL);

	/* 'link' is the pointer that leads to the current container:
	 * first the list head, after that the next field of the
	 * container before it */
	for (link = &c; *link; link = &(*link)->next) {
		if (*link == sibling) {
			*link = sibling->next;
			break;
		}
	}

	return c;
}
/* make child -- and everything chained after it -- the last children of
 * c. Returns c, or NULL when a precondition fails */
Container*
container_append_children (Container *c, Container *child)
{
	g_return_val_if_fail (c, NULL);
	g_return_val_if_fail (child, NULL);
	g_return_val_if_fail (c != child, NULL);

	set_parent (child, c);

	c->child = c->child ?
		container_append_siblings (c->child, child) : child;

	return c;
}
/* take child out of the child list of c; child must not have children
 * itself. Returns c, or NULL when a precondition fails */
Container*
container_remove_child (Container *c, Container *child)
{
	Container *remaining;

	g_return_val_if_fail (c, NULL);
	g_return_val_if_fail (child, NULL);

	g_assert (!child->child);
	g_return_val_if_fail (!child->child, NULL);
	g_return_val_if_fail (c != child, NULL);

	remaining = container_remove_sibling (c->child, child);
	c->child  = remaining;

	return c;
}
/* call func for c and the siblings after it, each time followed by its
 * children (one level deeper); path is updated before every call so it
 * reflects the position of the container func is called for */
void
container_path_foreach_real (Container *c, guint level, Path *path,
			     ContainerPathForeachFunc func, gpointer user_data)
{
	/* siblings */
	for (; c; c = c->next) {

		path_inc (path, level);
		func (c, user_data, path);

		/* children */
		container_path_foreach_real (c->child, level + 1, path,
					     func, user_data);
	}
}
/* call func for every container in the tree that starts at c, passing it
 * the thread path of that container */
void
container_path_foreach (Container *c, ContainerPathForeachFunc func,
			gpointer user_data)
{
	Path *thread_path = path_new (100);

	container_path_foreach_real (c, 0, thread_path, func, user_data);

	path_destroy (thread_path);
}
/* call func for c, its siblings and everything below them: for each
 * container, first its children are handled, then the siblings after
 * it, and only then the container itself. Stops as soon as func returns
 * FALSE, and returns FALSE in that case */
gboolean
container_foreach (Container *c, ContainerForeachFunc func, gpointer user_data)
{
	if (c == NULL)
		return TRUE;

	/* recurse into the children, then into the siblings */
	if (container_foreach (c->child, func, user_data) &&
	    container_foreach (c->next, func, user_data))
		return func (c, user_data);

	return FALSE;
}
/* replace child by its own children: child is removed from the children
 * of parent, and the former children of child are appended to them.
 *
 * Returns parent, or NULL when a precondition fails. */
Container*
container_splice_children (Container *parent, Container *child)
{
	Container *newchild;

	g_return_val_if_fail (parent, NULL);
	g_return_val_if_fail (child, NULL);
	g_return_val_if_fail (parent != child, NULL);

	/* detach the grandchildren first; container_remove_child only
	 * accepts childless containers */
	newchild     = child->child;
	child->child = NULL;

	container_remove_child (parent, child);

	/* a child without children is simply removed; appending 'nothing'
	 * would fail the precondition of container_append_children and
	 * make us return NULL */
	if (!newchild)
		return parent;

	return container_append_children (parent, newchild);
}
/* return a new GSList with c and the siblings after it; since the
 * containers are prepended, the list is in reverse sibling order. Free
 * the list with g_slist_free */
GSList*
container_to_list (Container *c)
{
	GSList *result = NULL;

	while (c) {
		result = g_slist_prepend (result, c);
		c = c->next;
	}

	return result;
}
/* chain the containers in lst together as siblings, in list order, and
 * return the first one (NULL for an empty list); the list itself is not
 * changed or freed */
static Container*
container_from_list (GSList *lst)
{
	Container *first, *tail;

	if (!lst)
		return NULL;

	first = tail = (Container*)lst->data;

	for (lst = g_slist_next (lst); lst; lst = g_slist_next (lst)) {
		tail->next = (Container*)lst->data;
		tail	   = tail->next;
	}

	tail->next = NULL; /* the last one ends the chain */

	return first;
}
/* what sort_func_wrapper needs to know; passed around as sort user data */
struct _SortFuncData {
GCompareDataFunc func; /* the comparison function to call */
gboolean invert; /* if TRUE, func is called with its arguments swapped */
gpointer user_data; /* passed to func as its last argument */
};
typedef struct _SortFuncData SortFuncData;
/* compare containers a and b with the function in data; a container
 * without a message is represented by the first container that has one
 * (or that has no children) when following the first-child links down */
static int
sort_func_wrapper (Container *a, Container *b, SortFuncData *data)
{
	Container *lhs, *rhs;

	lhs = a;
	while (lhs->msg == NULL && lhs->child != NULL)
		lhs = lhs->child;

	rhs = b;
	while (rhs->msg == NULL && rhs->child != NULL)
		rhs = rhs->child;

	return data->invert ?
		data->func (rhs, lhs, data->user_data) :
		data->func (lhs, rhs, data->user_data);
}
/* sort the sibling list starting at c, after sorting the children of
 * each of those siblings (recursively); returns the new first sibling */
static Container*
container_sort_real (Container *c, SortFuncData *sfdata)
{
	GSList *siblings;
	Container *it;

	if (c == NULL)
		return NULL;

	/* first, the levels below this one */
	it = c;
	while (it) {
		if (it->child)
			it->child = container_sort_real (it->child, sfdata);
		it = it->next;
	}

	/* then, the siblings themselves */
	siblings = container_to_list (c);
	siblings = g_slist_sort_with_data (siblings,
					   (GCompareDataFunc)sort_func_wrapper,
					   sfdata);
	c = container_from_list (siblings);
	g_slist_free (siblings);

	return c;
}
/* sort the tree that starts at c, level by level, using func (which
 * gets user_data as its last argument); with invert, the arguments of
 * func are swapped. Returns the new first container of the top level */
Container *
container_sort (Container *c, GCompareDataFunc func, gpointer user_data,
		gboolean invert)
{
	SortFuncData sfdata;

	sfdata.func	 = func;
	sfdata.invert	 = invert;
	sfdata.user_data = user_data;

	return container_sort_real (c, &sfdata);
}
/* callback for container_foreach: TRUE when a and b are not the same
 * container, so the traversal stops at the first container equal to b */
static gboolean
unequal (Container *a, Container *b)
{
	if (a == b)
		return FALSE;

	return TRUE;
}
/* TRUE when needle is found by container_foreach starting at haystack;
 * note that this traversal covers the siblings after haystack (and what
 * is below them) as well */
gboolean
container_reachable (Container *haystack, Container *needle)
{
	gboolean all_differ;

	all_differ = container_foreach (haystack,
					(ContainerForeachFunc)unequal,
					needle);

	return all_differ ? FALSE : TRUE;
}
/* print one line describing container c (debugging aid); always returns
 * TRUE so it can be used as a container_foreach callback */
static gboolean
dump_container (Container *c)
{
	const gchar *subject, *path;

	if (c == NULL) {
		g_print ("<empty>\n");
		return TRUE;
	}

	if (c->msg)
		subject = mu_msg_get_subject (c->msg);
	else
		subject = "<none>";

	path = c->msg ? mu_msg_get_path (c->msg) : "";

	g_print ("[%s][%s m:%p p:%p docid:%u %s]\n",
		 c->msgid, subject,
		 (void*)c, (void*)c->parent,
		 c->docid, path);

	return TRUE;
}
/* print container c (debugging aid); when recursive, everything that
 * container_foreach reaches from c is printed as well */
void
container_dump (Container *c, gboolean recursive)
{
	if (recursive) {
		container_foreach (c, (ContainerForeachFunc)dump_container,
				   NULL);
		return;
	}

	dump_container (c);
}
/* create a new path with room for 'initial' counters, all set to 0;
 * free it with path_destroy */
Path*
path_new (guint initial)
{
	Path *path = g_slice_new0 (Path);

	path->_len  = initial;
	path->_data = g_new0 (int, initial);

	return path;
}
/* free path p and its counters; NULL is accepted */
void
path_destroy (Path *p)
{
	if (p != NULL) {
		g_free (p->_data);
		g_slice_free (Path, p);
	}
}
/* increment the counter at position index, and reset the counter after
 * it to 0; the latter also serves as the end marker that path_to_string
 * looks for. The array grows when needed. */
void
path_inc (Path *p, guint index)
{
	/* both index and index + 1 are written, so index + 1 must fit */
	if (index + 1 >= p->_len) {

		guint newlen;

		/* keep doubling until it fits; start at 1 so an empty
		 * array can grow as well */
		newlen = p->_len ? p->_len : 1;
		while (index + 1 >= newlen)
			newlen *= 2;

		p->_data = g_renew (int, p->_data, newlen);

		/* clear all of the new part; memset counts in bytes, not
		 * in elements */
		memset (&p->_data[p->_len], 0,
			(newlen - p->_len) * sizeof (int));

		p->_len = newlen;
	}

	++p->_data[index];
	p->_data[index + 1] = 0;
}
/* turn path p into a string of ':'-separated segments, one for each
 * counter up to the first one that is 0; every segment is the counter
 * minus one, formatted with frmt (a printf format taking one int).
 *
 * Returns a newly allocated string (free with g_free), or NULL when the
 * path has no counters or no segments. */
gchar*
path_to_string (Path *p, const char* frmt)
{
	GString *gstr;
	guint u;

	if (!p->_data)
		return NULL;

	gstr = g_string_new (NULL);

	/* never look beyond the allocated counters, even when the
	 * terminating 0 is missing */
	for (u = 0; u < p->_len && p->_data[u] != 0; ++u) {

		char segm[16];

		snprintf (segm, sizeof(segm), frmt, p->_data[u] - 1);

		if (u > 0)
			g_string_append_c (gstr, ':');
		g_string_append (gstr, segm);
	}

	if (u == 0) { /* no segments at all */
		g_string_free (gstr, TRUE);
		return NULL;
	}

	/* hand over the character data, free only the GString itself */
	return g_string_free (gstr, FALSE);
}

84
src/mu-threader-utils.h Normal file
View File

@ -0,0 +1,84 @@
/*
** Copyright (C) 2011 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#ifndef __MU_THREADER_UTILS_H__
#define __MU_THREADER_UTILS_H__
#include <glib.h>
#include <mu-msg.h>
/*
* path data structure, to determine the thread paths mentioned
* above
* */
/* opaque here: only this forward declaration of the struct is given */
struct _Path;
typedef struct _Path Path;
/* allocate a path with room for 'initial' counters */
Path* path_new (guint initial);
/* free a path */
void path_destroy (Path *p);
/* update the counter at position 'index' */
void path_inc (Path *p, guint index);
/* render the path as a string, using frmt for each segment */
gchar* path_to_string (Path *p, const char* frmt);
/* Container data structure, as seen in the JWZ-doc*
*
*/
/* bit flags stored in Container.flags */
/* NOTE(review): the names suggest 'to be deleted', 'to be spliced' and
* 'duplicate message'; confirm against the code that sets them */
enum _ContainerFlag {
CONTAINER_FLAG_NONE = 0,
CONTAINER_FLAG_DELETE = 1 << 0,
CONTAINER_FLAG_SPLICE = 1 << 1,
CONTAINER_FLAG_DUP = 1 << 2
};
/* the flags are stored in 8 bits rather than in the enum type itself */
typedef guint8 ContainerFlag;
/* node in the thread tree, linked to its parent, first child and next sibling */
struct _Container {
struct _Container *parent, *child, *next;
ContainerFlag flags; /* bitwise OR of CONTAINER_FLAG_* values */
MuMsg *msg; /* the message, if any */
guint docid; /* docid that goes with msg */
const char* msgid; /* message-id string */
};
typedef struct _Container Container;
/* container life cycle */
Container* container_new (MuMsg *msg, guint docid, const char* msgid);
void container_destroy (Container *c);
/* tree manipulation; these return a Container* rather than void */
Container* container_append_children (Container *c, Container *child);
Container* container_append_siblings (Container *c, Container *sibling);
Container* container_remove_child (Container *c, Container *child);
Container* container_remove_sibling (Container *c, Container *sibling);
Container* container_splice_children (Container *parent, Container *child);
/* callback type for container_foreach */
typedef gboolean (*ContainerForeachFunc) (Container*, gpointer);
gboolean container_foreach (Container *c,
ContainerForeachFunc func,
gpointer user_data);
/* callback type for container_path_foreach; also receives a Path */
typedef void (*ContainerPathForeachFunc) (Container*, gpointer, Path*);
void container_path_foreach (Container *c,
ContainerPathForeachFunc func,
gpointer user_data);
gboolean container_reachable (Container *haystack, Container *needle);
void container_dump (Container *c, gboolean recursive);
/* NOTE(review): container_sort takes a GCompareDataFunc, not this type */
typedef int (*ContainerCmpFunc) (Container *c1, Container *c2, gpointer user_data);
Container * container_sort (Container *c, GCompareDataFunc func,
gpointer user_data, gboolean invert);
#endif /*__MU_THREADER_UTILS_H__*/

544
src/mu-threader.c Normal file
View File

@ -0,0 +1,544 @@
/* -*-mode: c; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-*/
/*
** Copyright (C) 2011 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include <math.h> /* for log, ceil */
#include <string.h> /* for memset */
#include "mu-threader.h"
#include "mu-threader-utils.h"
#include "mu-str.h"
/* msg threading implementation based on JWZ's algorithm, as described in:
* http://www.jwz.org/doc/threading.html
*
* the implementation follows the terminology from that doc, so should
* be understandable from that... I did change things a bit though
*
* the end result of the threading operation is a hashtable which maps
* docids (ie., Xapian documents == messages) to 'thread paths'; a
* thread path is a string denoting the 2-dimensional place of a
* message in a list of messages,
*
* Msg1 => 00000
* Msg2 => 00001
* Msg3 (child of Msg2) => 00001:00000
* Msg4 (child of Msg2) => 00001:00001
* Msg5 (child of Msg4) => 00001:00001:00000
* Msg6 => 00002
*
* the padding-0's are added to make them easy to sort using strcmp;
* the numbers are hexadecimal, and the length of the 'segments'
* (the parts separated by the ':') is equal to ceil(log_16(matchnum))
*
*/
/* forward declarations for the helpers used by mu_threader_calculate;
 * the 'step N' labels refer to the steps of the JWZ algorithm */
/* step 1 */ static GHashTable* create_containers (MuMsgIter *iter);
/* step 2 */ static Container *find_root_set (GHashTable *ids);
/* step 4 */
static Container* prune_empty_containers (Container *root);
/* step 5 (grouping by subject) is not implemented yet: */
/* static void group_root_set_by_subject (GSList *root_set); */
/* note: declared without 'static', so this one has external linkage */
GHashTable* create_doc_id_thread_path_hash (Container *root, size_t match_num);
/* comparison function passed to container_sort for the root set */
static gint cmp_dates (Container *c1, Container *c2);
/* msg threading algorithm, based on JWZ's algorithm,
 * http://www.jwz.org/doc/threading.html
 *
 * @iter: iterator over the messages to thread; it is reset before and
 * after the containers are built
 * @matchnum: the number of matches; only used to determine the width
 * of the thread-path segments
 *
 * returns a hash table mapping docids to MuMsgIterThreadInfo structs,
 * to be freed with g_hash_table_destroy, or NULL if @iter is NULL
 */
GHashTable*
mu_threader_calculate (MuMsgIter *iter, size_t matchnum)
{
	GHashTable *id_table, *thread_ids;
	Container *root_set;

	/* this function returns a pointer, so the failure value is
	 * NULL rather than FALSE */
	g_return_val_if_fail (iter, NULL);

	/* step 1 */
	id_table = create_containers (iter);

	/* step 2 -- the root_set is the list of containers without
	 * a parent */
	root_set = find_root_set (id_table);

	/* step 3 (discarding id_table) is postponed until the end;
	 * the table is still needed while the containers are in use */

	/* step 4: prune empty containers */
	root_set = prune_empty_containers (root_set);

	/* sort root set */
	root_set = container_sort (root_set, (GCompareDataFunc)cmp_dates,
				   NULL, FALSE);

	/* step 5: group root set by subject -- not implemented yet
	 * (group_root_set_by_subject) */

	mu_msg_iter_reset (iter); /* go all the way back */

	/* finally, deliver the docid => thread-path hash */
	thread_ids = create_doc_id_thread_path_hash (root_set,
						     matchnum);

	g_hash_table_destroy (id_table); /* step 3 */

	return thread_ids;
}
/* debugging callback for assert_no_duplicates, invoked for each
 * (msgid, container) pair of the id table: remember container @c in
 * @hash, or complain loudly (and assert) when it was seen before;
 * @msgid is not used */
static void
check_dup (const char *msgid, Container *c, GHashTable *hash)
{
	if (!g_hash_table_lookup (hash, c)) {
		/* first encounter of this container; register it */
		g_hash_table_insert (hash, c, GUINT_TO_POINTER(TRUE));
		return;
	}

	/* the same container is stored under more than one key */
	g_warning ("ALREADY!!");
	container_dump (c, FALSE);
	g_assert (0);
}
/* debugging check: verify that no container is stored more than once
 * (under different keys) in @ids; this walks the whole table on every
 * call */
static void
assert_no_duplicates (GHashTable *ids)
{
	/* temporary set of the container pointers seen so far */
	GHashTable *seen = g_hash_table_new (g_direct_hash, g_direct_equal);

	g_hash_table_foreach (ids, (GHFunc)check_dup, seen);

	g_hash_table_destroy (seen);
}
/* a referred message is a message that is referred to by some other
 * message; look up the container for @msgid in @id_table, and when
 * there is none yet, create (and register) a container without a
 * message. *created is set to TRUE if a new container was made, FALSE
 * otherwise; it is left untouched when @msgid is NULL */
static Container*
find_or_create_referred (GHashTable *id_table, const char *msgid,
			 gboolean *created)
{
	Container *found;

	g_return_val_if_fail (msgid, NULL);

	found = g_hash_table_lookup (id_table, msgid);
	if (found) {
		*created = FALSE;
		return found;
	}

	/* not known yet: register a message-less placeholder */
	*created = TRUE;
	found	 = container_new (NULL, 0, msgid);
	g_hash_table_insert (id_table, (gpointer)msgid, found);
	assert_no_duplicates (id_table);

	return found;
}
/* find a container for the message-id of @msg; if it does not exist
 * yet, create a new one and register it in @id_table. Messages
 * without a message-id are registered under their path instead.
 *
 * returns the container holding @msg, or NULL when the message should
 * not be processed any further: on invalid arguments, when the
 * message has neither a message-id nor a path, or when it turned out
 * to be a duplicate of a message seen before */
static Container*
find_or_create (GHashTable *id_table, MuMsg *msg, guint docid)
{
	Container *c, *dupc;
	const char *msgid, *path;

	g_return_val_if_fail (msg, NULL);
	g_return_val_if_fail (docid != 0, NULL);

	msgid = mu_msg_get_msgid (msg);
	if (!msgid)
		msgid = mu_msg_get_path (msg); /* fake it */

	/* the id table hashes its keys with g_str_hash, which must
	 * not be handed a NULL string */
	g_return_val_if_fail (msgid, NULL);

	c = g_hash_table_lookup (id_table, msgid);

	/* no container yet: create a new one holding this message,
	 * and index it by message-id in id_table */
	if (!c) {
		c = container_new (msg, docid, msgid);
		g_hash_table_insert (id_table, (gpointer)msgid, c);
		assert_no_duplicates (id_table);
		return c;
	}

	/* id_table contains an empty container for this id: store
	 * this message in the container's message slot */
	if (!c->msg) {
		c->msg	 = mu_msg_ref (msg);
		c->docid = docid;
		return c;
	}

	/* special case, not in the JWZ algorithm: the container
	 * exists already and has a message; this means that we are
	 * seeing *another message* with a message-id we already
	 * saw. Create a container for it, mark it as a duplicate and
	 * make it a child of the one we saw before. The container
	 * gets the literal "<dup>" as its message-id; the message's
	 * path is the key under which it is registered in id_table */
	path = mu_msg_get_path (msg);
	g_return_val_if_fail (path, NULL); /* a NULL key is not usable */

	dupc	    = container_new (msg, docid, "<dup>");
	dupc->flags = CONTAINER_FLAG_DUP;
	c	    = container_append_children (c, dupc);

	g_hash_table_insert (id_table, (gpointer)path, dupc);
	assert_no_duplicates (id_table);

	return NULL; /* don't process this message further */
}
/* decide whether @child may be linked below @parent: both must exist,
 * @child must not have a parent yet, and neither container may
 * already be reachable from the other (which would create a loop).
 * @created is currently not used */
static gboolean
child_elligible (Container *parent, Container *child, gboolean created)
{
	if (!parent || !child || child->parent)
		return FALSE;

	/* refuse links that would introduce a loop, in either
	 * direction */
	if (container_reachable (parent, child) ||
	    container_reachable (child, parent))
		return FALSE;

	return TRUE;
}
/* step 1B/1C: link the containers for the message-ids in the
 * References field of the message in @c into a parent -> child chain
 * (in the order of the header), and then hang @c itself below the
 * last of them.
 *
 * following JWZ, existing links are never changed, and no link is
 * added if it would introduce a loop; child_elligible checks for
 * both. Note that, unlike the JWZ description, a container that
 * already has a parent keeps that parent, even when the References
 * field suggests another one. */
static void /* 1B */
handle_references (GHashTable *id_table, Container *c)
{
	const GSList *refs, *ref;
	Container *prev;
	gboolean created;

	refs = mu_msg_get_references (c->msg);
	if (!refs)
		return; /* nothing to do */

	created = FALSE;
	prev	= NULL;

	/* walk over all references; for each message-id, use the
	 * container in id_table if there is one, otherwise a new one
	 * with a null message is made (and indexed) */
	for (ref = refs; ref; ref = g_slist_next (ref)) {

		Container *referred;

		referred = find_or_create_referred (id_table,
						    (gchar*)ref->data,
						    &created);

		/* make the previous reference the parent of this
		 * one, if that is allowed */
		if (child_elligible (prev, referred, created))
			container_append_children (prev, referred);

		prev = referred;
	}

	/* 'prev' now is the container for the last reference, i.e.,
	 * the direct parent of the message in c */
	if (child_elligible (prev, c, created))
		container_append_children (prev, c);
}
/* step 1: walk over all messages in @iter, create a container for
 * each of them, connect the containers according to the References
 * fields, and return the table mapping ids to containers. The table
 * does not free its keys, and destroys its containers with
 * container_destroy */
static GHashTable*
create_containers (MuMsgIter *iter)
{
	GHashTable *id_table;

	id_table = g_hash_table_new_full (g_str_hash, g_str_equal,
					  NULL,
					  (GDestroyNotify)container_destroy);

	mu_msg_iter_reset (iter);
	while (!mu_msg_iter_is_done (iter)) {

		/* 1.A */
		MuMsg *msg     = mu_msg_iter_get_msg (iter, NULL);
		unsigned docid = mu_msg_iter_get_docid (iter);
		Container *c   = find_or_create (id_table, msg, docid);

		/* 1.B and C; c is NULL for messages that should not
		 * be processed further (such as duplicates) */
		if (c)
			handle_references (id_table, c);

		mu_msg_iter_next (iter);
	}

	return id_table;
}
/* callback for find_root_set, invoked for each (msgid, container)
 * pair in the id table: add container @c to the list in *root_set if
 * it has no parent; @msgid is not used */
static void
filter_root_set (const gchar *msgid, Container *c, Container **root_set)
{
	if (c->parent)
		return; /* not a root */

	/* the first root becomes the head of the list; later ones
	 * are appended as siblings */
	*root_set = *root_set ?
		container_append_siblings (*root_set, c) : c;
}
/* 2. Walk over the elements of id_table, and gather a list of the
   Container objects that have no parents (whether or not they have
   children); returns NULL if there are none */
static Container*
find_root_set (GHashTable *ids)
{
	Container *roots = NULL;

	g_hash_table_foreach (ids, (GHFunc)filter_root_set, &roots);

	return roots;
}
/* callback for container_foreach (step 4): first handle the children
 * of @c that carry a DELETE or SPLICE flag, then decide whether @c
 * itself must be flagged; always returns TRUE */
static gboolean
prune_maybe (Container *c)
{
	Container *kid;

	/* NOTE(review): this loop follows kid->next after kid may
	 * have been removed / spliced; whether that is safe depends
	 * on what the helpers do with the 'next' field -- confirm */
	for (kid = c->child; kid; kid = kid->next) {
		if (kid->flags & CONTAINER_FLAG_DELETE)
			c = container_remove_child (c, kid);
		else if (kid->flags & CONTAINER_FLAG_SPLICE)
			c = container_splice_children (c, kid);
	}

	/* containers with messages are never touched */
	if (!c->msg) {
		if (!c->child)
			/* A. a msg-less container without children
			 * is marked for deletion */
			c->flags |= CONTAINER_FLAG_DELETE;
		else if (!c->child->next)
			/* B. a msg-less container with exactly one
			 * child is marked for removal, with its
			 * child to be promoted to this level; with
			 * more than one child it is left alone */
			c->flags |= CONTAINER_FLAG_SPLICE;
	}

	return TRUE;
}
/* step 4: flag and prune the empty containers reachable from
 * @root_set, then treat the flagged members of the root set itself;
 * returns the (possibly changed) head of the root set */
static Container*
prune_empty_containers (Container *root_set)
{
	Container *node;

	container_foreach (root_set, (ContainerForeachFunc)prune_maybe, NULL);

	/* and prune the root_set itself... */
	/* NOTE(review): node->next is read after node may have been
	 * removed from the list; confirm this is safe with
	 * container_remove_sibling */
	node = root_set;
	while (node) {
		if (node->flags & CONTAINER_FLAG_DELETE)
			root_set = container_remove_sibling (root_set, node);
		else if (node->flags & CONTAINER_FLAG_SPLICE) {
			/* detach the children and add them to the
			 * root set */
			Container *orphans = node->child;
			node->child = NULL;
			root_set = container_append_siblings (root_set,
							      orphans);
		}
		node = node->next;
	}

	return root_set;
}
/* compare two containers by the date of their messages, for sorting
 * in ascending date order; containers without a message sort after
 * containers that have one.
 *
 * returns a value < 0, 0 or > 0 when @c1 sorts before, equal to or
 * after @c2. The result is consistent when the arguments are swapped,
 * as sorting functions require. */
G_GNUC_UNUSED static gint
cmp_dates (Container *c1, Container *c2)
{
	MuMsg *m1, *m2;
	gint64 d1, d2;

	m1 = c1->msg;
	m2 = c2->msg;

	if (!m1 || !m2) {
		if (!m1 && !m2)
			return 0;
		/* only one of them lacks a message; that one goes
		 * last */
		return m1 ? -1 : 1;
	}

	/* compare rather than subtract: the difference between two
	 * dates does not necessarily fit in a gint */
	d1 = (gint64)mu_msg_get_date (m1);
	d2 = (gint64)mu_msg_get_date (m2);

	return (d1 > d2) - (d1 < d2);
}
/* allocate a new MuMsgIterThreadInfo for @threadpath, with the
 * property bits set according to the boolean parameters; the struct
 * stores the @threadpath pointer itself (no copy), and
 * thread_info_destroy frees it with g_free */
static MuMsgIterThreadInfo*
thread_info_new (gchar *threadpath, gboolean root,
		 gboolean child, gboolean empty_parent, gboolean is_dup)
{
	MuMsgIterThreadInfo *ti;

	ti	       = g_slice_new (MuMsgIterThreadInfo);
	ti->threadpath = threadpath;
	ti->prop       = 0;

	if (root)
		ti->prop |= MU_MSG_ITER_THREAD_PROP_ROOT;
	if (child)
		ti->prop |= MU_MSG_ITER_THREAD_PROP_FIRST_CHILD;
	if (empty_parent)
		ti->prop |= MU_MSG_ITER_THREAD_PROP_EMPTY_PARENT;
	if (is_dup)
		ti->prop |= MU_MSG_ITER_THREAD_PROP_DUP;

	return ti;
}
/* free @ti and the thread path it holds; does nothing when @ti is
 * NULL */
static void
thread_info_destroy (MuMsgIterThreadInfo *ti)
{
	if (!ti)
		return;

	g_free (ti->threadpath);
	g_slice_free (MuMsgIterThreadInfo, ti);
}
/* context handed to add_thread_info while walking the containers */
typedef struct _ThreadInfo {
	GHashTable *hash;   /* docid => MuMsgIterThreadInfo */
	const char *format; /* printf-format for one thread-path segment */
} ThreadInfo;
/* create the thread info for container @c with the given @threadpath,
 * and store it in @thread_info_hash under the docid of @c; the
 * thread info stores the @threadpath pointer */
static void
add_to_thread_info_hash (GHashTable *thread_info_hash, Container *c,
			 char *threadpath)
{
	Container *parent;
	MuMsgIterThreadInfo *info;
	gboolean is_root, first_child, empty_parent;

	parent = c->parent;

	if (!parent) {
		/* a container without a parent is a root, and thus
		 * neither a first child, nor has an empty parent */
		is_root	     = TRUE;
		first_child  = FALSE;
		empty_parent = FALSE;
	} else {
		is_root	     = FALSE;
		first_child  = (parent->child == c);
		empty_parent = (!parent->msg);
	}

	info = thread_info_new (threadpath, is_root, first_child,
				empty_parent,
				c->flags & CONTAINER_FLAG_DUP);

	g_hash_table_insert (thread_info_hash,
			     GUINT_TO_POINTER(c->docid), info);
}
/* devise a printf format string ("%0<N>x") for one thread-path
 * segment, with <N> the minimum number of hexadecimal digits needed
 * to number @matchnum matches (i.e., the indices 0 .. matchnum - 1);
 * <N> is at least 1, also when @matchnum is 0.
 *
 * returns static memory, which is overwritten by the next call; do
 * not free it */
const char*
thread_segment_format_string (size_t matchnum)
{
	static char frmt[16];
	unsigned digitnum;
	size_t maxidx;

	/* count the hex digits of the highest index using integer
	 * arithmetic; this avoids taking the logarithm of 0 as well
	 * as floating-point rounding problems */
	maxidx = matchnum > 0 ? matchnum - 1 : 0;
	for (digitnum = 1; maxidx >= 16; maxidx /= 16)
		++digitnum;

	snprintf (frmt, sizeof(frmt), "%%0%ux", digitnum);

	return frmt;
}
/* callback for container_path_foreach: turn @path into a string
 * using the segment format in @ti, and register the resulting thread
 * info for container @c in the hash in @ti; always returns TRUE */
static gboolean
add_thread_info (Container *c, ThreadInfo *ti, Path *path)
{
	add_to_thread_info_hash (ti->hash, c,
				 path_to_string (path, ti->format));
	return TRUE;
}
/* create the hash table mapping docid => thread info for all
 * containers under @root_set; @matchnum determines the width of the
 * thread-path segments. The table destroys its values with
 * thread_info_destroy */
GHashTable*
create_doc_id_thread_path_hash (Container *root_set, size_t matchnum)
{
	ThreadInfo ctx;

	ctx.format = thread_segment_format_string (matchnum);
	ctx.hash   = g_hash_table_new_full (g_direct_hash, g_direct_equal,
					    NULL,
					    (GDestroyNotify)thread_info_destroy);

	container_path_foreach (root_set,
				(ContainerPathForeachFunc)add_thread_info,
				&ctx);

	return ctx.hash;
}

View File

@ -19,9 +19,8 @@
**
*/
#ifndef __MU_MSG_THREADER_H__
#define __MU_MSG_THREADER_H__
#ifndef __MU_THREADER_H__
#define __MU_THREADER_H__
#include <glib.h>
#include <mu-msg-iter.h>
@ -45,9 +44,9 @@ G_BEGIN_DECLS
*
* @return a hashtable; free with g_hash_table_destroy when done with it
*/
GHashTable *mu_msg_threader_calculate (MuMsgIter *iter, size_t matches);
GHashTable *mu_threader_calculate (MuMsgIter *iter, size_t matches);
G_END_DECLS
#endif /*__MU_MSG_THREADER_H__*/
#endif /*__MU_THREADER_H__*/