mu/src/mu-threader.c

545 lines
14 KiB
C
Raw Normal View History

/* -*-mode: c; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-*/
/*
** Copyright (C) 2011 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include <math.h> /* for log, ceil */
#include <string.h> /* for memset */
#include "mu-threader.h"
#include "mu-threader-utils.h"
#include "mu-str.h"
/* msg threading implementation based on JWZ's algorithm, as described in:
* http://www.jwz.org/doc/threading.html
*
* the implementation follows the terminology from that doc, so should
* be understandable from that... I did change things a bit though
*
* the end result of the threading operation is a hashtable which maps
* docids (ie., Xapian documents == messages) to 'thread paths'; a
* thread path is a string denoting the 2-dimensional place of a
* message in a list of messages,
*
* Msg1 => 00000
* Msg2 => 00001
* Msg3 (child of Msg2) => 00001:00000
* Msg4 (child of Msg2) => 00001:00001
* Msg5 (child of Msg4) => 00001:00001:00000
* Msg6 => 00002
*
* the padding-0's are added to make them easy to sort using strcmp;
* the number hexadecimal numbers, and the length of the 'segments'
* (the parts separated by the ':') is equal to ceil(log_16(matchnum))
*
*/
/* step 1 */ static GHashTable* create_containers (MuMsgIter *iter);
/* step 2 */ static Container *find_root_set (GHashTable *ids);
static Container* prune_empty_containers (Container *root);
/* static void group_root_set_by_subject (GSList *root_set); */
GHashTable* create_doc_id_thread_path_hash (Container *root, size_t match_num);
static gint cmp_dates (Container *c1, Container *c2);
/* msg threading algorithm, based on JWZ's algorithm,
* http://www.jwz.org/doc/threading.html */
GHashTable*
mu_threader_calculate (MuMsgIter *iter, size_t matchnum)
{
GHashTable *id_table, *thread_ids;
Container *root_set;
g_return_val_if_fail (iter, FALSE);
/* step 1 */
id_table = create_containers (iter);
/* step 2 -- the root_set is the list of children without parent */
root_set = find_root_set (id_table);
/* step 3: skip until the end; we still need to containers */
/* step 4: prune empty containers */
root_set = prune_empty_containers (root_set);
/* sort root set */
root_set = container_sort (root_set, (GCompareDataFunc)cmp_dates,
NULL, FALSE);
/* step 5: group root set by subject */
//group_root_set_by_subject (root_set);
/* sort */
mu_msg_iter_reset (iter); /* go all the way back */
/* finally, deliver the docid => thread-path hash */
thread_ids = create_doc_id_thread_path_hash (root_set,
matchnum);
g_hash_table_destroy (id_table); /* step 3*/
return thread_ids;
}
static void
check_dup (const char *msgid, Container *c, GHashTable *hash)
{
if (g_hash_table_lookup (hash, c)) {
g_warning ("ALREADY!!");
container_dump (c, FALSE);
g_assert (0);
} else
g_hash_table_insert (hash, c, GUINT_TO_POINTER(TRUE));
}
static void
assert_no_duplicates (GHashTable *ids)
{
GHashTable *hash;
hash = g_hash_table_new (g_direct_hash, g_direct_equal);
g_hash_table_foreach (ids,
(GHFunc)check_dup,
hash);
g_hash_table_destroy (hash);
}
/* a referred message is a message that is refered by some other message */
static Container*
find_or_create_referred (GHashTable *id_table, const char *msgid,
gboolean *created)
{
Container *c;
g_return_val_if_fail (msgid, NULL);
c = g_hash_table_lookup (id_table, msgid);
*created = !c;
if (!c) {
c = container_new (NULL, 0, msgid);
g_hash_table_insert (id_table, (gpointer)msgid, c);
assert_no_duplicates (id_table);
}
return c;
}
/* find a container for the given msgid; if it does not exist yet,
* create a new one, and register it */
static Container*
find_or_create (GHashTable *id_table, MuMsg *msg, guint docid)
{
Container *c;
const char* msgid;
g_return_val_if_fail (msg, NULL);
g_return_val_if_fail (docid != 0, NULL);
msgid = mu_msg_get_msgid (msg);
if (!msgid)
msgid = mu_msg_get_path (msg); /* fake it */
c = g_hash_table_lookup (id_table, msgid);
/* If id_table contains an empty Container for this ID: * *
* Store this message in the Container's message slot. */
if (c) {
if (!c->msg) {
c->msg = mu_msg_ref (msg);
c->docid = docid;
return c;
} else {
/* c && c->msg */
/* special case, not in the JWZ algorithm: the
* container exists already and has a message; this
* means that we are seeing *another message* with a
* message-id we already saw... create this message,
* and mark it as a duplicate, and a child of the one
* we saw before; use its path as a fake message-id*/
Container *c2;
c2 = container_new (msg, docid, "<dup>");
c2->flags = CONTAINER_FLAG_DUP;
c = container_append_children (c, c2);
g_hash_table_insert (id_table,
(gpointer)mu_msg_get_path (msg), c2);
assert_no_duplicates (id_table);
return NULL; /* don't process this message further */
}
} else { /* Else: Create a new Container object holding
this message; Index the Container by
Message-ID in id_table. */
c = container_new (msg, docid, msgid);
g_hash_table_insert (id_table, (gpointer)msgid, c);
assert_no_duplicates (id_table);
return c;
}
}
static gboolean
child_elligible (Container *parent, Container *child, gboolean created)
{
if (!parent || !child)
return FALSE;
if (child->parent)
return FALSE;
/* if (created) */
/* return TRUE; */
if (container_reachable (parent, child))
return FALSE;
if (container_reachable (child, parent))
return FALSE;
return TRUE;
}
static void /* 1B */
handle_references (GHashTable *id_table, Container *c)
{
const GSList *refs, *cur;
Container *parent;
gboolean created;
refs = mu_msg_get_references (c->msg);
if (!refs)
return; /* nothing to do */
/* For each element in the message's References field:
Find a Container object for the given Message-ID: If
there's one in id_table use that; Otherwise, make (and
index) one with a null Message. */
/* go over over our list of refs, until 1 before the last... */
created = FALSE;
for (parent = NULL, cur = refs; cur; cur = g_slist_next (cur)) {
Container *child;
child = find_or_create_referred (id_table, (gchar*)cur->data,
&created);
/*Link the References field's Containers together in
* the order implied by the References header.
If they are already linked, don't change the existing
links. Do not add a link if adding that link would
introduce a loop: that is, before asserting A->B,
search down the children of B to see if A is
reachable, and also search down the children of A to
see if B is reachable. If either is already reachable
as a child of the other, don't add the link. */
if (child_elligible (parent, child, created))
parent = container_append_children (parent, child);
parent = child;
}
/* 'parent' points to the last ref: our direct parent;
Set the parent of this message to be the last element in
References. Note that this message may have a parent
already: this can happen because we saw this ID in a
References field, and presumed a parent based on the other
entries in that field. Now that we have the actual message,
we can be more definitive, so throw away the old parent and
use this new one. Find this Container in the parent's
children list, and unlink it.
Note that this could cause this message to now have no
parent, if it has no references field, but some message
referred to it as the non-first element of its
references. (Which would have been some kind of lie...)
Note that at all times, the various ``parent'' and ``child'' fields
must be kept inter-consistent. */
/* optimization: if the the message was newly added, it's by
* definition not reachable yet */
if (child_elligible (parent, c, created))
parent = container_append_children (parent, c);
}
/* step 1: create the containers, connect them, and fill the id_table */
static GHashTable*
create_containers (MuMsgIter *iter)
{
GHashTable *id_table;
id_table = g_hash_table_new_full (g_str_hash,
g_str_equal,
NULL,
(GDestroyNotify)container_destroy);
for (mu_msg_iter_reset (iter); !mu_msg_iter_is_done (iter);
mu_msg_iter_next (iter)) {
Container *c;
MuMsg *msg;
unsigned docid;
/* 1.A */
msg = mu_msg_iter_get_msg (iter, NULL);
docid = mu_msg_iter_get_docid (iter);
c = find_or_create (id_table, msg, docid);
/* 1.B and C */
if (c)
handle_references (id_table, c);
}
return id_table;
}
static void
filter_root_set (const gchar *msgid, Container *c, Container **root_set)
{
if (c->parent)
return;
if (*root_set == NULL) {
*root_set = c;
return;
} else
*root_set = container_append_siblings (*root_set, c);
}
/* 2. Walk over the elements of id_table, and gather a list of the
Container objects that have no parents, but do have children */
static Container*
find_root_set (GHashTable *ids)
{
Container *root_set;
root_set = NULL;
g_hash_table_foreach (ids, (GHFunc)filter_root_set, &root_set);
return root_set;
}
static gboolean
prune_maybe (Container *c)
{
Container *cur;
for (cur = c->child; cur; cur = cur->next) {
if (cur->flags & CONTAINER_FLAG_DELETE)
c = container_remove_child (c, cur);
else if (cur->flags & CONTAINER_FLAG_SPLICE)
c = container_splice_children (c, cur);
}
/* don't touch containers with messages */
if (c->msg)
return TRUE;
/* A. If it is an msg-less container with no children, mark it
* for deletion. */
if (!c->child) {
c->flags |= CONTAINER_FLAG_DELETE;
return TRUE;
}
/* B. If the Container has no Message, but does have
* children, remove this container but promote its
* children to this level (that is, splice them in to
* the current child list.)
*
* Do not promote the children if doing so would
* promote them to the root set -- unless there is
* only one child, in which case, do.
*/
if (c->child->next) /* ie., > 1 child */
return TRUE;
c->flags |= CONTAINER_FLAG_SPLICE;
return TRUE;
}
static Container*
prune_empty_containers (Container *root_set)
{
Container *cur;
container_foreach (root_set, (ContainerForeachFunc)prune_maybe, NULL);
/* and prune the root_set itself... */
for (cur = root_set; cur; cur = cur->next) {
if (cur->flags & CONTAINER_FLAG_DELETE)
root_set = container_remove_sibling (root_set, cur);
else if (cur->flags & CONTAINER_FLAG_SPLICE) {
Container *newchild;
newchild = cur->child;
cur->child = NULL;
root_set = container_append_siblings (root_set, newchild);
}
}
return root_set;
}
G_GNUC_UNUSED static gint
cmp_dates (Container *c1, Container *c2)
{
MuMsg *m1, *m2;
m1 = c1->msg;
m2 = c2->msg;
if (!m1)
return m2 ? 1 : 0;
if (!m2)
return m1 ? 0 : 1;
return mu_msg_get_date (m1) - mu_msg_get_date (m2);
}
static MuMsgIterThreadInfo*
thread_info_new (gchar *threadpath, gboolean root,
gboolean child, gboolean empty_parent, gboolean is_dup)
{
MuMsgIterThreadInfo *ti;
ti = g_slice_new (MuMsgIterThreadInfo);
ti->threadpath = threadpath;
ti->prop = 0;
ti->prop |= root ? MU_MSG_ITER_THREAD_PROP_ROOT : 0;
ti->prop |= child ? MU_MSG_ITER_THREAD_PROP_FIRST_CHILD : 0;
ti->prop |= empty_parent ? MU_MSG_ITER_THREAD_PROP_EMPTY_PARENT : 0;
ti->prop |= is_dup ? MU_MSG_ITER_THREAD_PROP_DUP : 0;
return ti;
}
static void
thread_info_destroy (MuMsgIterThreadInfo *ti)
{
if (ti) {
g_free (ti->threadpath);
g_slice_free (MuMsgIterThreadInfo, ti);
}
}
struct _ThreadInfo {
GHashTable *hash;
const char* format;
};
typedef struct _ThreadInfo ThreadInfo;
static void
add_to_thread_info_hash (GHashTable *thread_info_hash, Container *c,
char *threadpath)
{
gboolean is_root, first_child, empty_parent, is_dup;
/* 'root' means we're a child of the dummy root-container */
is_root = (c->parent == NULL);
first_child = is_root ? FALSE : (c->parent->child == c);
empty_parent = is_root ? FALSE : (!c->parent->msg);
is_dup = c->flags & CONTAINER_FLAG_DUP;
g_hash_table_insert (thread_info_hash,
GUINT_TO_POINTER(c->docid),
thread_info_new (threadpath,
is_root,
first_child,
empty_parent,
is_dup));
}
/* device a format string that is the minimum size to fit up to
* matchnum matches -- returns static memory */
const char*
thread_segment_format_string (size_t matchnum)
{
unsigned digitnum;
static char frmt[16];
/* get the number of digits needed in a hex-representation of
* matchnum */
digitnum = (unsigned) (ceil (log(matchnum)/log(16)));
snprintf (frmt, sizeof(frmt),"%%0%ux", digitnum);
return frmt;
}
static gboolean
add_thread_info (Container *c, ThreadInfo *ti, Path *path)
{
gchar *pathstr;
pathstr = path_to_string (path, ti->format);
add_to_thread_info_hash (ti->hash, c, pathstr);
return TRUE;
}
GHashTable*
create_doc_id_thread_path_hash (Container *root_set, size_t matchnum)
{
ThreadInfo ti;
/* create hash docid => thread-info */
ti.hash = g_hash_table_new_full (g_direct_hash, g_direct_equal,
NULL,
(GDestroyNotify)thread_info_destroy);
ti.format = thread_segment_format_string (matchnum);
container_path_foreach (root_set,
(ContainerPathForeachFunc)add_thread_info,
&ti);
return ti.hash;
}