lib: implement new query parser

mu's query parser is the piece of software that turns your queries into something the Xapian database can understand. So, if you query "maildir:/inbox and subject:bla" this must be translated into a Xapian::Query object which will retrieve the sought-after messages. Since mu's beginning, almost a decade ago, this parser was based on Xapian's default Xapian::QueryParser. It works okay, but wasn't really designed for the mu use-case, and had a bit of trouble with anything that's not A..Z (think: spaces, special characters, unicode etc.). Over the years, mu added quite a bit of pre-processing trickery to deal with that. Still, there were corner cases and bugs that were practically unfixable. The solution to all of this is to have a custom query processor that replaces Xapian's, and write it from the ground up to deal with the special characters etc. I wrote one, as part of my "future, post-1.0 mu" research project, and I have now backported it to mu 0.9.19. From a technical perspective, this is a major cleanup, and allows us to get rid of much of the fragile preprocessing both for indexing and querying. From an end-user perspective this (hopefully) means that many of the little parsing issues are gone, and it opens the way for some new features. From an end-user perspective: - better support for special characters. - regexp search! yes, you can now search for regular expressions, e.g. subject:/h.ll?o/ will find subjects with hallo, hello, halo, philosophy, ... As you can imagine, this can be a _heavy_ operation on the database, and might take quite a bit longer than a normal query; but it can be quite useful.
2017-10-24 22:55:35 +03:00 · 2017-10-24 22:55:35 +03:00 · b75f9f508b
parent b53366313b
commit b75f9f508b
18 changed files with 2208 additions and 0 deletions
--- a/lib/parser/Makefile.am
+++ b/lib/parser/Makefile.am
@ -0,0 +1,87 @@
+## Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 3 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software Foundation,
+## Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+include $(top_srcdir)/gtest.mk
+
+@VALGRIND_CHECK_RULES@
+## helper programs (noinst: built but not installed); both link against libmuxparser.la
+noinst_PROGRAMS=		\
+	tokenize		\
+	parse
+
+tokenize_SOURCES=		\
+	tokenize.cc
+
+tokenize_LDADD=			\
+	$(GCOV_LDADD)		\
+	libmuxparser.la
+
+parse_SOURCES=			\
+	parse.cc
+
+parse_LDADD=			\
+	$(GCOV_LDADD)		\
+	libmuxparser.la
+## C++ compiler flags for everything built in this directory
+AM_CXXFLAGS=			\
+	-I$(srcdir)/..		\
+	-I$(top_srcdir)/lib	\
+	$(GLIB_CFLAGS)		\
+	$(XAPIAN_CXXFLAGS)	\
+	$(WARN_CXXFLAGS)	\
+	$(GCOV_CFLAGS)		\
+	-Wno-inline		\
+	-Wno-switch-enum
+## the parser library (noinst: a libtool library that is built but not installed)
+libmuxparser_la_LIBADD=		\
+	$(WARN_LDFLAGS)		\
+	$(GLIB_LIBS)		\
+	$(XAPIAN_LIBS)		\
+	$(GCOV_LDADD)
+
+noinst_LTLIBRARIES=		\
+	libmuxparser.la
+
+libmuxparser_la_SOURCES=	\
+	data.hh			\
+	parser.cc		\
+	parser.hh		\
+	proc-iface.hh		\
+	tokenizer.cc		\
+	tokenizer.hh		\
+	tree.hh			\
+	utils.cc		\
+	utils.hh		\
+	xapian.cc		\
+	xapian.hh
+## unit tests: every program added to TEST_PROGS is built (noinst) and listed in TESTS
+VALGRIND_SUPPRESSIONS_FILES= ${top_srcdir}/mux.supp
+
+noinst_PROGRAMS+=$(TEST_PROGS)
+
+TEST_PROGS += test-tokenizer
+test_tokenizer_SOURCES=test-tokenizer.cc
+test_tokenizer_LDADD=libmuxparser.la
+
+TEST_PROGS += test-parser
+test_parser_SOURCES=test-parser.cc
+test_parser_LDADD=libmuxparser.la
+
+TEST_PROGS += test-utils
+test_utils_SOURCES=test-utils.cc
+test_utils_LDADD=libmuxparser.la
+
+TESTS=$(TEST_PROGS)
--- a/lib/parser/data.hh
+++ b/lib/parser/data.hh
@ -0,0 +1,151 @@
+/*
+**  Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#ifndef __DATA_HH__
+#define __DATA_HH__
+
+#include <iostream>
+#include <memory>
+#include <regex>
+#include <string>
+
+#include <parser/utils.hh>
+
+namespace Mux {
+
+// Base class representing some data item: either a Value or a Range. A Value can still
+// come from a regex, but that is not a separate type here.
+struct Data {
+	enum class Type { Value, Range }; // tells which subclass an item is
+	virtual ~Data() = default;
+
+	Type		type;	/**< type of data */
+	std::string	field;  /**< full name of the field */
+	std::string	prefix;	/**< Xapian prefix for the field */
+	unsigned	id;	/**< Xapian value no for the field  */
+
+protected:
+	Data (Type _type, const std::string& _field, const std::string& _prefix, // protected: only subclasses construct a Data
+	      unsigned _id): type(_type), field(_field), prefix(_prefix), id(_id) {}
+};
+
+
+/**
+ * Stream a Data::Type as a short lower-case tag ("value" or "range")
+ *
+ * @param os an output stream
+ * @param t a data type
+ *
+ * @return the updated output stream
+ */
+inline std::ostream&
+operator<< (std::ostream& os, Data::Type t)
+{
+	const char *tag = "bug"; // used for anything outside the known enumerators
+
+	if (t == Data::Type::Value)
+		tag = "value";
+	else if (t == Data::Type::Range)
+		tag = "range";
+
+	os << tag;
+	return os;
+}
+
+
+/**
+ *  Range type -- [a..b]; both bounds are kept as strings
+ */
+struct Range: public Data {
+	/**
+	 * Construct a range
+	 *
+	 * @param _field the field
+	 * @param _prefix the xapian prefix
+	 * @param _id xapian value number
+	 * @param _lower lower bound
+	 * @param _upper upper bound
+	 */
+	Range (const std::string& _field, const std::string& _prefix,
+	       unsigned _id,
+	       const std::string& _lower,const std::string& _upper):
+
+		Data(Data::Type::Range, _field, _prefix, _id), // tags this item as a Range
+		lower(_lower), upper(_upper) {}
+
+	std::string lower;	/**< lower bound */
+	std::string upper;	/**< upper bound */
+};
+
+
+/**
+ * Basic value: a single string value for a field
+ *
+ */
+struct Value: public Data {
+	/**
+	 * Construct a Value
+	 *
+	 * @param _field the field
+	 * @param _prefix the xapian prefix
+	 * @param _id xapian value number
+	 * @param _value the value
+	 */
+	Value (const std::string& _field, const std::string& _prefix,
+	       unsigned _id, const std::string& _value):
+		Data(Value::Type::Value, _field, _prefix, _id), // Value::Type names the inherited Data::Type
+		value(_value) {}
+
+	std::string value;	/**< the value */
+};
+
+
+/**
+ * Stream a data item: a space and the quoted field name, followed by the quoted
+ * (utf8-flattened) value for a Value, or the quoted lower and upper bounds for a Range.
+ *
+ * @param os an output stream
+ * @param v a data ptr. Nothing is written when it is null; "unexpected type" is written
+ * when its type tag is unknown or does not match its dynamic type (instead of
+ * dereferencing the null result of the dynamic_cast)
+ *
+ * @return the updated output stream
+ */
+inline std::ostream&
+operator<< (std::ostream& os, const std::unique_ptr<Data>& v)
+{
+	if (!v) // no data at all: nothing to print
+		return os;
+
+	switch (v->type) {
+	case Data::Type::Value:
+		// dynamic_cast yields nullptr when the tag and the dynamic type disagree
+		if (const auto bval = dynamic_cast<const Value*> (v.get())) {
+			os << ' ' << quote(v->field) << ' '
+			   << quote(utf8_flatten(bval->value));
+			return os;
+		}
+		break;
+	case Data::Type::Range:
+		if (const auto rval = dynamic_cast<const Range*> (v.get())) {
+			os << ' ' << quote(v->field) << ' '
+			   << quote(rval->lower) << ' '
+			   << quote(rval->upper);
+			return os;
+		}
+		break;
+	default:
+		break;
+	}
+
+	os << "unexpected type";
+	return os;
+}
+
+} // namespace Mux
+
+
+#endif /* __DATA_HH__ */
--- a/lib/parser/dummy-processor.hh
+++ b/lib/parser/dummy-processor.hh
@ -0,0 +1,30 @@
+/*
+** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+#ifndef __DUMMY_PROCESSOR_HH__
+#define __DUMMY_PROCESSOR_HH__
+
+#include <string>
+#include <vector>
+#include <tuple>
+
+namespace Mux {
+
+} // namespace Mux
+
+#endif /* __DUMMY_PROCESSOR_HH__ */
--- a/lib/parser/parse.cc
+++ b/lib/parser/parse.cc
@ -0,0 +1,41 @@
+/*
+**  Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#include <string>
+#include <iostream>
+#include "parser.hh"
+
+// Parse the query given on the command line, print warnings to stderr and the
+// resulting tree to stdout.
+int
+main (int argc, char *argv[])
+{
+	// join all arguments into one query string, each preceded by a space
+	std::string expr;
+	for (int n = 1; n < argc; ++n) {
+		expr += " ";
+		expr += argv[n];
+	}
+
+	Mux::WarningVec warnings;
+	const auto tree = Mux::parse (expr, warnings);
+
+	// warnings are reported as "1:<pos>: <msg>"
+	for (const auto& warning: warnings) {
+		std::cerr << "1:" << warning.pos << ": " << warning.msg << std::endl;
+	}
+
+	std::cout << tree << std::endl;
+
+	return 0;
+}
--- a/lib/parser/parser.cc
+++ b/lib/parser/parser.cc
@ -0,0 +1,346 @@
+/*
+**  Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+#include "parser.hh"
+#include "tokenizer.hh"
+#include "utils.hh"
+
+using namespace Mux;
+
+// 3 precedence levels: units (NOT,()) > factors (AND) > terms (OR,XOR)
+
+// query	->      <term-1> | ε
+// <term-1>	->	<factor-1> <term-2> | ε
+// <term-2>	->	OR|XOR <term-1> | ε
+// <factor-1>	->	<unit> <factor-2> | ε
+// <factor-2>	->	[AND]|AND NOT <factor-1> | ε
+// <unit>	->	[NOT] <unit> | ( <term-1> ) | <data>
+// <data>       ->      <value> | <range> | <regex>
+// <value>      ->      [field:]value
+// <range>      ->      [field:][lower]..[upper]
+// <regex>      ->      [field:]/regex/
+
+// Builds (does not throw) a std::runtime_error: the line number and "BUG: ", then the formatted arguments.
+#define BUG(...) std::runtime_error (format("%u: BUG: ",__LINE__)	\
+				     + format(__VA_ARGS__))
+
+// Return a copy of the next token without consuming it.
+//
+// Throws a BUG error when no tokens are left; calling front() on an empty container
+// would be undefined behavior.
+static Token
+look_ahead (const Mux::Tokens& tokens)
+{
+	if (tokens.empty())
+		throw BUG("look-ahead past the end of the tokens");
+
+	return tokens.front();
+}
+
+// Create a tree that consists of a single node of type Empty.
+static Mux::Tree
+empty()
+{
+	return Mux::Tree(Node{Node::Type::Empty});
+}
+
+static Mux::Tree term_1 (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings);
+
+
+// Build the tree for a plain value: one Value leaf when there is a single field, or an
+// OR node with one Value leaf per field for a 'multi-field' such as "recip:".
+static Mux::Tree
+value (const ProcIface::FieldInfoVec& fields, const std::string& v,
+       size_t pos, ProcPtr proc, WarningVec& warnings)
+{
+	const auto flat = utf8_flatten(v);
+
+	if (fields.empty())
+		throw BUG("expected one or more fields");
+
+	// turn one field-info item into a Value leaf holding the processed value
+	const auto leaf = [&](const ProcIface::FieldInfo& info) {
+		return Tree({Node::Type::Value,
+			     std::make_unique<Value>(
+				     info.field, info.prefix, info.id,
+				     proc->process_value(info.field, flat))});
+	};
+
+	if (fields.size() == 1)
+		return leaf (fields.front());
+
+	Tree multi(Node{Node::Type::OpOr});
+	for (const auto& info: fields)
+		multi.add_child (leaf (info));
+
+	return multi;
+}
+// Build an OR tree with one Value leaf for every term that proc->process_regex returns for each field.
+static Mux::Tree
+regex (const ProcIface::FieldInfoVec& fields, const std::string& v,
+       size_t pos, ProcPtr proc, WarningVec& warnings)
+{
+	if (v.length() < 2)
+		throw BUG("expected regexp, got '%s'", v.c_str());
+
+	const auto rxstr = utf8_flatten(v.substr(1, v.length()-2)); // drop the first and last character (the delimiters)
+
+ 	try {
+		Tree tree(Node{Node::Type::OpOr});
+		const auto rx = std::regex (rxstr); // throws for an invalid pattern; handled below
+		for (const auto& field: fields) {
+			const auto terms = proc->process_regex (field.field, rx);
+			for (const auto& term: terms) {
+				tree.add_child (Tree(
+					{Node::Type::Value,
+					 std::make_unique<Value>(field.field, "",
+								 field.id, term)})); // note: the prefix is left empty here
+			}
+		}
+		return tree;
+
+	} catch (...) {
+		// fallback: warn, and treat the whole text (delimiters included) as a plain value
+		warnings.push_back ({pos, "invalid regexp"});
+		return value (fields, v, pos, proc, warnings);
+	}
+}
+
+
+// Build a Range node for the first of the fields, or a plain value if that is not a range field.
+static Mux::Tree
+range (const ProcIface::FieldInfoVec& fields, const std::string& lower,
+       const std::string& upper, size_t pos, ProcPtr proc,
+       WarningVec& warnings)
+{
+	if (fields.empty())
+		throw BUG("expected field");
+
+	const auto& field = fields.front(); // only the first field is considered
+	if (!proc->is_range_field(field.field))
+		return value (fields, lower + ".." + upper, pos, proc, warnings); // re-join as "lower..upper"
+
+	auto prange = proc->process_range (field.field, lower, upper);
+	if (prange.lower > prange.upper) // processed bounds are reversed: process again with the inputs swapped
+		prange = proc->process_range (field.field, upper, lower);
+
+	return Tree({{Node::Type::Range},
+			     std::make_unique<Range>(field.field, field.prefix, field.id,
+						     prange.lower, prange.upper)});
+}
+
+// Parse a <data> token: split off an optional "field:" part, then dispatch to regex, range or value.
+static Mux::Tree
+data (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
+{
+	const auto token = look_ahead(tokens);
+	if (token.type != Token::Type::Data)
+		warnings.push_back ({token.pos, "expected: value"}); // warn only; the token is still handled as data
+
+	tokens.pop_front();
+
+	std::string field, val;
+	const auto col = token.str.find (":");
+	if (col != 0 && col != std::string::npos && col != token.str.length()-1) { // a ':' that is neither first nor last
+		field = token.str.substr(0, col);
+		val = token.str.substr(col + 1);
+	} else
+		val = token.str; // no usable field part; field stays empty
+
+	auto fields = proc->process_field (field);
+	if (fields.empty()) {// not valid field...
+		warnings.push_back ({token.pos, format ("invalid field '%s'", field.c_str())});
+		fields = proc->process_field ("");
+		// fallback, treat the whole of foo:bar as a value
+		return value (fields, field + ":" + val, token.pos, proc, warnings);
+	}
+
+	// does it look like a regexp?
+	if (val.length()>=2) {
+		if (val[0]=='/' && val[val.length()-1] == '/')
+			return regex (fields, val, token.pos, proc, warnings);
+		else if (val[val.length()-1] == '*') // wildcard: the value ends in '*'
+			return regex (fields, // transform wildcard into regexp; NOTE(review): the text before '*' is not regex-escaped
+				      "/" + val.substr(0, val.length()-1) + ".*/",
+				      token.pos, proc, warnings);
+	}
+
+	// does it look like a range?
+	const auto dotdot = val.find("..");
+	if (dotdot != std::string::npos)
+		return range(fields, val.substr(0, dotdot), val.substr(dotdot + 2),
+			     token.pos, proc, warnings);
+
+	// if nothing else, it's a value.
+	return value (fields, val, token.pos, proc, warnings);
+}
+// Parse a <unit>: NOT followed by a unit, a parenthesized <term-1>, or plain <data>.
+static Mux::Tree
+unit (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
+{
+	if (tokens.empty()) {
+		warnings.push_back ({0, "expected: unit"});
+		return empty();
+	}
+
+	const auto token = look_ahead (tokens);
+
+	if (token.type == Token::Type::Not) {
+		tokens.pop_front(); // consume NOT; it applies to the unit that follows
+		Tree tree{{Node::Type::OpNot}};
+		tree.add_child(unit (tokens, proc, warnings));
+		return tree;
+	}
+
+	if (token.type == Token::Type::Open) {
+		tokens.pop_front(); // consume '('
+		auto tree = term_1 (tokens, proc, warnings);
+		if (tokens.empty()) // input ended before ')': warn, but keep the sub-tree
+			warnings.push_back({token.pos, "expected: ')'"});
+		else {
+			const auto token2 = look_ahead(tokens);
+			if (token2.type == Token::Type::Close)
+				tokens.pop_front(); // consume ')'
+			else { // some other token: warn and leave it in place
+				warnings.push_back(
+				{token2.pos,
+				 std::string("expected: ')' but got ") +
+				 token2.str});
+			}
+
+		}
+		return tree;
+	}
+
+	return data (tokens, proc, warnings);
+}
+
+static Mux::Tree factor_1 (Mux::Tokens& tokens, ProcPtr proc,
+			   WarningVec& warnings);
+
+// Parse <factor-2>, the optional right-hand side of a factor: an explicit or implicit AND
+// or AND NOT, followed by a <factor-1>.
+//
+// On success @op is set to OpAnd or OpAndNot and the parsed right-hand side is returned.
+// When there are no tokens left, or the next token cannot continue a factor, an empty
+// tree is returned and @op is left untouched.
+static Mux::Tree
+factor_2 (Mux::Tokens& tokens, Node::Type& op, ProcPtr proc,
+	  WarningVec& warnings)
+{
+	if (tokens.empty())
+		return empty();
+
+	const auto token = look_ahead(tokens);
+
+	switch (token.type) {
+	case Token::Type::And: {
+		tokens.pop_front();
+		// the query may end right after AND (e.g. "foo and"), so only look ahead when
+		// a token is left; front() on an empty container is undefined behavior
+		if (!tokens.empty() &&
+		    look_ahead(tokens).type == Token::Type::Not) { // AND NOT is a unit
+			tokens.pop_front();
+			op = Node::Type::OpAndNot;
+		} else
+			op = Node::Type::OpAnd;
+	} break;
+	case Token::Type::Open:
+	case Token::Type::Data:
+		op = Node::Type::OpAnd; // implicit AND
+		break;
+	case Token::Type::Not:
+		tokens.pop_front();
+		op = Node::Type::OpAndNot; // implicit AND NOT
+		break;
+
+
+	default:
+		return empty();
+	}
+
+	return factor_1 (tokens, proc, warnings);
+}
+
+// Parse <factor-1>: a unit, optionally combined with the factor that follows it by the
+// operator that factor_2 reports.
+static Mux::Tree
+factor_1 (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
+{
+	Node::Type op { Node::Type::Invalid };
+
+	auto lhs = unit (tokens, proc, warnings);
+	auto rhs = factor_2 (tokens, op, proc, warnings);
+
+	if (!rhs.empty()) {
+		Tree tree {{op}};
+		tree.add_child(std::move(lhs));
+		tree.add_child(std::move(rhs));
+		return tree;
+	}
+
+	return lhs; // nothing followed; the unit stands on its own
+}
+
+
+// Parse <term-2>, the optional right-hand side of a term: OR or XOR followed by a <term-1>.
+// Sets @op to the operator found; returns an empty tree when there is no such operator.
+static Mux::Tree
+term_2 (Mux::Tokens& tokens, Node::Type& op, ProcPtr proc,
+	WarningVec& warnings)
+{
+	if (tokens.empty())
+		return empty();
+
+	const auto next = look_ahead (tokens);
+
+	if (next.type == Token::Type::Or)
+		op = Node::Type::OpOr;
+	else if (next.type == Token::Type::Xor)
+		op = Node::Type::OpXor;
+	else {
+		// a ')' legitimately ends the term; anything else is unexpected
+		if (next.type != Token::Type::Close)
+			warnings.push_back({next.pos, "expected OR|XOR"});
+		return empty();
+	}
+
+	tokens.pop_front(); // consume the operator
+
+	return term_1 (tokens, proc, warnings);
+}
+
+// Parse <term-1>: a factor, optionally combined with the term that follows it by the
+// operator that term_2 reports.
+static Mux::Tree
+term_1 (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
+{
+	Node::Type op { Node::Type::Invalid };
+
+	auto lhs = factor_1 (tokens, proc, warnings);
+	auto rhs = term_2 (tokens, op, proc, warnings);
+
+	if (rhs.empty())
+		return lhs; // nothing followed; the factor stands on its own
+
+	Tree tree {{op}};
+	tree.add_child(std::move(lhs));
+	tree.add_child(std::move(rhs));
+
+	return tree;
+}
+
+// Parse a whole query: a <term-1>, or an empty tree when there are no tokens at all.
+static Mux::Tree
+query (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
+{
+	if (!tokens.empty())
+		return term_1 (tokens, proc, warnings);
+
+	return empty ();
+}
+
+// Tokenize and parse @expr. When a std::runtime_error is thrown, its message is written
+// to stderr and an empty tree is returned.
+Mux::Tree
+Mux::parse (const std::string& expr, WarningVec& warnings, ProcPtr proc)
+{
+	try {
+		auto toks = tokenize (expr);
+		auto tree = query (toks, proc, warnings);
+		return tree;
+
+	} catch (const std::runtime_error& err) {
+		std::cerr << err.what() << std::endl;
+	}
+
+	return empty();
+}
--- a/lib/parser/parser.hh
+++ b/lib/parser/parser.hh
@ -0,0 +1,89 @@
+/*
+**  Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+
+#ifndef __PARSER_HH__
+#define __PARSER_HH__
+
+#include <string>
+#include <vector>
+#include <memory>
+
+#include <parser/data.hh>
+#include <parser/tree.hh>
+#include <parser/proc-iface.hh>
+
+// A simple recursive-descent parser for queries. Follows the Xapian syntax,
+// but better handles non-alphanum; also implements regexp
+
+namespace Mux {
+
+/**
+ * A parser warning: a message tied to a position in the query string
+ *
+ */
+struct Warning {
+	size_t			pos; /**< pos in string */
+	const std::string	msg; /**< warning message */
+
+	/**
+	 * operator==
+	 *
+	 * @param rhs right-hand side
+	 *
+	 * @return true if rhs is equal to this (same pos and same msg); false otherwise
+	 */
+	bool operator==(const Warning& rhs) const {
+		return pos == rhs.pos && msg == rhs.msg;
+	}
+};
+
+
+/**
+ * Stream a warning as "<pos>:<msg>"
+ *
+ * @param os an output stream
+ * @param w a warning
+ *
+ * @return the updated output stream
+ */
+inline std::ostream&
+operator<< (std::ostream& os, const Warning& w)
+{
+	return os << w.pos << ":" << w.msg;
+}
+
+/** A vector of parser warnings */
+using WarningVec=std::vector<Warning>;
+/** A reference to the unique_ptr that holds a processor; ownership stays with the caller */
+using ProcPtr = const std::unique_ptr<ProcIface>&;
+
+/**
+ * Parse a query string
+ *
+ * @param query a query string
+ * @param warnings vec to receive warnings
+ * @param proc a Processor object; when omitted, a temporary DummyProc is used
+ *
+ * @return a parse-tree
+ */
+Tree parse (const std::string& query, WarningVec& warnings,
+	    ProcPtr proc = std::make_unique<DummyProc>());
+
+} // namespace Mux
+
+#endif /* __PARSER_HH__ */
--- a/lib/parser/proc-iface.hh
+++ b/lib/parser/proc-iface.hh
@ -0,0 +1,131 @@
+/*
+** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+#ifndef __PROC_IFACE_HH__
+#define __PROC_IFACE_HH__
+
+#include <string>
+#include <vector>
+#include <tuple>
+#include <regex>
+
+namespace Mux {
+
+struct ProcIface {
+
+	virtual ~ProcIface() = default;
+
+	/**
+	 * Field information, as returned by process_field(), which gets the "shortcut"/internal
+	 * fields for the given fieldstr, or empty if there is none
+	 * @param fieldstr a fieldstr, e.g "subject" or "s" for the subject field
+	 *
+	 * @return a vector with "exploded" values, with a full name, a prefix and an id. E.g. "s"
+	 * might map to [<"subject","S">], while "recip" could map to [<"to","T">, <"cc","C">, <"bcc","B">]
+	 */
+	struct FieldInfo {
+		const std::string	field;
+		const std::string	prefix;
+		unsigned		id;
+	};
+	using FieldInfoVec = std::vector<FieldInfo>;
+
+	virtual FieldInfoVec process_field (const std::string& field) const = 0;
+
+	/**
+	 * Process a value
+	 *
+	 * @param field a field name
+	 * @param value a value
+	 *
+	 * @return the processed value
+	 */
+	virtual std::string process_value (
+		const std::string& field, const std::string& value) const = 0;
+
+	/**
+	 * Is this a range field?
+	 *
+	 * @param field some field
+	 *
+	 * @return true if it is a range-field; false otherwise.
+	 */
+	virtual bool is_range_field (const std::string& field) const = 0;
+
+
+	/**
+	 * Process a range field
+	 *
+	 * @param field a field name, e.g "date" or "d" for the date field
+	 * @param lower lower bound or empty
+	 * @param upper upper bound or empty
+	 *
+	 * @return the processed range
+	 */
+	struct Range {
+		std::string lower;
+		std::string upper;
+	};
+	virtual Range process_range (const std::string& field, const std::string& lower,
+				     const std::string& upper) const = 0;
+
+	/**
+	 * Process a regular expression for a field
+	 *
+	 * @param field a field name
+	 * @param rx a regular expression
+	 *
+	 * @return a vector of strings (NOTE(review): presumably the terms matching rx -- confirm)
+	 */
+	virtual std::vector<std::string>
+	process_regex (const std::string& field, const std::regex& rx) const = 0;
+
+}; // ProcIface
+
+
+struct DummyProc: public ProcIface { // For testing
+
+	std::vector<FieldInfo>
+	process_field (const std::string& field) const override {
+		return {{ field, "x", 0 }}; // any name is accepted, with prefix "x" and id 0
+	}
+
+	std::string
+	process_value (const std::string& field, const std::string& value) const override {
+		return value; // the value is passed through unchanged
+	}
+
+	bool is_range_field (const std::string& field) const override {
+		return false; // no field is a range field
+	}
+
+	Range process_range (const std::string& field, const std::string& lower,
+			     const std::string& upper) const override {
+		return { lower, upper }; // the bounds are passed through unchanged
+	}
+
+	std::vector<std::string>
+	process_regex (const std::string& field, const std::regex& rx) const override {
+		return {}; // a regex never yields any strings
+	}
+}; //Dummy
+
+
+} // Mux
+
+#endif /* __PROC_IFACE_HH__ */
--- a/lib/parser/test-parser.cc
+++ b/lib/parser/test-parser.cc
@ -0,0 +1,121 @@
+/*
+** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#include <vector>
+#include <glib.h>
+
+#include <iostream>
+#include <sstream>
+
+#include "parser.hh"
+using namespace Mux;
+// One parser test case: a query expression, the expected streamed tree, and a list of warnings.
+struct Case {
+	const std::string expr;
+	const std::string expected;
+	WarningVec warnings;
+};
+
+using CaseVec = std::vector<Case>;
+// Parse the expression of every case and assert that the streamed tree equals the expected string.
+static void
+test_cases(const CaseVec& cases)
+{
+	for (const auto& casus : cases ) {
+
+		WarningVec warnings;
+		const auto tree = parse (casus.expr, warnings); // no proc argument, so the default is used
+
+		std::stringstream ss;
+		ss << tree;
+
+		if (g_test_verbose()) {
+			std::cout << "\n";
+			std::cout << casus.expr << std::endl;
+			std::cout << "exp:" << casus.expected << std::endl;
+			std::cout << "got:" << ss.str() << std::endl;
+		}
+		g_assert_true (casus.expected == ss.str());
+		// the warnings are collected above, but comparing them is disabled:
+		// g_assert_cmpuint (casus.warnings.size(), ==, warnings.size());
+		// for (auto i = 0; i != (int)casus.warnings.size(); ++i) {
+		// 	std::cout << "exp:" << casus.warnings[i] << std::endl;
+		// 	std::cout << "got:" << warnings[i] << std::endl;
+		// 	g_assert_true (casus.warnings[i] == warnings[i]);
+		// }
+	}
+}
+// Single values, and two values joined by "or" / "and".
+static void
+test_basic ()
+{
+	CaseVec cases = {
+		//{ "", R"#((atom :value ""))#"},
+		{ "foo",  R"#((value "" "foo"))#", },
+		{ "foo       or         bar",
+		  R"#((or(value "" "foo")(value "" "bar")))#" },
+		{ "foo and bar",
+		  R"#((and(value "" "foo")(value "" "bar")))#"},
+	};
+
+	test_cases (cases);
+}
+// Combinations of operators: and/or mixed, "and not", chained "and", and parentheses.
+static void
+test_complex ()
+{
+	CaseVec cases = {
+		{ "foo and bar or cuux",
+		  R"#((or(and(value "" "foo")(value "" "bar")))#" +
+		  std::string(R"#((value "" "cuux")))#") },
+		{ "a and not b",
+		  R"#((andnot(value "" "a")(value "" "b")))#"
+		},
+		{ "a and b and c",
+		  R"#((and(value "" "a")(and(value "" "b")(value "" "c"))))#"
+		},
+		{ "(a or b) and c",
+		  R"#((and(or(value "" "a")(value "" "b"))(value "" "c")))#"
+		}
+	};
+
+	test_cases (cases);
+}
+// A value with accented characters is expected to come out as plain lower-case ASCII.
+static void
+test_flatten ()
+{
+	CaseVec cases = {
+		{ " Mötørhęåđ", R"#((value "" "motorhead"))#" }
+	};
+
+	test_cases (cases);
+}
+// Register the parser tests with the GLib test framework and run them.
+int
+main (int argc, char *argv[])
+{
+	g_test_init (&argc, &argv, NULL);
+
+	g_test_add_func ("/parser/basic",    test_basic);
+	g_test_add_func ("/parser/complex",  test_complex);
+	g_test_add_func ("/parser/flatten",  test_flatten);
+
+	return g_test_run ();
+}
--- a/lib/parser/test-tokenizer.cc
+++ b/lib/parser/test-tokenizer.cc
@ -0,0 +1,143 @@
+/*
+** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#include <vector>
+#include <glib.h>
+#include <iostream>
+
+#include "tokenizer.hh"
+// One tokenizer test case: an input string and the tokens it is expected to produce.
+struct Case {
+	const char *str;
+	const Mux::Tokens tokens;
+};
+
+using CaseVec = std::vector<Case>;
+
+using namespace Mux;
+using TT = Token::Type;
+// Tokenize the string of every case and assert that the tokens equal the expected ones.
+static void
+test_cases(const CaseVec& cases)
+{
+	for (const auto& casus : cases ) {
+		const auto tokens = tokenize (casus.str);
+
+		g_assert_cmpuint ((guint)tokens.size(),==,(guint)casus.tokens.size()); // same number of tokens first
+		for (size_t u = 0; u != tokens.size(); ++u) {
+			if (g_test_verbose()) {
+				std::cerr << "case " << u << " " << casus.str << std::endl; // note: u is the token index within this case
+				std::cerr << "exp: '" << casus.tokens[u] << "'" << std::endl;
+				std::cerr << "got: '" << tokens[u] << "'" << std::endl;
+
+			}
+			g_assert_true (tokens[u] == casus.tokens[u]);
+		}
+	}
+}
+// Empty input, plain words and double-quoted strings.
+static void
+test_basic ()
+{
+	CaseVec cases = {
+		{ "", {} },
+
+		{ "foo",  Tokens{Token{3, TT::Data, "foo"}}},
+
+		{ "foo bar cuux",  Tokens{Token{3, TT::Data, "foo"},
+					    Token{7, TT::Data, "bar"},
+					    Token{12, TT::Data, "cuux"}}},
+
+		{ "\"foo bar\"",  Tokens{ Token{9, TT::Data, "foo bar"}}},
+
+		// ie. ignore missing closing '"'
+		{ "\"foo bar",    Tokens{ Token{8, TT::Data, "foo bar"}}},
+
+	};
+
+	test_cases (cases);
+}
+// Parentheses and '*': separate tokens when unquoted, one Data token when quoted.
+static void
+test_specials ()
+{
+	CaseVec cases = {
+		{ ")*(",  Tokens{Token{1, TT::Close, ")"},
+				   Token{2, TT::Data, "*"},
+				   Token{3, TT::Open, "("}}},
+		{ "\")*(\"",  Tokens{Token{5, TT::Data, ")*("}}},
+	};
+
+	test_cases (cases);
+}
+
+// Operator words in mixed case are expected to give operator tokens that keep their spelling.
+static void
+test_ops ()
+{
+	CaseVec cases = {
+		{ "foo and bar oR cuux XoR fnorb",
+		  Tokens{Token{3, TT::Data, "foo"},
+			   Token{7, TT::And,   "and"},
+			   Token{11, TT::Data, "bar"},
+			   Token{14, TT::Or,    "oR"},
+			   Token{19, TT::Data, "cuux"},
+			   Token{23, TT::Xor,   "XoR"},
+			   Token{29, TT::Data, "fnorb"}}},
+		{ "NOT (aap or mies)",
+		  Tokens{Token{3,  TT::Not,    "NOT"},
+			   Token{5,  TT::Open,   "("},
+			   Token{8,  TT::Data,  "aap"},
+			   Token{11, TT::Or,     "or"},
+			   Token{16, TT::Data,  "mies"},
+			   Token{17, TT::Close,   ")"}}}
+	};
+
+
+	test_cases (cases);
+}
+
+// Quotes: a plain '"' delimits a token, while a backslash-escaped '"' is kept in the token text.
+static void
+test_escape ()
+{
+	CaseVec cases = {
+		{ "foo\"bar\"",      Tokens{Token{3, TT::Data, "foo"},
+					      Token{8, TT::Data, "bar"}}},
+		{ "\"fnorb\"",      Tokens{Token{7, TT::Data, "fnorb"}}},
+		{ "\\\"fnorb\\\"",  Tokens{Token{9, TT::Data, "\"fnorb\""}}},
+		{ "foo\\\"bar\\\"",  Tokens{Token{10, TT::Data, "foo\"bar\""}}}
+	};
+
+	test_cases (cases);
+}
+
+// Register the tokenizer tests with the GLib test framework and run them.
+int
+main (int argc, char *argv[])
+{
+	g_test_init (&argc, &argv, NULL);
+
+	g_test_add_func ("/tokens/basic", test_basic);
+	g_test_add_func ("/tokens/specials", test_specials);
+	g_test_add_func ("/tokens/ops", test_ops);
+	g_test_add_func ("/tokens/escape", test_escape);
+
+	return g_test_run ();
+}
--- a/lib/parser/test-utils.cc
+++ b/lib/parser/test-utils.cc
@ -0,0 +1,95 @@
+/*
+** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#include <vector>
+#include <glib.h>
+
+#include <iostream>
+#include <sstream>
+
+#include "parser.hh"
+using namespace Mux;
+
+// A single test case for one of the string-conversion helpers.
+struct Case {
+	const std::string	expr;     // input expression handed to the helper
+	bool			is_first; // flag passed through as the helper's second argument
+	const std::string	expected; // exact string the helper must return
+};
+// A list of test cases, checked in order.
+using CaseVec = std::vector<Case>;
+// The helper under test: maps (expression, is_first) to its string form.
+using ProcFunc = std::function<std::string(std::string, bool)>;
+
+
+// Run proc on every case and assert that it yields exactly the expected
+// string; in verbose test mode, also print the input, expectation and result.
+static void
+test_cases(const CaseVec& cases, ProcFunc proc)
+{
+	for (const auto& casus : cases ) {
+
+		const std::string res{proc(casus.expr, casus.is_first)};
+
+		if (g_test_verbose())
+			std::cout << "\n"
+				  << casus.expr << ' ' << casus.is_first << std::endl
+				  << "exp:" << casus.expected << std::endl
+				  << "got:" << res << std::endl;
+
+		g_assert_true (casus.expected == res);
+	}
+}
+
+// Check the date-string conversion. The timezone is pinned so that the
+// expected timestamps do not depend on the machine running the test.
+static void
+test_date ()
+{
+	g_setenv ("TZ", "Europe/Helsinki", TRUE);
+
+	CaseVec dates = {
+		// full timestamps
+		{ "2015-09-18T09:10:23", true,  "001442556623" },
+		{ "1972-12-14T09:10:23", true,  "000093165023" },
+		{ "1854-11-18T17:10:23", true,  "000000000000" },
+
+		// unparseable input maps to the boundary for that side
+		{ "fnorb",               true,  "000000000000" },
+		{ "fnorb",               false, "999999999999" },
+
+		// empty input (one-sided range) maps to the boundary as well
+		{ "",                    false, "999999999999" },
+		{ "",                    true,  "000000000000" }
+	};
+
+	const ProcFunc convert = [](std::string expr, bool is_first) {
+		return date_to_time_t_string (expr, is_first);
+	};
+
+	test_cases (dates, convert);
+}
+
+// Check the size-string conversion, including the boundaries that are used
+// for empty input.
+static void
+test_size ()
+{
+	CaseVec sizes = {
+		{ "456", true,  "0000000456" },
+		{ "",    false, "9999999999" },
+		{ "",    true,  "0000000000" },
+	};
+
+	const ProcFunc convert = [](std::string expr, bool is_first) {
+		return size_to_string (expr, is_first);
+	};
+
+	test_cases (sizes, convert);
+}
+
+
+// Test-runner entry point: register the utility tests with GLib's test
+// framework and run them.
+int
+main (int argc, char *argv[])
+{
+	static const struct {
+		const char	*path;
+		GTestFunc	 func;
+	} suites[] = {
+		{ "/utils/process-date", test_date },
+		{ "/utils/process-size", test_size },
+	};
+
+	g_test_init (&argc, &argv, NULL);
+
+	for (const auto& suite : suites)
+		g_test_add_func (suite.path, suite.func);
+
+	return g_test_run ();
+}
--- a/lib/parser/tokenize.cc
+++ b/lib/parser/tokenize.cc
@ -0,0 +1,38 @@
+/*
+**  Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#include <string>
+#include <iostream>
+
+#include "tokenizer.hh"
+
+// Small command-line driver: join all arguments into one query string (each
+// argument preceded by a space), tokenize it and print one token per line.
+int
+main (int argc, char *argv[])
+{
+	std::string query;
+
+	for (int n = 1; n < argc; ++n) {
+		query += " ";
+		query += argv[n];
+	}
+
+	for (const auto& token : Mux::tokenize (query))
+		std::cout << token << std::endl;
+
+	return 0;
+}
--- a/lib/parser/tokenizer.cc
+++ b/lib/parser/tokenizer.cc
@ -0,0 +1,128 @@
+/*
+**  Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#include "tokenizer.hh"
+#include <cctype>
+#include <iostream>
+#include <algorithm>
+
+using namespace Mux;
+
+// Is c a character that separates tokens? That is, a blank (as per
+// isblank()) or one of ':', '(', ')' and '"'.
+static bool
+is_separator (char c)
+{
+	// the <cctype> classifiers are only defined for values representable
+	// as unsigned char (or EOF); plain char may be negative for the bytes
+	// of a multi-byte UTF-8 sequence, so convert first.
+	if (isblank(static_cast<unsigned char>(c)))
+		return true;
+
+	switch (c) {
+	case ':':
+	case '(':
+	case ')':
+	case '"':
+		return true;
+	default:
+		return false;
+	}
+}
+
+
+// Turn an unquoted word into a token: an operator token when the word is
+// and/or/xor/not (in any case), a data token otherwise. The token keeps the
+// word exactly as it was spelled in the input.
+//
+// @param pos position to store in the token
+// @param val the word
+//
+// @return the token
+static Mux::Token
+op_or_value (size_t pos, const std::string& val)
+{
+	auto s = val;
+	// tolower() is only defined for values representable as unsigned char;
+	// take the character as unsigned char so that the (negative) bytes of
+	// multi-byte UTF-8 sequences are handled safely.
+	std::transform(s.begin(), s.end(), s.begin(),
+		       [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+
+	if (s == "and")
+		return Token{pos, Token::Type::And, val};
+	else if (s == "or")
+		return Token{pos, Token::Type::Or, val};
+	else if (s == "xor")
+		return Token{pos, Token::Type::Xor, val};
+	else if (s == "not")
+		return Token{pos, Token::Type::Not, val};
+	else
+		return Token{pos, Token::Type::Data, val};
+}
+
+// Undo reading one character: put kar back at the front of the remaining
+// input and move the position one step back.
+static void
+unread_char (std::string& food, char kar, size_t& pos)
+{
+	food.insert (food.begin(), kar);
+	pos -= 1;
+}
+
+// Consume the next token from the front of food.
+//
+// Characters are removed from food one at a time and pos is incremented for
+// each of them; the returned token carries the value of pos at the moment the
+// token was completed.
+//
+// @param food the remaining input; the consumed characters are removed
+// @param pos running count of consumed characters; updated
+//
+// @return the next token. When the input runs out, whatever was collected so
+// far (possibly nothing) is returned as a data token.
+static Mux::Token
+eat_token (std::string& food, size_t& pos)
+{
+	bool quoted{};		// inside a "..." section?
+	bool escaped{};		// was the previous character an unescaped backslash?
+	std::string value {};	// the characters collected for this token
+
+	while (!food.empty()) {
+
+		const auto kar = food[0];
+		food.erase(0, 1);
+		++pos;
+
+		// a backslash escapes the next character and is itself dropped;
+		// a backslash that is itself escaped falls through and is
+		// treated like any other character.
+		if (kar == '\\') {
+			escaped = !escaped;
+			if (escaped)
+				continue;
+		}
+
+		// an unescaped closing quote ends the token; quoted text is
+		// always data, it is never checked for operator words.
+		if (kar == '"' && !escaped && quoted)
+			return Token{pos, Token::Type::Data, value};
+
+		if (!quoted && !escaped && is_separator(kar)) {
+
+			// a separator ends the word collected so far; push the
+			// separator back so the next call sees it. ':' is the
+			// exception: it stays part of the word.
+			if (!value.empty() && kar != ':') {
+				unread_char (food, kar, pos);
+				return op_or_value(pos, value);
+			}
+
+			if (kar == '"')
+				quoted = true;
+
+			// skip the opening quote and blanks
+			if (quoted || isblank(kar))
+				continue;
+
+			switch (kar) {
+			case '(': return {pos, Token::Type::Open, "("};
+			case ')': return {pos, Token::Type::Close,")"};
+			default: break; // ':' -- appended to the value below
+			}
+		}
+
+		value	+= kar;
+		escaped	 = false;
+	}
+
+	// end of input: this also covers a missing closing quote. Note that
+	// the last word is returned as data without checking for operator words.
+	return {pos, Token::Type::Data, value};
+}
+
+
+// Split s into tokens by consuming one token at a time from a working copy of
+// the string until nothing is left; an empty string gives no tokens.
+Mux::Tokens
+Mux::tokenize (const std::string& s)
+{
+	Tokens		result{};
+	std::string	remaining{s};
+	size_t		consumed{0};
+
+	for (; !remaining.empty(); )
+		result.emplace_back(eat_token (remaining, consumed));
+
+	return result;
+}
--- a/lib/parser/tokenizer.hh
+++ b/lib/parser/tokenizer.hh
@ -0,0 +1,140 @@
+/*
+**  Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#ifndef __TOKENIZER_HH__
+#define __TOKENIZER_HH__
+
+#include <string>
+#include <vector>
+#include <deque>
+#include <ostream>
+#include <stdexcept>
+
+// A simple tokenizer, which turns a string into a deque of tokens
+//
+// It recognizes '(', ')', 'and', 'or', 'xor', 'not' (the operator words in any case); everything
+// else, including '*', becomes a data token.
+//
+// Note that even if we recognize those at the lexical level, they might be demoted to mere strings
+// when we're creating the parse tree.
+//
+// Furthermore, we detect ranges ("a..b") and regexps (/../) at the parser level, since we need a
+// bit more context to resolve ambiguities.
+
+namespace Mux {
+
+// A token
+struct Token {
+	enum class Type {
+		Data,		/**< e.g., banana or date:..456 */
+
+		// Brackets
+		Open,		/**< ( */
+		Close,		/**< ) */
+
+		// Unops
+		Not,		/**< logical not */
+
+		// Binops
+		And,		/**< logical and */
+		Or,             /**< logical or */
+		Xor,            /**< logical xor */
+
+		Empty,		/**< nothing */
+	};
+
+	size_t		  pos{};    /**< position in string; the tokenizer stores the number of
+				     * characters it had consumed when the token was completed */
+	Type		  type{};   /**< token type */
+	const std::string str{};   /**< data for this token */
+
+	/**
+	 * operator==
+	 *
+	 * Tokens are equal when position, type and data are all equal.
+	 *
+	 * @param rhs right-hand side
+	 *
+	 * @return true if rhs is equal to this; false otherwise
+	 */
+	bool operator==(const Token& rhs) const {
+		return  pos == rhs.pos &&
+			type == rhs.type &&
+			str == rhs.str;
+	}
+};
+
+/**
+ * operator<<
+ *
+ * @param os an output stream
+ * @param t a token type
+ *
+ * @return the updated output stream
+ */
+// Write a human-readable tag such as "<data>" or "<and>" for the token type.
+// Every enumerator has a tag, including Empty, so that printing a valid token
+// type never throws; only a value outside the enumeration (a bug) does.
+inline std::ostream&
+operator<< (std::ostream& os, Token::Type t)
+{
+	switch (t)  {
+	case Token::Type::Data:     os << "<data>"; break;
+
+	case Token::Type::Open:     os << "<open>"; break;
+	case Token::Type::Close:    os << "<close>";break;
+
+	case Token::Type::Not:      os << "<not>"; break;
+	case Token::Type::And:      os << "<and>"; break;
+	case Token::Type::Or:       os << "<or>"; break;
+	case Token::Type::Xor:      os << "<xor>"; break;
+
+	case Token::Type::Empty:    os << "<empty>"; break;
+
+	default:		// can't happen, but pacify compiler
+		throw std::runtime_error ("<<bug>>");
+	}
+
+	return os;
+}
+
+/**
+ * operator<<
+ *
+ * @param os an output stream
+ * @param t a token
+ *
+ * @return the updated output stream
+ */
+// Write a token as "<pos>: <type>", followed by " [<data>]" when the token
+// has data.
+inline std::ostream&
+operator<< (std::ostream& os, const Token& t)
+{
+	os << t.pos << ": " << t.type;
+
+	if (t.str.empty())
+		return os;
+
+	return os << " [" << t.str << "]";
+}
+
+/**
+ * Tokenize a string into a deque of tokens. The tokenization always succeeds, i.e., it ignores
+ * errors such as a missing closing '"'.
+ *
+ * @param s a string
+ *
+ * @return a deque of tokens
+ */
+using Tokens = std::deque<Token>;
+Tokens tokenize (const std::string& s);
+
+} // namespace Mux
+
+#endif /* __TOKENIZER_HH__ */
--- a/lib/parser/tree.hh
+++ b/lib/parser/tree.hh
@ -0,0 +1,104 @@
+/*
+**  Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#include <vector>
+#include <string>
+#include <iostream>
+
+#include <parser/data.hh>
+
+namespace Mux {
+
+// A node in the parse tree
+struct Node {
+	// The kind of node; the Op* kinds are operators, Value and Range carry
+	// data.
+	enum class Type {
+		Empty, // only for empty trees
+		OpAnd,
+		OpOr,
+		OpXor,
+		OpAndNot,
+		OpNot,
+		Value,
+		Range,
+		Invalid
+	};
+
+	// A node with data; ownership of the data moves into the node.
+	Node(Type _type, std::unique_ptr<Data>&& _data):
+		type{_type}, data{std::move(_data)} {}
+	// A node without data.
+	Node(Type _type): type{_type} {}
+	Node(Node&& rhs) = default;
+
+	Type			type;
+	std::unique_ptr<Data>   data;
+
+	// The printable name for a node type.
+	static constexpr const char* type_name (Type t) {
+		switch (t) {
+		case Type::Empty:    return "";
+		case Type::OpAnd:    return "and";
+		case Type::OpOr:     return "or";
+		case Type::OpXor:    return "xor";
+		case Type::OpAndNot: return "andnot";
+		case Type::OpNot:    return "not";
+		case Type::Value:    return "value";
+		case Type::Range:    return "range";
+		case Type::Invalid:  return "<invalid>";
+		default:
+			throw std::runtime_error ("bug");
+		}
+	}
+
+	// Is t one of the binary operators (and, andnot, or, xor)?
+	static constexpr bool is_binop(Type t) {
+		switch (t) {
+		case Type::OpAnd:
+		case Type::OpAndNot:
+		case Type::OpOr:
+		case Type::OpXor:
+			return true;
+		default:
+			return false;
+		}
+	}
+};
+
+// Write a node: its type name, directly followed by its data if it has any.
+inline std::ostream&
+operator<< (std::ostream& os, const Node& t)
+{
+	os << Node::type_name(t.type);
+
+	if (!t.data)
+		return os;
+
+	os << t.data;
+	return os;
+}
+
+// A parse tree: a node plus its sub-trees, in the order they were added.
+struct Tree {
+	Tree(Node&& _node): node{std::move(_node)} {}
+	Tree(Tree&& rhs) = default;
+
+	// Append child as the last sub-tree; the child is moved into this tree.
+	void add_child (Tree&& child) {
+		children.push_back(std::move(child));
+	}
+
+	// A tree is empty when its own node is of type Empty.
+	bool empty() const {
+		return Node::Type::Empty == node.type;
+	}
+
+	Node			node;
+	std::vector<Tree>	children;
+};
+
+// Write a tree as "(<node><children...>)", with each child written
+// recursively in the same form.
+inline std::ostream&
+operator<< (std::ostream& os, const Tree& tree)
+{
+	os << '(' << tree.node;
+
+	for (auto it = tree.children.begin(); it != tree.children.end(); ++it)
+		os << *it;
+
+	return os << ')';
+}
+
+} // namespace Mux
--- a/lib/parser/utils.cc
+++ b/lib/parser/utils.cc
@ -0,0 +1,349 @@
+/*
+**  Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#define GNU_SOURCE
+#include <stdio.h>
+#include <stdint.h>
+
+#include "utils.hh"
+
+#include <string.h>
+#include <iostream>
+#include <algorithm>
+
+#include <glib.h>
+
+using namespace Mux;
+
+namespace {
+
+/* Lower-case a unicode character; in addition, a few Latin letters that have
+ * no decomposition into base letter + diacritic are folded to a plain ASCII
+ * letter. Upper- and lower-case variants must fold to the same result, so
+ * that flattened strings compare equal regardless of case. */
+static gunichar
+unichar_tolower (gunichar uc)
+{
+  if (!g_unichar_isalpha(uc))
+    return uc;
+
+  if (g_unichar_get_script (uc) != G_UNICODE_SCRIPT_LATIN)
+    return g_unichar_tolower (uc);
+
+  switch (uc)
+    {
+    case 0x00e6:
+    case 0x00c6: return 'e';   /* æ, Æ */
+    case 0x00d8:               /* Ø; was missing, so Ø and ø folded differently */
+    case 0x00f8: return 'o';   /* ø */
+    case 0x0110:
+    case 0x0111: return 'd';   /* Đ, đ */
+      /* todo: many more */
+    default: return g_unichar_tolower (uc);
+    }
+}
+
+/**
+ * gx_utf8_flatten:
+ * @str: a UTF-8 string
+ * @len: the length of @str, or -1 if it is %NULL-terminated
+ *
+ * Flatten some UTF-8 string; that is, downcase it and remove any diacritics.
+ *
+ * Returns: (transfer full): a flattened string, free with g_free().
+ */
+static char*
+gx_utf8_flatten (const gchar *str, gssize len)
+{
+  g_return_val_if_fail (str, NULL);
+
+  /* decompose, so that diacritics become separate combining characters */
+  char *norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL);
+  if (!norm)
+    return NULL;
+
+  GString *flat = g_string_sized_new (strlen (norm));
+
+  const char *cur = norm;
+  while (cur && *cur)
+    {
+      const gunichar uc = g_utf8_get_char (cur);
+
+      /* keep (lower-cased) base characters; drop the combining ones */
+      if (g_unichar_combining_class (uc) == 0)
+	g_string_append_unichar (flat, unichar_tolower(uc));
+
+      cur = g_utf8_next_char (cur);
+    }
+
+  g_free (norm);
+
+  /* hand the character data to the caller; only the wrapper is freed */
+  return g_string_free (flat, FALSE);
+}
+
+} // namespace
+
+
+// Flatten str (see gx_utf8_flatten) and return the result as a std::string;
+// the result is empty when flattening fails.
+std::string // gx_utf8_flatten
+Mux::utf8_flatten (const std::string& str)
+{
+	std::string flattened;
+
+	char *raw = gx_utf8_flatten (str.c_str(), str.length());
+	if (raw) {
+		flattened = raw;
+		g_free (raw);
+	}
+
+	return flattened;
+}
+
+
+// Escape str with g_strescape() and wrap the result in double quotes; the
+// result is empty (without quotes) when escaping fails.
+std::string
+Mux::quote (const std::string& str)
+{
+	char *escaped = g_strescape (str.c_str(), NULL);
+	if (!escaped)
+		return {};
+
+	std::string quoted{"\""};
+	quoted += escaped;
+	quoted += "\"";
+
+	g_free (escaped);
+
+	return quoted;
+}
+
+ // printf-style formatting into a std::string; when formatting fails, a
+ // message is written to stderr and the result is empty.
+ std::string
+ Mux::format (const char *frm, ...)
+ {
+	 char *buf = {};
+
+	 va_list ap;
+	 va_start (ap, frm);
+	 const auto len = vasprintf (&buf, frm, ap);
+	 va_end (ap);
+
+	 if (len == -1) {
+		 std::cerr << "string format failed" << std::endl;
+		 return {};
+	 }
+
+	 std::string formatted (buf);
+	 free (buf);
+
+	 return formatted;
+ }
+
+constexpr const auto InternalDateFormat = "%012" G_GINT64_FORMAT;
+constexpr const char InternalDateMin[] = "000000000000";
+constexpr const char InternalDateMax[] = "999999999999";
+static_assert(sizeof(InternalDateMin) == 12 + 1);
+static_assert(sizeof(InternalDateMax) == 12 + 1);
+
+// The smallest (is_first) or largest (!is_first) internal date string.
+static std::string
+date_boundary (bool is_first)
+{
+	if (is_first)
+		return InternalDateMin;
+
+	return InternalDateMax;
+}
+
+// Format t as a zero-padded, 12-digit decimal string.
+std::string
+Mux::date_to_time_t_string (time_t t)
+{
+	char buf[sizeof(InternalDateMax)];
+	// the format expects a gint64, while the width of time_t is
+	// platform-dependent; convert explicitly so that the variadic
+	// argument always matches the conversion specifier.
+	snprintf (buf, sizeof(buf), InternalDateFormat, static_cast<gint64>(t));
+
+	return buf;
+}
+
+
+// Convert a relative date -- a number from 1 to 9999 followed by a unit -- to
+// the internal date string for that much time before now. The units are
+// s(econds), M(inutes), h(ours), d(ays), w(eeks), m(onths) and y(ears).
+//
+// Anything that cannot be interpreted, or that would end up before the
+// earliest date that can be represented, gives the lower date boundary.
+static std::string
+delta_ymwdhMs (const std::string& expr)
+{
+	char *endptr;
+	auto num = strtol  (expr.c_str(), &endptr, 10);
+	// require a number in range, followed by (at least) a unit character
+	if (num <= 0 || num > 9999 || !endptr || !*endptr)
+		return date_boundary (true);
+
+	int years, months, weeks, days, hours, minutes, seconds;
+	years = months = weeks = days = hours = minutes = seconds = 0;
+
+	switch (endptr[0]) {
+	case 's': seconds = num; break;
+	case 'M': minutes = num; break;
+	case 'h': hours	  = num; break;
+	case 'd': days	  = num; break;
+	case 'w': weeks	  = num; break;
+	case 'm': months  = num; break;
+	case 'y': years	  = num; break;
+	default:
+		return date_boundary (true);
+	}
+
+	GDateTime *then, *now = g_date_time_new_now_local ();
+	if (!now)
+		return date_boundary (true);
+
+	if (weeks != 0)
+		then = g_date_time_add_weeks (now, -weeks);
+	else
+		then = g_date_time_add_full (now, -years, -months,-days,
+					     -hours, -minutes, -seconds);
+
+	// the g_date_time_add functions give NULL when the result is out of
+	// range (e.g., "9999y" ends up before year 1); don't use or unref it.
+	if (!then) {
+		g_date_time_unref (now);
+		return date_boundary (true);
+	}
+
+	time_t t = MAX (0, (gint64)g_date_time_to_unix (then));
+
+	g_date_time_unref (then);
+	g_date_time_unref (now);
+
+	return date_to_time_t_string (t);
+}
+
+
+// Convert the special dates "now" and "today" to an internal date string.
+// "now" is the current time; "today" is midnight (local time) at the start of
+// today when is_first is set, and at the start of tomorrow otherwise.
+// Anything else gives the date boundary for is_first.
+static std::string
+special_date (const std::string& d, bool is_first)
+{
+	if (d == "now")
+		return date_to_time_t_string (time(NULL));
+
+	if (d != "today")
+		return date_boundary (is_first);
+
+	GDateTime *day = g_date_time_new_now_local ();
+
+	// for an upper bound, move on to tomorrow
+	if (!is_first) {
+		GDateTime *tomorrow = g_date_time_add_days (day, 1);
+		g_date_time_unref (day);
+		day = tomorrow;
+	}
+
+	// go back to the start of that day
+	GDateTime *midnight = g_date_time_add_full (day, 0, 0, 0,
+						    -g_date_time_get_hour(day),
+						    -g_date_time_get_minute (day),
+						    -g_date_time_get_second (day));
+
+	const time_t t = MAX(0, (gint64)g_date_time_to_unix (midnight));
+
+	g_date_time_unref (day);
+	g_date_time_unref (midnight);
+
+	return date_to_time_t_string ((time_t)t);
+}
+
+
+// Templates in %Y%m%d%H%M%S form; the digits the user gives overwrite the
+// template from the left, the remainder completes a partial date.
+constexpr const char UserDateMin[] = "19700101000000";
+constexpr const char UserDateMax[] = "29991231235959";
+
+// Convert a user-specified date to the internal date string.
+//
+// The date is either empty (giving the boundary for is_first), one of the
+// special dates "now" and "today", a relative date such as "2w" (only for
+// lower bounds), or a (partial) date/time of which only the digits are
+// considered, e.g. "2015-09-18T09:10:23" or "201509".
+//
+// @param dstr the date as given by the user
+// @param is_first whether this is the lower (true) or upper (false) bound
+//
+// @return the internal date string; the boundary for is_first if the date
+// cannot be interpreted or is out of range.
+std::string
+Mux::date_to_time_t_string (const std::string& dstr, bool is_first)
+{
+	gint64		 t;
+	struct tm	 tbuf;
+	GDateTime	*dtime;
+
+	/* one-sided dates */
+	if (dstr.empty())
+		return date_boundary (is_first);
+	/* special dates; check for them before the relative dates, since
+	 * both "now" and "today" contain unit characters */
+	else if (dstr == "now" || dstr == "today")
+		return special_date (dstr, is_first);
+	else if (is_first && dstr.find_first_of("ymdwhMs") != std::string::npos)
+		return delta_ymwdhMs (dstr);
+
+	/* overwrite the template with the user's digits; never write beyond
+	 * the end of the template, any surplus digits are ignored */
+	std::string date (is_first ? UserDateMin : UserDateMax);
+	size_t ndigits = 0;
+	for (const char c : dstr) {
+		if (ndigits == date.size())
+			break;
+		/* isdigit is only defined for unsigned char values */
+		if (isdigit (static_cast<unsigned char>(c)))
+			date[ndigits++] = c;
+	}
+
+	memset (&tbuf, 0, sizeof tbuf);
+	if (!strptime (date.c_str(), "%Y%m%d%H%M%S", &tbuf) &&
+	    !strptime (date.c_str(), "%Y%m%d%H%M", &tbuf) &&
+	    !strptime (date.c_str(), "%Y%m%d", &tbuf) &&
+	    !strptime (date.c_str(), "%Y%m", &tbuf) &&
+	    !strptime (date.c_str(), "%Y", &tbuf))
+		return date_boundary (is_first);
+
+	dtime = g_date_time_new_local (tbuf.tm_year + 1900,
+				       tbuf.tm_mon + 1,
+				       tbuf.tm_mday,
+				       tbuf.tm_hour,
+				       tbuf.tm_min,
+				       tbuf.tm_sec);
+	if (!dtime) {
+		g_warning ("invalid %s date '%s'",
+			   is_first ? "lower" : "upper", date.c_str());
+		return date_boundary (is_first);
+	}
+
+	t = (gint64)g_date_time_to_unix (dtime);
+	g_date_time_unref (dtime);
+
+	if (t < 0 || t > 9999999999)
+		return date_boundary (is_first);
+	else
+		return date_to_time_t_string (t);
+}
+
+
+constexpr const auto SizeFormat = "%010" G_GINT64_FORMAT;
+
+constexpr const char SizeMin[] = "0000000000";
+constexpr const char SizeMax[] = "9999999999";
+static_assert(sizeof(SizeMin) == 10 + 1);
+static_assert(sizeof(SizeMax) == 10 + 1);
+
+// The smallest (is_first) or largest (!is_first) internal size string.
+static std::string
+size_boundary (bool is_first)
+{
+	if (is_first)
+		return SizeMin;
+
+	return SizeMax;
+}
+
+// Format size as a zero-padded, 10-digit decimal string.
+std::string
+Mux::size_to_string (int64_t size)
+{
+	char digits[sizeof(SizeMax)];
+
+	snprintf (digits, sizeof(digits), SizeFormat, size);
+
+	return std::string (digits);
+}
+
+// Convert a user-specified size -- a decimal number with an optional
+// (case-insensitive) unit b, k/kb, m/mb or g/gb -- to the internal size
+// string, with k, m and g standing for powers of 1024.
+//
+// An empty value (a one-sided range) and a value without any number in it
+// give the size boundary for is_first.
+std::string
+Mux::size_to_string (const std::string& val, bool is_first)
+{
+	std::string	 str;
+	GRegex		*rx;
+	GMatchInfo	*minfo;
+
+	/* one-sided ranges */
+	if (val.empty())
+		return size_boundary (is_first);
+
+	// NOTE(review): the pattern is not anchored, so it matches the first
+	// number anywhere in val (e.g. "x12y" gives 12) -- confirm that this
+	// leniency is intended.
+	rx = g_regex_new ("(\\d+)(b|k|kb|m|mb|g|gb)?",
+			  G_REGEX_CASELESS, (GRegexMatchFlags)0, NULL);
+	minfo = NULL;
+	if (g_regex_match (rx, val.c_str(), (GRegexMatchFlags)0, &minfo)) {
+		gint64 size;
+		char *s;
+
+		// group 1: the number
+		s = g_match_info_fetch (minfo, 1);
+		size = atoll (s);
+		g_free (s);
+
+		// group 2: the unit, if any; only its first letter matters
+		s = g_match_info_fetch (minfo, 2);
+		switch (s ? g_ascii_tolower(s[0]) : 0) {
+		case 'k': size *= 1024; break;
+		case 'm': size *= (1024 * 1024); break;
+		case 'g': size *= (1024 * 1024 * 1024); break;
+		default: break; // bytes, or no unit
+		}
+
+		g_free (s);
+		str = size_to_string (size);
+	} else
+		str = size_boundary (is_first);
+
+	g_regex_unref (rx);
+	g_match_info_unref (minfo);
+
+	return str;
+}
--- a/lib/parser/utils.hh
+++ b/lib/parser/utils.hh
@ -0,0 +1,100 @@
+/*
+**  Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#include <string>
+
+#ifndef __UTILS_HH__
+#define __UTILS_HH__
+
+namespace Mux {
+
+/**
+ * Flatten a string -- downcase and fold diacritics etc.
+ *
+ * @param str a string
+ *
+ * @return a flattened string
+ */
+std::string utf8_flatten (const std::string& str);
+
+/**
+ * Quote & escape a string
+ *
+ * @param str a string
+ *
+ * @return quoted string
+ */
+std::string quote (const std::string& str);
+
+/**
+ * Format a string, printf style
+ *
+ * @param frm format string
+ * @param ... parameters
+ *
+ * @return a formatted string
+ */
+std::string format (const char *frm, ...)
+	__attribute__((format(printf, 1, 2)));
+
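+/**
+ * Convert a date, as specified by the user, to the corresponding time_t,
+ * expressed as a zero-padded, 12-digit decimal string
+ *
+ * @param date the date, e.g. an ISO date such as 2015-09-18T09:10:23
+ * @param first whether the date is the lower (true) or upper (false) bound
+ *
+ * @return the time as a string; dates that cannot be interpreted give the
+ * lowest (first) or highest (!first) possible value
+ */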
+std::string date_to_time_t_string (const std::string& date, bool first);
+
+/**
+ * A time_t expressed as a zero-padded, 12-digit decimal string
+ *
+ * @param t a time_t value
+ *
+ * @return the time as a string
+ */
+std::string date_to_time_t_string (time_t t);
+
+
+
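+/**
+ * Convert a size string, such as "456" or "10k", to a size in bytes
+ *
+ * @param sizestr the size string
+ * @param first whether the size is the lower (true) or upper (false) bound
+ *
+ * @return the size expressed as a zero-padded, 10-digit string with the
+ * decimal number of bytes
+ */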
+std::string size_to_string (const std::string& sizestr, bool first);
+
+/**
+ * Convert a size in bytes into its string form
+ *
+ * @param size the size in bytes
+ *
+ * @return the size expressed as a zero-padded, 10-digit string with the
+ * decimal number of bytes
+ */
+std::string size_to_string (int64_t size);
+
+} // namespace Mux
+
+#endif /* __UTILS_HH__ */
--- a/lib/parser/xapian.cc
+++ b/lib/parser/xapian.cc
@ -0,0 +1,75 @@
+/*
+** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+#include <xapian.h>
+#include "parser/xapian.hh"
+
+using namespace Mux;
+
+// Build the Xapian query for an operator node, combining the queries for its
+// children. Throws std::runtime_error for a node that is not an operator, or
+// for a 'not' that does not have exactly one child.
+static Xapian::Query
+xapian_query_op (const Mux::Tree& tree)
+{
+	// OpNot x ::= <all> AND NOT x
+	if (tree.node.type == Node::Type::OpNot) {
+		if (tree.children.size() != 1)
+			throw std::runtime_error ("invalid # of children");
+
+		return Xapian::Query (Xapian::Query::OP_AND_NOT,
+				      Xapian::Query::MatchAll,
+				      xapian_query(tree.children.front()));
+	}
+
+	Xapian::Query::op op;
+	switch (tree.node.type) {
+	case Node::Type::OpAnd:
+		op = Xapian::Query::OP_AND;
+		break;
+	case Node::Type::OpOr:
+		op = Xapian::Query::OP_OR;
+		break;
+	case Node::Type::OpXor:
+		op = Xapian::Query::OP_XOR;
+		break;
+	case Node::Type::OpAndNot:
+		op = Xapian::Query::OP_AND_NOT;
+		break;
+	default:
+		throw std::runtime_error ("invalid op");	// bug
+	}
+
+	std::vector<Xapian::Query> subqueries;
+	for (const auto& child: tree.children)
+		subqueries.push_back(xapian_query(child));
+
+	return Xapian::Query(op, subqueries.begin(), subqueries.end());
+}
+
+// Transform a parse tree into a Xapian query: an empty tree gives an empty
+// query, operator nodes combine the queries for their children, a value node
+// becomes a term query (prefix + value) and a range node a value-range query.
+//
+// Throws std::runtime_error for trees that cannot be transformed: nodes of
+// another type, or value/range nodes that lack data of the matching kind.
+Xapian::Query
+Mux::xapian_query (const Mux::Tree& tree)
+{
+	switch (tree.node.type) {
+	case Node::Type::Empty:
+		return Xapian::Query();
+	case Node::Type::OpNot:
+	case Node::Type::OpAnd:
+	case Node::Type::OpOr:
+	case Node::Type::OpXor:
+	case Node::Type::OpAndNot:
+		return xapian_query_op (tree);
+	case Node::Type::Value: {
+		const auto v = dynamic_cast<Value*> (tree.node.data.get());
+		// the cast gives nullptr when the data is missing or of
+		// another kind; don't dereference it in that case
+		if (!v)
+			throw std::runtime_error ("invalid value node"); // bug
+		return Xapian::Query(v->prefix + v->value);
+	}
+	case Node::Type::Range: {
+		const auto r = dynamic_cast<Range*> (tree.node.data.get());
+		if (!r)
+			throw std::runtime_error ("invalid range node"); // bug
+		return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
+				     (Xapian::valueno)r->id, r->lower, r->upper);
+	}
+	default:
+		throw std::runtime_error ("invalid query"); // bug
+	}
+}
--- a/lib/parser/xapian.hh
+++ b/lib/parser/xapian.hh
@ -0,0 +1,40 @@
+/*
+** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+**  This library is free software; you can redistribute it and/or
+**  modify it under the terms of the GNU Lesser General Public License
+**  as published by the Free Software Foundation; either version 2.1
+**  of the License, or (at your option) any later version.
+**
+**  This library is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+**  Lesser General Public License for more details.
+**
+**  You should have received a copy of the GNU Lesser General Public
+**  License along with this library; if not, write to the Free
+**  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+**  02110-1301, USA.
+*/
+
+
+#ifndef __XAPIAN_HH__
+#define __XAPIAN_HH__
+
+#include <xapian.h>
+#include <parser/parser.hh>
+
+namespace Mux {
+
+/**
+ * Transform a parse-tree into a Xapian query object
+ *
+ * @param tree a parse tree
+ *
+ * @return a Xapian query object
+ *
+ * @throws std::runtime_error if the tree cannot be transformed, e.g. when it
+ * contains a node of an unsupported type or a 'not' that does not have
+ * exactly one child
+ */
+Xapian::Query xapian_query (const Mux::Tree& tree);
+
+};
+
+#endif /* __XAPIAN_HH__ */