mirror of https://github.com/djcb/mu.git
Merge branch 'wip/djcb/new-query-parser'
This commit is contained in:
commit
d528b4b818
10
NEWS.org
10
NEWS.org
|
@ -40,8 +40,14 @@
|
||||||
rather than 250000; the latter was too high for systems with limited
|
rather than 250000; the latter was too high for systems with limited
|
||||||
memory. You can of course change that with ~--batch-size=...~
|
memory. You can of course change that with ~--batch-size=...~
|
||||||
|
|
||||||
- restore expansion for path options such as ~--maildir=~/Maildir~ for shells
|
- restore expansion for path options such as ~--maildir=~/Maildir~ (to e.g.
|
||||||
that don't do that, such as Bash.
|
~/home/user/Maildir~) for shells that don't do that, such as Bash.
|
||||||
|
|
||||||
|
- updated query-parser; this is (should be) compatible with the older one,
|
||||||
|
apart from a number of fixes. There is a new option ~--analyze~ to ~mu find~
|
||||||
|
which shows the parsed query in a human-readable s-expression form; this
|
||||||
|
can be used to debug your queries (this replaces the older
|
||||||
|
~--format=mquery|xquery~)
|
||||||
|
|
||||||
*** mu4e
|
*** mu4e
|
||||||
|
|
||||||
|
|
|
@ -87,10 +87,11 @@ struct Indexer::Private {
|
||||||
was_empty_{store.empty()} {
|
was_empty_{store.empty()} {
|
||||||
|
|
||||||
mu_message("created indexer for {} -> "
|
mu_message("created indexer for {} -> "
|
||||||
"{} (batch-size: {}; was-empty: {})",
|
"{} (batch-size: {}; was-empty: {}; ngrams: {})",
|
||||||
store.root_maildir(), store.path(),
|
store.root_maildir(), store.path(),
|
||||||
store.config().get<Mu::Config::Id::BatchSize>(),
|
store.config().get<Mu::Config::Id::BatchSize>(),
|
||||||
was_empty_);
|
was_empty_,
|
||||||
|
store.config().get<Mu::Config::Id::SupportNgrams>());
|
||||||
}
|
}
|
||||||
|
|
||||||
~Private() {
|
~Private() {
|
||||||
|
@ -238,7 +239,7 @@ Indexer::Private::add_message(const std::string& path)
|
||||||
*
|
*
|
||||||
* std::unique_lock lock{w_lock_};
|
* std::unique_lock lock{w_lock_};
|
||||||
*/
|
*/
|
||||||
auto msg{Message::make_from_path(path)};
|
auto msg{Message::make_from_path(path, store_.message_options())};
|
||||||
if (!msg) {
|
if (!msg) {
|
||||||
mu_warning("failed to create message from {}: {}", path, msg.error().what());
|
mu_warning("failed to create message from {}: {}", path, msg.error().what());
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
## Copyright (C) 2021-2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
## Copyright (C) 2021-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
##
|
##
|
||||||
## This program is free software; you can redistribute it and/or modify
|
## This program is free software; you can redistribute it and/or modify
|
||||||
## it under the terms of the GNU General Public License as published by
|
## it under the terms of the GNU General Public License as published by
|
||||||
|
@ -26,16 +26,17 @@ lib_mu=static_library(
|
||||||
'mu-config.cc',
|
'mu-config.cc',
|
||||||
'mu-contacts-cache.cc',
|
'mu-contacts-cache.cc',
|
||||||
'mu-maildir.cc',
|
'mu-maildir.cc',
|
||||||
'mu-parser.cc',
|
|
||||||
'mu-query-match-deciders.cc',
|
'mu-query-match-deciders.cc',
|
||||||
'mu-query-threads.cc',
|
'mu-query-threads.cc',
|
||||||
'mu-query.cc',
|
'mu-query.cc',
|
||||||
'mu-script.cc',
|
'mu-script.cc',
|
||||||
'mu-server.cc',
|
'mu-server.cc',
|
||||||
'mu-store.cc',
|
'mu-store.cc',
|
||||||
'mu-tokenizer.cc',
|
'mu-xapian-db.cc',
|
||||||
'mu-xapian.cc',
|
# query-parser
|
||||||
'mu-xapian-db.cc'
|
'mu-query-processor.cc',
|
||||||
|
'mu-query-parser.cc',
|
||||||
|
'mu-query-xapianizer.cc'
|
||||||
],
|
],
|
||||||
dependencies: [
|
dependencies: [
|
||||||
glib_dep,
|
glib_dep,
|
||||||
|
@ -46,8 +47,7 @@ lib_mu=static_library(
|
||||||
config_h_dep,
|
config_h_dep,
|
||||||
lib_mu_utils_dep,
|
lib_mu_utils_dep,
|
||||||
lib_mu_message_dep,
|
lib_mu_message_dep,
|
||||||
lib_mu_index_dep
|
lib_mu_index_dep],
|
||||||
],
|
|
||||||
install: false)
|
install: false)
|
||||||
|
|
||||||
|
|
||||||
|
@ -57,14 +57,32 @@ lib_mu_dep = declare_dependency(
|
||||||
include_directories:
|
include_directories:
|
||||||
include_directories(['.', '..']))
|
include_directories(['.', '..']))
|
||||||
|
|
||||||
# dev helpers
|
#
|
||||||
tokenize = executable(
|
# query parser dev helpers
|
||||||
'tokenize',
|
#
|
||||||
[ 'mu-tokenizer.cc', 'tokenize.cc' ],
|
process_query = executable('process-query', [ 'mu-query-processor.cc'],
|
||||||
dependencies: [ lib_mu_utils_dep, glib_dep ],
|
install: false,
|
||||||
install: false)
|
cpp_args: ['-DBUILD_PROCESS_QUERY'],
|
||||||
|
dependencies: [glib_dep, lib_mu_dep])
|
||||||
|
|
||||||
# actual tests
|
parse_query = executable( 'parse-query', [ 'mu-query-parser.cc' ],
|
||||||
|
install: false,
|
||||||
|
cpp_args: ['-DBUILD_PARSE_QUERY'],
|
||||||
|
dependencies: [glib_dep, lib_mu_dep])
|
||||||
|
|
||||||
|
parse_query_expand = executable( 'parse-query-expand', [ 'mu-query-parser.cc' ],
|
||||||
|
install: false,
|
||||||
|
cpp_args: ['-DBUILD_PARSE_QUERY_EXPAND'],
|
||||||
|
dependencies: [glib_dep, lib_mu_dep])
|
||||||
|
|
||||||
|
xapian_query = executable('xapianize-query', [ 'mu-query-xapianizer.cc' ],
|
||||||
|
install: false,
|
||||||
|
cpp_args: ['-DBUILD_XAPIANIZE_QUERY'],
|
||||||
|
dependencies: [glib_dep, lib_mu_dep])
|
||||||
|
|
||||||
|
#
|
||||||
|
# unit tests
|
||||||
|
#
|
||||||
|
|
||||||
test('test-threads',
|
test('test-threads',
|
||||||
executable('test-threads',
|
executable('test-threads',
|
||||||
|
@ -86,4 +104,25 @@ test('test-config',
|
||||||
cpp_args: ['-DBUILD_TESTS'],
|
cpp_args: ['-DBUILD_TESTS'],
|
||||||
dependencies: [glib_dep, lib_mu_dep]))
|
dependencies: [glib_dep, lib_mu_dep]))
|
||||||
|
|
||||||
|
test('test-query-processor',
|
||||||
|
executable('test-query-processor',
|
||||||
|
'mu-query-processor.cc',
|
||||||
|
install: false,
|
||||||
|
cpp_args: ['-DBUILD_TESTS'],
|
||||||
|
dependencies: [lib_mu_dep]))
|
||||||
|
|
||||||
|
test('test-query-parser',
|
||||||
|
executable('test-query-parser',
|
||||||
|
'mu-query-parser.cc',
|
||||||
|
install: false,
|
||||||
|
cpp_args: ['-DBUILD_TESTS'],
|
||||||
|
dependencies: [lib_mu_dep]))
|
||||||
|
|
||||||
|
test('test-query-xapianizer',
|
||||||
|
executable('test-query-xapianizer',
|
||||||
|
'mu-query-xapianizer.cc',
|
||||||
|
install: false,
|
||||||
|
cpp_args: ['-DBUILD_TESTS'],
|
||||||
|
dependencies: [lib_mu_dep]))
|
||||||
|
|
||||||
subdir('tests')
|
subdir('tests')
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
## Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
## Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
##
|
##
|
||||||
## This program is free software; you can redistribute it and/or modify
|
## This program is free software; you can redistribute it and/or modify
|
||||||
## it under the terms of the GNU General Public License as published by
|
## it under the terms of the GNU General Public License as published by
|
||||||
|
@ -38,7 +38,7 @@ lib_mu_message=static_library(
|
||||||
|
|
||||||
lib_mu_message_dep = declare_dependency(
|
lib_mu_message_dep = declare_dependency(
|
||||||
link_with: lib_mu_message,
|
link_with: lib_mu_message,
|
||||||
dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep ],
|
dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep, config_h_dep ],
|
||||||
include_directories:
|
include_directories:
|
||||||
include_directories(['.', '..']))
|
include_directories(['.', '..']))
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
**
|
**
|
||||||
** This program is free software; you can redistribute it and/or modify it
|
** This program is free software; you can redistribute it and/or modify it
|
||||||
** under the terms of the GNU General Public License as published by the
|
** under the terms of the GNU General Public License as published by the
|
||||||
|
@ -16,6 +16,7 @@
|
||||||
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||||
**
|
**
|
||||||
*/
|
*/
|
||||||
|
#include "config.h"
|
||||||
|
|
||||||
#include "mu-document.hh"
|
#include "mu-document.hh"
|
||||||
#include "mu-message.hh"
|
#include "mu-message.hh"
|
||||||
|
@ -31,9 +32,14 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utils/mu-utils.hh>
|
#include <utils/mu-utils.hh>
|
||||||
|
|
||||||
|
|
||||||
using namespace Mu;
|
using namespace Mu;
|
||||||
|
|
||||||
|
// backward compat
|
||||||
|
#ifndef HAVE_XAPIAN_FLAG_NGRAMS
|
||||||
|
#define FLAG_NGRAMS FLAG_CJK_NGRAM
|
||||||
|
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
|
||||||
|
|
||||||
|
|
||||||
const Xapian::Document&
|
const Xapian::Document&
|
||||||
Document::xapian_document() const
|
Document::xapian_document() const
|
||||||
{
|
{
|
||||||
|
@ -58,16 +64,29 @@ Document::put_prop(const Field& field, SexpType&& val)
|
||||||
std::forward<SexpType>(val));
|
std::forward<SexpType>(val));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static Xapian::TermGenerator
|
||||||
|
make_term_generator(Xapian::Document& doc, Document::Options opts)
|
||||||
|
{
|
||||||
|
Xapian::TermGenerator termgen;
|
||||||
|
|
||||||
|
if (any_of(opts & Document::Options::SupportNgrams))
|
||||||
|
termgen.set_flags(Xapian::TermGenerator::FLAG_NGRAMS);
|
||||||
|
|
||||||
|
termgen.set_document(doc);
|
||||||
|
|
||||||
|
return termgen;
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val)
|
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val,
|
||||||
|
Document::Options opts)
|
||||||
{
|
{
|
||||||
if (field.is_normal_term()) {
|
if (field.is_normal_term()) {
|
||||||
doc.add_term(field.xapian_term(val));
|
doc.add_term(field.xapian_term(val));
|
||||||
} else if (field.is_boolean_term()) {
|
} else if (field.is_boolean_term()) {
|
||||||
doc.add_boolean_term(field.xapian_term(val));
|
doc.add_boolean_term(field.xapian_term(val));
|
||||||
} else if (field.is_indexable_term()) {
|
} else if (field.is_indexable_term()) {
|
||||||
Xapian::TermGenerator termgen;
|
auto&& termgen{make_term_generator(doc, opts)};
|
||||||
termgen.set_document(doc);
|
|
||||||
termgen.index_text(utf8_flatten(val), 1, field.xapian_term());
|
termgen.index_text(utf8_flatten(val), 1, field.xapian_term());
|
||||||
/* also add as 'normal' term, so some queries where the indexer
|
/* also add as 'normal' term, so some queries where the indexer
|
||||||
* eats special chars also match */
|
* eats special chars also match */
|
||||||
|
@ -88,7 +107,7 @@ Document::add(Field::Id id, const std::string& val)
|
||||||
xdoc_.add_value(field.value_no(), val);
|
xdoc_.add_value(field.value_no(), val);
|
||||||
|
|
||||||
if (field.is_searchable())
|
if (field.is_searchable())
|
||||||
add_search_term(xdoc_, field, val);
|
add_search_term(xdoc_, field, val, options_);
|
||||||
|
|
||||||
if (field.include_in_sexp())
|
if (field.include_in_sexp())
|
||||||
put_prop(field, val);
|
put_prop(field, val);
|
||||||
|
@ -107,7 +126,7 @@ Document::add(Field::Id id, const std::vector<std::string>& vals)
|
||||||
if (field.is_searchable())
|
if (field.is_searchable())
|
||||||
std::for_each(vals.begin(), vals.end(),
|
std::for_each(vals.begin(), vals.end(),
|
||||||
[&](const auto& val) {
|
[&](const auto& val) {
|
||||||
add_search_term(xdoc_, field, val); });
|
add_search_term(xdoc_, field, val, options_); });
|
||||||
|
|
||||||
if (field.include_in_sexp()) {
|
if (field.include_in_sexp()) {
|
||||||
Sexp elms{};
|
Sexp elms{};
|
||||||
|
@ -149,9 +168,7 @@ Document::add(Field::Id id, const Contacts& contacts)
|
||||||
std::vector<std::string> cvec;
|
std::vector<std::string> cvec;
|
||||||
|
|
||||||
const std::string sepa2(1, SepaChar2);
|
const std::string sepa2(1, SepaChar2);
|
||||||
|
auto&& termgen{make_term_generator(xdoc_, options_)};
|
||||||
Xapian::TermGenerator termgen;
|
|
||||||
termgen.set_document(xdoc_);
|
|
||||||
|
|
||||||
for (auto&& contact: contacts) {
|
for (auto&& contact: contacts) {
|
||||||
|
|
||||||
|
|
|
@ -41,17 +41,27 @@ namespace Mu {
|
||||||
*/
|
*/
|
||||||
class Document {
|
class Document {
|
||||||
public:
|
public:
|
||||||
|
enum struct Options {
|
||||||
|
None = 0,
|
||||||
|
SupportNgrams = 1 << 0, /**< Support ngrams, as used in
|
||||||
|
* CJK and other languages. */
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Construct a message for a new Xapian Document
|
* Construct a message for a new Xapian Document
|
||||||
|
*
|
||||||
|
* @param opts behavioral options
|
||||||
*/
|
*/
|
||||||
Document() {}
|
Document(Options opts = Options::None): options_{opts} {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Construct a message document based on an existing Xapian document.
|
* Construct a message document based on an existing Xapian document.
|
||||||
*
|
*
|
||||||
* @param doc
|
* @param doc
|
||||||
|
* @param opts behavioral options
|
||||||
*/
|
*/
|
||||||
Document(const Xapian::Document& doc): xdoc_{doc} {}
|
Document(const Xapian::Document& doc, Options opts = Options::None):
|
||||||
|
xdoc_{doc}, options_{opts} {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* DTOR
|
* DTOR
|
||||||
|
@ -240,11 +250,12 @@ private:
|
||||||
return cached_sexp_;
|
return cached_sexp_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
mutable Xapian::Document xdoc_;
|
mutable Xapian::Document xdoc_;
|
||||||
|
Options options_;
|
||||||
mutable Sexp cached_sexp_;
|
mutable Sexp cached_sexp_;
|
||||||
mutable bool dirty_sexp_{}; /* xdoc's sexp is outdated */
|
mutable bool dirty_sexp_{}; /* xdoc's sexp is outdated */
|
||||||
};
|
};
|
||||||
|
MU_ENABLE_BITOPS(Document::Options);
|
||||||
|
|
||||||
} // namespace Mu
|
} // namespace Mu
|
||||||
|
|
||||||
|
|
|
@ -207,7 +207,10 @@ struct Field {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline bool operator==(const Field& f1, const Field& f2) { return f1.id == f2.id; }
|
// equality
|
||||||
|
static inline constexpr bool operator==(const Field& f1, const Field& f2) { return f1.id == f2.id; }
|
||||||
|
static inline constexpr bool operator==(const Field& f1, const Field::Id id) { return f1.id == id; }
|
||||||
|
|
||||||
|
|
||||||
MU_ENABLE_BITOPS(Field::Flag);
|
MU_ENABLE_BITOPS(Field::Flag);
|
||||||
|
|
||||||
|
@ -594,20 +597,5 @@ Option<Field> field_from_number(size_t id)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get a fmt-printable representation of Field for fmt
|
|
||||||
*
|
|
||||||
* @param field a Field
|
|
||||||
*
|
|
||||||
* @return a printable representation
|
|
||||||
*/
|
|
||||||
static inline constexpr auto format_as(const Field& field) {
|
|
||||||
return field.name;
|
|
||||||
}
|
|
||||||
static inline constexpr auto format_as(const Field::Id id) {
|
|
||||||
return format_as(field_from_id(id));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace Mu
|
} // namespace Mu
|
||||||
#endif /* MU_FIELDS_HH__ */
|
#endif /* MU_FIELDS_HH__ */
|
||||||
|
|
|
@ -45,9 +45,10 @@
|
||||||
using namespace Mu;
|
using namespace Mu;
|
||||||
|
|
||||||
struct Message::Private {
|
struct Message::Private {
|
||||||
Private(Message::Options options): opts{options} {}
|
Private(Message::Options options):
|
||||||
|
opts{options}, doc{doc_opts(opts)} {}
|
||||||
Private(Message::Options options, Xapian::Document&& xdoc):
|
Private(Message::Options options, Xapian::Document&& xdoc):
|
||||||
opts{options}, doc{std::move(xdoc)} {}
|
opts{options}, doc{std::move(xdoc), doc_opts(opts)} {}
|
||||||
|
|
||||||
Message::Options opts;
|
Message::Options opts;
|
||||||
Document doc;
|
Document doc;
|
||||||
|
@ -70,6 +71,13 @@ struct Message::Private {
|
||||||
Option<std::string> embedded;
|
Option<std::string> embedded;
|
||||||
|
|
||||||
Option<std::string> language; /* body ISO language code */
|
Option<std::string> language; /* body ISO language code */
|
||||||
|
|
||||||
|
private:
|
||||||
|
Document::Options doc_opts(Message::Options mopts) {
|
||||||
|
return any_of(opts & Message::Options::SupportNgrams) ?
|
||||||
|
Document::Options::SupportNgrams :
|
||||||
|
Document::Options::None;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -176,6 +184,11 @@ Message::document() const
|
||||||
return priv_->doc;
|
return priv_->doc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Message::Options
|
||||||
|
Message::options() const
|
||||||
|
{
|
||||||
|
return priv_->opts;
|
||||||
|
}
|
||||||
|
|
||||||
unsigned
|
unsigned
|
||||||
Message::docid() const
|
Message::docid() const
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
**
|
**
|
||||||
** This program is free software; you can redistribute it and/or modify it
|
** This program is free software; you can redistribute it and/or modify it
|
||||||
** under the terms of the GNU General Public License as published by the
|
** under the terms of the GNU General Public License as published by the
|
||||||
|
@ -49,8 +49,10 @@ public:
|
||||||
Decrypt = 1 << 0, /**< Attempt to decrypt */
|
Decrypt = 1 << 0, /**< Attempt to decrypt */
|
||||||
RetrieveKeys = 1 << 1, /**< Auto-retrieve crypto keys (implies network
|
RetrieveKeys = 1 << 1, /**< Auto-retrieve crypto keys (implies network
|
||||||
* access) */
|
* access) */
|
||||||
AllowRelativePath = 1 << 2, /**< Allow relateive paths for filename
|
AllowRelativePath = 1 << 2, /**< Allow relative paths for filename
|
||||||
* in make_from_path */
|
* in make_from_path */
|
||||||
|
SupportNgrams = 1 << 3, /**< Support ngrams, as used in
|
||||||
|
* CJK and other languages. */
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -60,7 +62,6 @@ public:
|
||||||
*/
|
*/
|
||||||
Message(Message&& other) noexcept;
|
Message(Message&& other) noexcept;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* operator=
|
* operator=
|
||||||
*
|
*
|
||||||
|
@ -147,6 +148,14 @@ public:
|
||||||
const Document& document() const;
|
const Document& document() const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The message options for this message
|
||||||
|
*
|
||||||
|
* @return message options
|
||||||
|
*/
|
||||||
|
Options options() const;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the document-id, or 0 if non-existent.
|
* Get the document-id, or 0 if non-existent.
|
||||||
*
|
*
|
||||||
|
|
|
@ -51,6 +51,8 @@ struct Property {
|
||||||
PersonalAddresses, /**< List of personal e-mail addresses */
|
PersonalAddresses, /**< List of personal e-mail addresses */
|
||||||
RootMaildir, /**< Root maildir path */
|
RootMaildir, /**< Root maildir path */
|
||||||
SchemaVersion, /**< Xapian DB schema version */
|
SchemaVersion, /**< Xapian DB schema version */
|
||||||
|
SupportNgrams, /**< Support ngrams for indexing & querying
|
||||||
|
* for e.g. CJK languages */
|
||||||
/* <private> */
|
/* <private> */
|
||||||
_count_ /* Number of Ids */
|
_count_ /* Number of Ids */
|
||||||
};
|
};
|
||||||
|
@ -61,12 +63,13 @@ struct Property {
|
||||||
enum struct Flags {
|
enum struct Flags {
|
||||||
None = 0, /**< Nothing in particular */
|
None = 0, /**< Nothing in particular */
|
||||||
ReadOnly = 1 << 0, /**< Property is read-only for external use
|
ReadOnly = 1 << 0, /**< Property is read-only for external use
|
||||||
* (but can change from within the store) */
|
* (but can change from within the store) */
|
||||||
Configurable = 1 << 1, /**< A user-configurable parameter; name
|
Configurable = 1 << 1, /**< A user-configurable parameter; name
|
||||||
* starts with 'conf-' */
|
* starts with 'conf-' */
|
||||||
Internal = 1 << 2, /**< Mu-internal field */
|
Internal = 1 << 2, /**< Mu-internal field */
|
||||||
};
|
};
|
||||||
enum struct Type {
|
enum struct Type {
|
||||||
|
Boolean, /**< Some boolean value */
|
||||||
Number, /**< Some number */
|
Number, /**< Some number */
|
||||||
Timestamp, /**< Timestamp number */
|
Timestamp, /**< Timestamp number */
|
||||||
Path, /**< Path string */
|
Path, /**< Path string */
|
||||||
|
@ -176,6 +179,14 @@ public:
|
||||||
{},
|
{},
|
||||||
"Version of the Xapian database schema"
|
"Version of the Xapian database schema"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
Id::SupportNgrams,
|
||||||
|
Type::Boolean,
|
||||||
|
Flags::Configurable,
|
||||||
|
"support-ngrams",
|
||||||
|
{},
|
||||||
|
"Support n-grams for working with CJK and other languages"
|
||||||
|
},
|
||||||
}};
|
}};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -229,6 +240,9 @@ public:
|
||||||
});
|
});
|
||||||
if constexpr (prop.type == Type::Number)
|
if constexpr (prop.type == Type::Number)
|
||||||
return static_cast<size_t>(str.empty() ? 0 : std::atoll(str.c_str()));
|
return static_cast<size_t>(str.empty() ? 0 : std::atoll(str.c_str()));
|
||||||
|
if constexpr (prop.type == Type::Boolean)
|
||||||
|
return static_cast<size_t>(str.empty() ? false :
|
||||||
|
std::atol(str.c_str()) != 0);
|
||||||
else if constexpr (prop.type == Type::Timestamp)
|
else if constexpr (prop.type == Type::Timestamp)
|
||||||
return static_cast<time_t>(str.empty() ? 0 : std::atoll(str.c_str()));
|
return static_cast<time_t>(str.empty() ? 0 : std::atoll(str.c_str()));
|
||||||
else if constexpr (prop.type == Type::Path || prop.type == Type::String)
|
else if constexpr (prop.type == Type::Path || prop.type == Type::String)
|
||||||
|
@ -257,6 +271,8 @@ public:
|
||||||
const auto strval = std::invoke([&]{
|
const auto strval = std::invoke([&]{
|
||||||
if constexpr (prop.type == Type::Number || prop.type == Type::Timestamp)
|
if constexpr (prop.type == Type::Number || prop.type == Type::Timestamp)
|
||||||
return mu_format("{}", static_cast<int64_t>(val));
|
return mu_format("{}", static_cast<int64_t>(val));
|
||||||
|
if constexpr (prop.type == Type::Boolean)
|
||||||
|
return val ? "1" : "0";
|
||||||
else if constexpr (prop.type == Type::Path || prop.type == Type::String)
|
else if constexpr (prop.type == Type::Path || prop.type == Type::String)
|
||||||
return std::string{val};
|
return std::string{val};
|
||||||
else if constexpr (prop.type == Type::StringList)
|
else if constexpr (prop.type == Type::StringList)
|
||||||
|
|
508
lib/mu-parser.cc
508
lib/mu-parser.cc
|
@ -1,508 +0,0 @@
|
||||||
/*
|
|
||||||
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
|
||||||
**
|
|
||||||
** This library is free software; you can redistribute it and/or
|
|
||||||
** modify it under the terms of the GNU Lesser General Public License
|
|
||||||
** as published by the Free Software Foundation; either version 2.1
|
|
||||||
** of the License, or (at your option) any later version.
|
|
||||||
**
|
|
||||||
** This library is distributed in the hope that it will be useful,
|
|
||||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
** Lesser General Public License for more details.
|
|
||||||
**
|
|
||||||
** You should have received a copy of the GNU Lesser General Public
|
|
||||||
** License along with this library; if not, write to the Free
|
|
||||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
|
||||||
** 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
#include "mu-parser.hh"
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <limits>
|
|
||||||
|
|
||||||
#include "mu-tokenizer.hh"
|
|
||||||
#include "utils/mu-utils.hh"
|
|
||||||
#include "utils/mu-error.hh"
|
|
||||||
#include "utils/mu-regex.hh"
|
|
||||||
#include "message/mu-message.hh"
|
|
||||||
|
|
||||||
using namespace Mu;
|
|
||||||
|
|
||||||
// 3 precedence levels: units (NOT,()) > factors (AND) > terms (OR, XOR)
|
|
||||||
|
|
||||||
// query -> <term-1> | ε
|
|
||||||
// <term-1> -> <factor-1> <term-2> | ε
|
|
||||||
// <term-2> -> OR|XOR <term-1> | ε
|
|
||||||
// <factor-1> -> <unit> <factor-2> | ε
|
|
||||||
// <factor-2> -> [AND]|AND NOT <factor-1> | ε
|
|
||||||
// <unit> -> [NOT] <term-1> | ( <term-1> ) | <data>
|
|
||||||
// <data> -> <value> | <range> | <regex>
|
|
||||||
// <value> -> [field:]value
|
|
||||||
// <range> -> [field:][lower]..[upper]
|
|
||||||
// <regex> -> [field:]/regex/
|
|
||||||
|
|
||||||
#define BUG(...) \
|
|
||||||
Mu::Error(Error::Code::Internal, "BUG @ line {}", __LINE__);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the "shortcut"/internal fields for the given fieldstr or empty if there is none
|
|
||||||
*
|
|
||||||
* @param fieldstr a fieldstr, e.g "subject" or "s" for the subject field
|
|
||||||
*
|
|
||||||
* @return a vector with "exploded" values, with a code and a fullname. E.g. "s" might map
|
|
||||||
* to [<"S","subject">], while "recip" could map to [<"to", "T">, <"cc", "C">, <"bcc", "B">]
|
|
||||||
*/
|
|
||||||
struct FieldInfo {
|
|
||||||
const std::string field;
|
|
||||||
const std::string prefix;
|
|
||||||
bool supports_phrase;
|
|
||||||
Field::Id id;
|
|
||||||
};
|
|
||||||
using FieldInfoVec = std::vector<FieldInfo>;
|
|
||||||
struct Parser::Private {
|
|
||||||
Private(const Store& store, Parser::Flags flags) : store_{store}, flags_{flags} {}
|
|
||||||
|
|
||||||
std::vector<std::string> process_regex(const std::string& field,
|
|
||||||
const Regex& rx) const;
|
|
||||||
|
|
||||||
Mu::Tree term_1(Mu::Tokens& tokens, WarningVec& warnings) const;
|
|
||||||
Mu::Tree term_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const;
|
|
||||||
Mu::Tree factor_1(Mu::Tokens& tokens, WarningVec& warnings) const;
|
|
||||||
Mu::Tree factor_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const;
|
|
||||||
Mu::Tree unit(Mu::Tokens& tokens, WarningVec& warnings) const;
|
|
||||||
Mu::Tree data(Mu::Tokens& tokens, WarningVec& warnings) const;
|
|
||||||
Mu::Tree range(const FieldInfoVec& fields,
|
|
||||||
const std::string& lower,
|
|
||||||
const std::string& upper,
|
|
||||||
size_t pos,
|
|
||||||
WarningVec& warnings) const;
|
|
||||||
Mu::Tree regex(const FieldInfoVec& fields,
|
|
||||||
const std::string& v,
|
|
||||||
size_t pos,
|
|
||||||
WarningVec& warnings) const;
|
|
||||||
Mu::Tree value(const FieldInfoVec& fields,
|
|
||||||
const std::string& v,
|
|
||||||
size_t pos,
|
|
||||||
WarningVec& warnings) const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
const Store& store_;
|
|
||||||
const Parser::Flags flags_;
|
|
||||||
};
|
|
||||||
|
|
||||||
static std::string
|
|
||||||
process_value(const std::string& field, const std::string& value)
|
|
||||||
{
|
|
||||||
const auto id_opt{field_from_name(field)};
|
|
||||||
if (id_opt) {
|
|
||||||
#pragma GCC diagnostic push
|
|
||||||
#pragma GCC diagnostic ignored "-Wswitch-enum"
|
|
||||||
switch (id_opt->id) {
|
|
||||||
case Field::Id::Priority: {
|
|
||||||
if (!value.empty())
|
|
||||||
return std::string(1, value[0]);
|
|
||||||
} break;
|
|
||||||
case Field::Id::Flags:
|
|
||||||
if (const auto info{flag_info(value)}; info)
|
|
||||||
return std::string(1, info->shortcut_lower());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#pragma GCC diagnostic pop
|
|
||||||
}
|
|
||||||
|
|
||||||
return value; // XXX prio/flags, etc. alias
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
add_field(std::vector<FieldInfo>& fields, Field::Id field_id)
|
|
||||||
{
|
|
||||||
const auto field{field_from_id(field_id)};
|
|
||||||
if (!field.shortcut)
|
|
||||||
return; // can't be searched
|
|
||||||
|
|
||||||
fields.emplace_back(FieldInfo{std::string{field.name}, field.xapian_term(),
|
|
||||||
field.is_indexable_term(), field_id});
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::vector<FieldInfo>
|
|
||||||
process_field(const std::string& field_str, Parser::Flags flags)
|
|
||||||
{
|
|
||||||
std::vector<FieldInfo> fields;
|
|
||||||
if (any_of(flags & Parser::Flags::UnitTest)) {
|
|
||||||
add_field(fields, Field::Id::MessageId);
|
|
||||||
return fields;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (field_str == "contact" || field_str == "recip") { // multi fields
|
|
||||||
add_field(fields, Field::Id::To);
|
|
||||||
add_field(fields, Field::Id::Cc);
|
|
||||||
add_field(fields, Field::Id::Bcc);
|
|
||||||
if (field_str == "contact")
|
|
||||||
add_field(fields, Field::Id::From);
|
|
||||||
} else if (field_str.empty()) {
|
|
||||||
add_field(fields, Field::Id::To);
|
|
||||||
add_field(fields, Field::Id::Cc);
|
|
||||||
add_field(fields, Field::Id::Bcc);
|
|
||||||
add_field(fields, Field::Id::From);
|
|
||||||
add_field(fields, Field::Id::Subject);
|
|
||||||
add_field(fields, Field::Id::BodyText);
|
|
||||||
} else if (const auto field_opt{field_from_name(field_str)}; field_opt)
|
|
||||||
add_field(fields, field_opt->id);
|
|
||||||
|
|
||||||
return fields;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_range_field(const std::string& field_str)
|
|
||||||
{
|
|
||||||
if (const auto field_opt{field_from_name(field_str)}; !field_opt)
|
|
||||||
return false;
|
|
||||||
else
|
|
||||||
return field_opt->is_range();
|
|
||||||
}
|
|
||||||
|
|
||||||
struct MyRange {
|
|
||||||
std::string lower;
|
|
||||||
std::string upper;
|
|
||||||
};
|
|
||||||
|
|
||||||
static MyRange
|
|
||||||
process_range(const std::string& field_str,
|
|
||||||
const std::string& lower, const std::string& upper)
|
|
||||||
{
|
|
||||||
const auto field_opt{field_from_name(field_str)};
|
|
||||||
if (!field_opt)
|
|
||||||
return {lower, upper};
|
|
||||||
|
|
||||||
std::string l2 = lower;
|
|
||||||
std::string u2 = upper;
|
|
||||||
constexpr auto upper_limit = std::numeric_limits<int64_t>::max();
|
|
||||||
|
|
||||||
if (field_opt->id == Field::Id::Date || field_opt->id == Field::Id::Changed) {
|
|
||||||
l2 = to_lexnum(parse_date_time(lower, true).value_or(0));
|
|
||||||
u2 = to_lexnum(parse_date_time(upper, false).value_or(upper_limit));
|
|
||||||
} else if (field_opt->id == Field::Id::Size) {
|
|
||||||
l2 = to_lexnum(parse_size(lower, true).value_or(0));
|
|
||||||
u2 = to_lexnum(parse_size(upper, false).value_or(upper_limit));
|
|
||||||
}
|
|
||||||
|
|
||||||
return {l2, u2};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::string>
|
|
||||||
Parser::Private::process_regex(const std::string& field_str,
|
|
||||||
const Regex& rx) const
|
|
||||||
{
|
|
||||||
const auto field_opt{field_from_name(field_str)};
|
|
||||||
if (!field_opt)
|
|
||||||
return {};
|
|
||||||
|
|
||||||
const auto prefix{field_opt->xapian_term()};
|
|
||||||
std::vector<std::string> terms;
|
|
||||||
store_.for_each_term(field_opt->id, [&](auto&& str) {
|
|
||||||
auto val{str.c_str() + 1}; // strip off the Xapian prefix.
|
|
||||||
if (rx.matches(val))
|
|
||||||
terms.emplace_back(std::move(val));
|
|
||||||
return true;
|
|
||||||
});
|
|
||||||
|
|
||||||
return terms;
|
|
||||||
}
|
|
||||||
|
|
||||||
static Token
|
|
||||||
look_ahead(const Mu::Tokens& tokens)
|
|
||||||
{
|
|
||||||
return tokens.front();
|
|
||||||
}
|
|
||||||
|
|
||||||
static Mu::Tree
|
|
||||||
empty()
|
|
||||||
{
|
|
||||||
return {{Node::Type::Empty}};
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Build the (sub)tree for a plain value.
 *
 * For a single field this is one Value node; for several fields (a
 * 'multi-field' such as "recip:") it is an OR node with one Value child
 * per field.
 *
 * @param fields the field(s) to match; must not be empty
 * @param v the raw value; it is passed through utf8_flatten() first
 * @param pos position of the token (not used here)
 * @param warnings warnings vector (not used here)
 *
 * @return the tree for this value
 */
Mu::Tree
Parser::Private::value(const FieldInfoVec& fields,
		       const std::string& v,
		       size_t pos,
		       WarningVec& warnings) const
{
	auto flat = utf8_flatten(v);

	if (fields.empty())
		throw BUG("expected one or more fields");

	// the leaf node for one concrete field
	const auto leaf = [&](const auto& info) {
		return Tree({Node::Type::Value,
			     FieldValue{info.id, process_value(info.field, flat)}});
	};

	if (fields.size() == 1)
		return leaf(fields.front());

	Tree alternatives(Node{Node::Type::OpOr});
	for (const auto& info : fields)
		alternatives.add_child(leaf(info));

	return alternatives;
}
|
|
||||||
|
|
||||||
/**
 * Build the (sub)tree for a regular-expression value of the form /.../
 *
 * Every term of every field that matches the expression becomes a
 * ValueAtomic child of an OR node. Without any match the result is an
 * Empty tree. If anything in that process throws (e.g., the expression
 * does not compile), a warning is recorded and the input is handled as a
 * plain value instead.
 *
 * @param fields the field(s) to search
 * @param v the regexp including its delimiters (at least 2 characters)
 * @param pos position of the token, used for the warning
 * @param warnings receives the "invalid regexp" warning
 *
 * @return the tree for this regexp
 */
Mu::Tree
Parser::Private::regex(const FieldInfoVec& fields,
		       const std::string& v,
		       size_t pos,
		       WarningVec& warnings) const
{
	if (v.length() < 2)
		throw BUG("expected regexp, got '%s'", v.c_str());

	// drop the first and last character (the delimiters)
	const auto pattern = utf8_flatten(v.substr(1, v.length() - 2));

	try {
		Tree matches(Node{Node::Type::OpOr});

		const auto compiled = Regex::make(pattern, G_REGEX_OPTIMIZE);
		if (!compiled)
			throw compiled.error();

		for (const auto& info : fields)
			for (const auto& term : process_regex(info.field, *compiled))
				matches.add_child(Tree({Node::Type::ValueAtomic,
							FieldValue{info.id, term}}));

		if (!matches.children.empty())
			return matches;

		return empty();

	} catch (...) {
		// fallback: treat it as a normal value
		warnings.push_back({pos, "invalid regexp"});
		return value(fields, v, pos, warnings);
	}
}
|
|
||||||
|
|
||||||
/**
 * Build the (sub)tree for a range "lower..upper".
 *
 * Only the first field is considered. If that is not a range field, the
 * text "lower..upper" is handled as a plain value. If the processed lower
 * bound compares greater than the upper bound, the range is processed
 * again with the two inputs swapped.
 *
 * @param fields the field(s); must not be empty
 * @param lower lower bound as written in the query
 * @param upper upper bound as written in the query
 * @param pos position of the token
 * @param warnings receives warnings
 *
 * @return the tree for this range
 */
Mu::Tree
Parser::Private::range(const FieldInfoVec& fields,
		       const std::string& lower,
		       const std::string& upper,
		       size_t pos,
		       WarningVec& warnings) const
{
	if (fields.empty())
		throw BUG("expected field");

	const auto& info = fields.front();
	const auto& name = info.field;

	if (!is_range_field(name))
		return value(fields, lower + ".." + upper, pos, warnings);

	auto bounds = process_range(name, lower, upper);
	if (bounds.lower > bounds.upper) // reversed? try the other way around
		bounds = process_range(name, upper, lower);

	return Tree({Node::Type::Range,
		     FieldValue{info.id, bounds.lower, bounds.upper}});
}
|
|
||||||
|
|
||||||
/**
 * Consume one token and turn it into a value, regexp or range (sub)tree.
 *
 * The token text is split into "field:value" when it contains a colon
 * that is neither its first nor its last character; otherwise the whole
 * text is the value, with an empty field name.
 *
 * @param tokens the pending tokens; the first one is consumed. The caller
 * must ensure there is at least one.
 * @param warnings receives warnings (non-data token, invalid field)
 *
 * @return the tree for this token
 */
Mu::Tree
Parser::Private::data(Mu::Tokens& tokens, WarningVec& warnings) const
{
	const auto token = look_ahead(tokens);
	if (token.type != Token::Type::Data)
		warnings.push_back({token.pos, "expected: value"});

	tokens.pop_front();

	const auto& text  = token.str;
	const auto  colon = text.find(":");
	const bool has_field = !(colon == 0 || colon == std::string::npos ||
				 colon == text.length() - 1);

	const std::string field = has_field ? text.substr(0, colon) : std::string{};
	const std::string val   = has_field ? text.substr(colon + 1) : text;

	auto fields = process_field(field, flags_);
	if (fields.empty()) { // not a valid field...
		warnings.push_back({token.pos, mu_format("invalid field '{}'", field)});
		fields = process_field("", flags_);
		// fallback, treat the whole of foo:bar as a value
		return value(fields, field + ":" + val, token.pos, warnings);
	}

	// /.../ looks like a regexp
	const bool slashed = val.length() >= 2 && val.front() == '/' && val.back() == '/';
	if (slashed)
		return regex(fields, val, token.pos, warnings);

	// a..b looks like a range
	const auto dots = val.find("..");
	if (dots != std::string::npos)
		return range(fields, val.substr(0, dots), val.substr(dots + 2),
			     token.pos, warnings);

	// range field without a range - treat as field:val..val
	if (is_range_field(fields.front().field))
		return range(fields, val, val, token.pos, warnings);

	// if nothing else, it's a value.
	return value(fields, val, token.pos, warnings);
}
|
|
||||||
|
|
||||||
/**
 * Parse a unit: a negated unit, a parenthesized sub-expression or a
 * single data item.
 *
 * @param tokens the pending tokens; consumed as they are parsed
 * @param warnings receives warnings (missing unit, missing ')')
 *
 * @return the tree for the unit; an Empty tree when there are no tokens
 */
Mu::Tree
Parser::Private::unit(Mu::Tokens& tokens, WarningVec& warnings) const
{
	if (tokens.empty()) {
		warnings.push_back({0, "expected: unit"});
		return empty();
	}

	const auto first = look_ahead(tokens);

	if (first.type == Token::Type::Not) { // NOT <unit>
		tokens.pop_front();
		Tree negation(Node{Node::Type::OpNot});
		negation.add_child(unit(tokens, warnings));
		return negation;
	}

	if (first.type != Token::Type::Open) // plain data
		return data(tokens, warnings);

	// ( <sub-expression> )
	tokens.pop_front();
	auto sub = term_1(tokens, warnings);

	if (tokens.empty()) {
		warnings.push_back({first.pos, "expected: ')'"});
		return sub;
	}

	const auto closer = look_ahead(tokens);
	if (closer.type == Token::Type::Close)
		tokens.pop_front();
	else // leave the unexpected token where it is
		warnings.push_back({closer.pos,
				    std::string("expected: ')' but got ") + closer.str});

	return sub;
}
|
|
||||||
|
|
||||||
/**
 * Parse the optional right-hand side of an AND.
 *
 * An explicit AND token is consumed; an Open, Data or Not token is left
 * in place and counts as an implicit AND. In both cases @p op is set to
 * OpAnd and the remainder is parsed with factor_1(). For any other token
 * (or no tokens at all) an Empty tree is returned and @p op is untouched.
 *
 * @param tokens the pending tokens
 * @param op receives the operator that was found
 * @param warnings receives warnings
 *
 * @return the right-hand side tree, or an Empty tree
 */
Mu::Tree
Parser::Private::factor_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const
{
	if (tokens.empty())
		return empty();

	const auto type = look_ahead(tokens).type;

	const bool explicit_and = type == Token::Type::And;
	const bool implicit_and = type == Token::Type::Open ||
				  type == Token::Type::Data ||
				  type == Token::Type::Not;

	if (!explicit_and && !implicit_and)
		return empty();

	if (explicit_and)
		tokens.pop_front();

	op = Node::Type::OpAnd;

	return factor_1(tokens, warnings);
}
|
|
||||||
|
|
||||||
/**
 * Parse a unit, optionally followed by further (AND-ed) units.
 *
 * @param tokens the pending tokens
 * @param warnings receives warnings
 *
 * @return the unit itself when nothing follows it; otherwise a node with
 * the operator reported by factor_2() and both sides as its children
 */
Mu::Tree
Parser::Private::factor_1(Mu::Tokens& tokens, WarningVec& warnings) const
{
	Node::Type op{Node::Type::Invalid};

	auto lhs = unit(tokens, warnings);
	auto rhs = factor_2(tokens, op, warnings);

	if (!rhs.empty()) {
		Tree combined(Node{op});
		combined.add_child(std::move(lhs));
		combined.add_child(std::move(rhs));
		return combined;
	}

	return lhs;
}
|
|
||||||
|
|
||||||
/**
 * Parse the optional right-hand side of an OR / XOR.
 *
 * When the next token is Or or Xor, @p op is set accordingly, the token
 * is consumed and the remainder is parsed with term_1(). For any other
 * token an Empty tree is returned; unless that token is Close, a warning
 * is recorded as well.
 *
 * @param tokens the pending tokens
 * @param op receives the operator that was found
 * @param warnings receives warnings
 *
 * @return the right-hand side tree, or an Empty tree
 */
Mu::Tree
Parser::Private::term_2(Mu::Tokens& tokens, Node::Type& op, WarningVec& warnings) const
{
	if (tokens.empty())
		return empty();

	const auto next = look_ahead(tokens);

	if (next.type == Token::Type::Or)
		op = Node::Type::OpOr;
	else if (next.type == Token::Type::Xor)
		op = Node::Type::OpXor;
	else {
		if (next.type != Token::Type::Close)
			warnings.push_back({next.pos, "expected OR|XOR"});
		return empty();
	}

	tokens.pop_front();

	return term_1(tokens, warnings);
}
|
|
||||||
|
|
||||||
/**
 * Parse a factor, optionally followed by OR / XOR and another term.
 *
 * @param tokens the pending tokens
 * @param warnings receives warnings
 *
 * @return the factor itself when nothing follows it; otherwise a node with
 * the operator reported by term_2() and both sides as its children
 */
Mu::Tree
Parser::Private::term_1(Mu::Tokens& tokens, WarningVec& warnings) const
{
	Node::Type op{Node::Type::Invalid};

	auto lhs = factor_1(tokens, warnings);
	auto rhs = term_2(tokens, op, warnings);

	if (rhs.empty())
		return lhs;

	Tree combined(Node{op});
	combined.add_child(std::move(lhs));
	combined.add_child(std::move(rhs));

	return combined;
}
|
|
||||||
|
|
||||||
Mu::Parser::Parser(const Store& store, Parser::Flags flags) :
|
|
||||||
priv_{std::make_unique<Private>(store, flags)}
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
Mu::Parser::~Parser() = default;
|
|
||||||
|
|
||||||
/**
 * Parse a query expression into a tree.
 *
 * A std::runtime_error raised while tokenizing or parsing is not
 * propagated: its message is written to std::cerr and an Empty tree is
 * returned.
 *
 * @param expr the query expression
 * @param warnings receives parser warnings
 *
 * @return the parse tree; an Empty tree when there are no tokens
 */
Mu::Tree
Mu::Parser::parse(const std::string& expr, WarningVec& warnings) const
{
	try {
		auto pending = tokenize(expr);
		if (!pending.empty())
			return priv_->term_1(pending, warnings);

		return empty();

	} catch (const std::runtime_error& err) {
		std::cerr << err.what() << std::endl;
		return empty();
	}
}
|
|
106
lib/mu-parser.hh
106
lib/mu-parser.hh
|
@ -1,106 +0,0 @@
|
||||||
/*
|
|
||||||
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
|
||||||
**
|
|
||||||
** This library is free software; you can redistribute it and/or
|
|
||||||
** modify it under the terms of the GNU Lesser General Public License
|
|
||||||
** as published by the Free Software Foundation; either version 2.1
|
|
||||||
** of the License, or (at your option) any later version.
|
|
||||||
**
|
|
||||||
** This library is distributed in the hope that it will be useful,
|
|
||||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
** Lesser General Public License for more details.
|
|
||||||
**
|
|
||||||
** You should have received a copy of the GNU Lesser General Public
|
|
||||||
** License along with this library; if not, write to the Free
|
|
||||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
|
||||||
** 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef __PARSER_HH__
|
|
||||||
#define __PARSER_HH__
|
|
||||||
|
|
||||||
#include "utils/mu-utils.hh"
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <memory>
|
|
||||||
|
|
||||||
#include <mu-tree.hh>
|
|
||||||
#include <mu-store.hh>
|
|
||||||
|
|
||||||
// A simple recursive-descent parser for queries. Follows the Xapian syntax,
|
|
||||||
// but better handles non-alphanum; also implements regexp
|
|
||||||
|
|
||||||
namespace Mu {
|
|
||||||
|
|
||||||
/**
 * A parser warning: a message together with the position in the query
 * string it refers to.
 *
 * Both members are non-const, so that warnings can be assigned and a
 * WarningVec supports operations that need assignable elements (such as
 * erase).
 */
struct Warning {
	size_t      pos{}; /**< pos in string */
	std::string msg;   /**< warning message */

	/**
	 * operator==
	 *
	 * @param rhs right-hand side
	 *
	 * @return true if rhs has the same position and message; false otherwise
	 */
	bool operator==(const Warning& rhs) const { return pos == rhs.pos && msg == rhs.msg; }
};
using WarningVec = std::vector<Warning>;

/**
 * operator<<; writes the warning as "<pos>:<msg>"
 *
 * @param os an output stream
 * @param w a warning
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<<(std::ostream& os, const Warning& w)
{
	os << w.pos << ":" << w.msg;
	return os;
}
|
|
||||||
|
|
||||||
class Parser {
|
|
||||||
public:
|
|
||||||
enum struct Flags { None = 0, UnitTest = 1 << 0 };
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Construct a query parser object
|
|
||||||
*
|
|
||||||
* @param store a store object ptr, or none
|
|
||||||
*/
|
|
||||||
Parser(const Store& store, Flags = Flags::None);
|
|
||||||
/**
|
|
||||||
* DTOR
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
~Parser();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Parse a query string
|
|
||||||
*
|
|
||||||
* @param query a query string
|
|
||||||
* @param warnings vec to receive warnings
|
|
||||||
*
|
|
||||||
* @return a parse-tree
|
|
||||||
*/
|
|
||||||
|
|
||||||
Tree parse(const std::string& query, WarningVec& warnings) const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
struct Private;
|
|
||||||
std::unique_ptr<Private> priv_;
|
|
||||||
};
|
|
||||||
|
|
||||||
MU_ENABLE_BITOPS(Parser::Flags);
|
|
||||||
|
|
||||||
} // namespace Mu
|
|
||||||
|
|
||||||
#endif /* __PARSER_HH__ */
|
|
|
@ -0,0 +1,428 @@
|
||||||
|
/*
|
||||||
|
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
|
**
|
||||||
|
** This program is free software; you can redistribute it and/or modify it
|
||||||
|
** under the terms of the GNU General Public License as published by the
|
||||||
|
** Free Software Foundation; either version 3, or (at your option) any
|
||||||
|
** later version.
|
||||||
|
**
|
||||||
|
** This program is distributed in the hope that it will be useful,
|
||||||
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
** GNU General Public License for more details.
|
||||||
|
**
|
||||||
|
** You should have received a copy of the GNU General Public License
|
||||||
|
** along with this program; if not, write to the Free Software Foundation,
|
||||||
|
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||||
|
**
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "mu-query-parser.hh"
|
||||||
|
|
||||||
|
#include <string_view>
|
||||||
|
#include <variant>
|
||||||
|
#include <type_traits>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "utils/mu-utils.hh"
|
||||||
|
#include "utils/mu-sexp.hh"
|
||||||
|
#include "utils/mu-option.hh"
|
||||||
|
#include <glib.h>
|
||||||
|
#include "utils/mu-utils-file.hh"
|
||||||
|
|
||||||
|
using namespace Mu;
|
||||||
|
|
||||||
|
// Sexp extensions...
|
||||||
|
/* Insert e as the first element of s's list; returns s. */
static Sexp&
prepend(Sexp& s, Sexp&& e)
{
	auto& items = s.list();
	items.insert(items.begin(), std::move(e));

	return s;
}
|
||||||
|
|
||||||
|
/* Get a reference to the second element of s, or Nothing if s is not a
 * list or holds fewer than two elements. */
static Option<Sexp&>
second(Sexp& s)
{
	if (!s.listp() || s.empty())
		return Nothing;

	if (s.cbegin() + 1 == s.cend()) // only a single element
		return Nothing;

	return *(s.begin() + 1);
}
|
||||||
|
|
||||||
|
|
||||||
|
static bool
|
||||||
|
looks_like_matcher(const Sexp& sexp)
|
||||||
|
{
|
||||||
|
// all the "terminal values" (from the Mu parser's pov)
|
||||||
|
const std::array<Sexp::Symbol, 5> value_syms = {
|
||||||
|
placeholder_sym, phrase_sym, regex_sym, range_sym, wildcard_sym
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!sexp.listp() || sexp.empty() || !sexp.front().symbolp())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const auto symbol{sexp.front().symbol()};
|
||||||
|
if (seq_some(value_syms, [&](auto &&sym) { return symbol == sym; }))
|
||||||
|
return true;
|
||||||
|
else if (!!field_from_name(symbol.name) || field_is_combi(symbol.name))
|
||||||
|
return true;
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* State shared between the parsing functions. */
struct ParseContext {
	bool                     expand{false}; /* expand meta-fields into their fields? */
	std::vector<std::string> warnings{};    /* warnings collected while parsing */
};
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Grammar
|
||||||
|
*
|
||||||
|
* query -> factor { (<OR> | <XOR>) factor }
|
||||||
|
* factor -> unit { [<AND>] unit }
|
||||||
|
* unit -> matcher | <NOT> query | <(> query <)>
|
||||||
|
* matcher
|
||||||
|
*/
|
||||||
|
|
||||||
|
static Sexp query(Sexp& tokens, ParseContext& ctx);
|
||||||
|
|
||||||
|
/* Consume the first token and return it as a matcher.
 *
 * A token that does not look like a matcher is wrapped as a placeholder
 * matcher using its symbol name. With ctx.expand set, a matcher whose
 * name maps to a non-empty set of fields becomes
 * (or (field1 value) (field2 value) ...).
 *
 * Returns an empty Sexp when there are no tokens. */
static Sexp
matcher(Sexp& tokens, ParseContext& ctx)
{
	if (tokens.empty())
		return {};

	auto tok{*tokens.head()};
	tokens.pop_front();

	/* special case: if we find some non-matcher type here, we need to
	 * second-guess the tokenizer */
	if (!looks_like_matcher(tok))
		tok = Sexp{placeholder_sym, tok.symbol().name};

	if (!ctx.expand) /* no expansion of meta-fields requested */
		return tok;

	const auto head_sym{tok.front().symbol()};
	const auto fields = fields_from_name(head_sym == placeholder_sym ? "" : head_sym.name);
	if (fields.empty())
		return tok;

	Sexp expanded{};
	expanded.add(or_sym);
	for (auto&& field: fields)
		expanded.add(Sexp{Sexp::Symbol{field.name}, Sexp{*second(tok)}});

	return expanded;
}
|
||||||
|
|
||||||
|
/* Parse a unit: <NOT> query, <(> query <)>, or a matcher.
 *
 * A "not" with nothing after it is interpreted as a placeholder matcher
 * for the word itself; a negated negation is reduced to its operand. A
 * missing <)> is accepted without complaint. */
static Sexp
unit(Sexp& tokens, ParseContext& ctx)
{
	const bool negation = tokens.head_symbolp(not_sym);
	const bool group    = !negation && tokens.head_symbolp(open_sym);

	if (!negation && !group)
		return matcher(tokens, ctx);

	tokens.pop_front(); /* the <NOT> or <(> */
	Sexp sub{query(tokens, ctx)};

	if (group) {
		/* take the <)> if it is there; otherwise carry on regardless */
		if (tokens.head_symbolp(close_sym))
			tokens.pop_front();
		return sub;
	}

	/* special case: interpret "not" as a matcher instead; */
	if (sub.empty())
		return Sexp{placeholder_sym, not_sym.name};

	/* we try to optimize: double negations are removed */
	if (sub.head_symbolp(not_sym))
		return *second(sub);

	return Sexp(not_sym, std::move(sub));
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Parse a factor: unit { [<AND>] unit }
 *
 * Units that follow each other, with or without an explicit <AND>, are
 * collected in a single flat (and ...) list. A lone unit is returned
 * as-is. */
static Sexp
factor(Sexp& tokens, ParseContext& ctx)
{
	Sexp un = unit(tokens, ctx);

	/* query 'a b' is to be interpreted as 'a AND b';
	 *
	 * we need an implicit AND if the head symbol starts a new unit: the
	 * start of a sub-expression, a <NOT>, or a matcher (value). Without
	 * the <NOT> case, 'a not b' would stop after 'a' and the remaining
	 * tokens would be dropped. */
	auto implicit_and = [&]() {
		if (tokens.head_symbolp(open_sym) || tokens.head_symbolp(not_sym))
			return true;
		else if (auto&& head{tokens.head()}; head)
			return looks_like_matcher(*head);
		else
			return false;
	};

	Sexp uns;
	while (true) {

		if (tokens.head_symbolp(and_sym))
			tokens.pop_front();
		else if (!implicit_and())
			break;

		/* an empty unit means there is nothing left to parse */
		if (auto&& un2 = unit(tokens, ctx); !un2.empty())
			uns.add(std::move(un2));
		else
			break;
	}

	if (!uns.empty()) {
		un = Sexp{and_sym, std::move(un)};
		un.add_list(std::move(uns));
	}

	return un;
}
|
||||||
|
|
||||||
|
/* Parse a query: factor { (<OR> | <XOR>) factor }
 *
 * The factors after <OR> and after <XOR> are gathered in two separate
 * lists, so the result is flat rather than nested (Xapian likes flat
 * trees):
 *   only <OR>s:  (or first or-factors...)
 *   only <XOR>s: (xor first xor-factors...)
 *   both:        (or first or-factors... (xor xor-factors...))
 * Without any operator the first factor is returned unchanged. */
static Sexp
query(Sexp& tokens, ParseContext& ctx)
{
	Sexp first = factor(tokens, ctx);
	Sexp ors, xors;

	for (;;) {
		/* which list does the next factor go to? */
		Sexp *bucket{};
		if (tokens.head_symbolp(or_sym))
			bucket = &ors;
		else if (tokens.head_symbolp(xor_sym))
			bucket = &xors;

		if (!bucket)
			break;

		tokens.pop_front();
		bucket->add(factor(tokens, ctx));
	}

	const bool have_ors  = !ors.empty();
	const bool have_xors = !xors.empty();

	if (have_ors) {
		first = Sexp{or_sym, std::move(first)};
		first.add_list(std::move(ors));
		if (have_xors) {
			prepend(xors, xor_sym);
			first.add(std::move(xors));
		}
	} else if (have_xors) {
		first = Sexp{xor_sym, std::move(first)};
		first.add_list(std::move(xors));
	}

	return first;
}
|
||||||
|
|
||||||
|
/* Tokenize expr with process_query() and parse the tokens into a tree;
 * throws std::runtime_error if the tokens are not a list. */
Sexp
Mu::parse_query(const std::string& expr, bool expand)
{
	ParseContext context{expand, {}};

	auto items = process_query(expr);
	if (!items.listp())
		throw std::runtime_error("tokens must be a list-sexp");

	return query(items, context);
}
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(BUILD_PARSE_QUERY)||defined(BUILD_PARSE_QUERY_EXPAND)
/* Small command-line tool: joins its arguments into one query, parses it
 * and prints the resulting s-expression. Meta-fields are expanded only
 * when built with BUILD_PARSE_QUERY_EXPAND. */
int
main (int argc, char *argv[])
{
#ifdef BUILD_PARSE_QUERY_EXPAND
	constexpr bool expand_meta = true;
#else
	constexpr bool expand_meta = false;
#endif

	if (argc < 2) {
		mu_printerrln("expected: {} <query>", argv[0]);
		return 1;
	}

	std::string expr;
	for (int n = 1; n != argc; ++n) {
		expr.append(argv[n]);
		expr.append(" ");
	}

	auto sexp = parse_query(expr, expand_meta);
	mu_println("{}", sexp.to_string());

	return 0;
}
#endif // BUILD_PARSE_QUERY || BUILD_PARSE_QUERY_EXPAND
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#if BUILD_TESTS
|
||||||
|
/*
|
||||||
|
* Tests.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "utils/mu-test-utils.hh"
|
||||||
|
|
||||||
|
using TestCase = std::pair<std::string, std::string>;
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_parser_basic()
|
||||||
|
{
|
||||||
|
std::vector<TestCase> cases = {
|
||||||
|
// single term
|
||||||
|
TestCase{R"(a)", R"((_ "a"))"},
|
||||||
|
// a and b
|
||||||
|
TestCase{R"(a and b)", R"((and (_ "a") (_ "b")))"},
|
||||||
|
// a and b and c
|
||||||
|
TestCase{R"(a and b and c)", R"((and (_ "a") (_ "b") (_ "c")))"},
|
||||||
|
// a or b
|
||||||
|
TestCase{R"(a or b)", R"((or (_ "a") (_ "b")))"},
|
||||||
|
// a or b and c
|
||||||
|
TestCase{R"(a or b and c)", R"((or (_ "a") (and (_ "b") (_ "c"))))"},
|
||||||
|
// a and b or c
|
||||||
|
TestCase{R"(a and b or c)", R"((or (and (_ "a") (_ "b")) (_ "c")))"},
|
||||||
|
// not a
|
||||||
|
TestCase{R"(not a)", R"((not (_ "a")))"},
|
||||||
|
// lone not
|
||||||
|
TestCase{R"(not)", R"((_ "not"))"},
|
||||||
|
// a and (b or c)
|
||||||
|
TestCase{R"(a and (b or c))", R"((and (_ "a") (or (_ "b") (_ "c"))))"},
|
||||||
|
// TODO: add more...
|
||||||
|
};
|
||||||
|
|
||||||
|
for (auto&& test: cases) {
|
||||||
|
auto&& sexp{parse_query(test.first)};
|
||||||
|
//mu_message ("'{}' <=> '{}'", sexp.to_string(), test.second);
|
||||||
|
assert_equal(sexp.to_string(), test.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_parser_recover()
|
||||||
|
{
|
||||||
|
std::vector<TestCase> cases = {
|
||||||
|
// implicit AND
|
||||||
|
TestCase{R"(a b)", R"((and (_ "a") (_ "b")))"},
|
||||||
|
// a or or (second to be used as value)
|
||||||
|
TestCase{R"(a or and)", R"((or (_ "a") (_ "and")))"},
|
||||||
|
// missing end )
|
||||||
|
TestCase{R"(a and ()", R"((_ "a"))"},
|
||||||
|
// missing end )
|
||||||
|
TestCase{R"(a and (b)", R"((and (_ "a") (_ "b")))"},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (auto&& test: cases) {
|
||||||
|
auto&& sexp{parse_query(test.first)};
|
||||||
|
assert_equal(sexp.to_string(), test.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_parser_fields()
|
||||||
|
{
|
||||||
|
std::vector<TestCase> cases = {
|
||||||
|
// simple field
|
||||||
|
TestCase{R"(s:hello)", R"((subject "hello"))"},
|
||||||
|
// field, wildcard, regexp
|
||||||
|
TestCase{R"(subject:a* recip:/b/)",
|
||||||
|
R"((and (subject (wildcard "a")) (recip (regex "b"))))"},
|
||||||
|
TestCase{R"(from:hello or subject:world)",
|
||||||
|
R"((or (from "hello") (subject "world")))"},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (auto&& test: cases) {
|
||||||
|
auto&& sexp{parse_query(test.first)};
|
||||||
|
assert_equal(sexp.to_string(), test.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_parser_expand()
|
||||||
|
{
|
||||||
|
std::vector<TestCase> cases = {
|
||||||
|
// simple field
|
||||||
|
TestCase{R"(recip:a)", R"((or (to "a") (cc "a") (bcc "a")))"},
|
||||||
|
// field, wildcard, regexp
|
||||||
|
TestCase{R"(a*)",
|
||||||
|
R"((or (to (wildcard "a")) (cc (wildcard "a")) (bcc (wildcard "a")) (from (wildcard "a")) (subject (wildcard "a")) (body (wildcard "a")) (embed (wildcard "a"))))"},
|
||||||
|
TestCase{R"(a xor contact:b)",
|
||||||
|
R"((xor (or (to "a") (cc "a") (bcc "a") (from "a") (subject "a") (body "a") (embed "a")) (or (to "b") (cc "b") (bcc "b") (from "b"))))"}
|
||||||
|
};
|
||||||
|
|
||||||
|
for (auto&& test: cases) {
|
||||||
|
auto&& sexp{parse_query(test.first, true/*expand*/)};
|
||||||
|
assert_equal(sexp.to_string(), test.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_parser_range()
|
||||||
|
{
|
||||||
|
std::vector<TestCase> cases = {
|
||||||
|
TestCase{R"(size:1)", R"((size (range "1" "1")))"},
|
||||||
|
TestCase{R"(size:2..)", R"((size (range "2" "")))"},
|
||||||
|
TestCase{R"(size:..1k)", R"((size (range "" "1024")))"},
|
||||||
|
TestCase{R"(size:..)", R"((size (range "" "")))"},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (auto&& test: cases) {
|
||||||
|
auto&& sexp{parse_query(test.first, true/*expand*/)};
|
||||||
|
assert_equal(sexp.to_string(), test.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_parser_optimize()
|
||||||
|
{
|
||||||
|
std::vector<TestCase> cases = {
|
||||||
|
TestCase{R"(not a)", R"((not (_ "a")))"},
|
||||||
|
TestCase{R"(not not a)", R"((_ "a"))"},
|
||||||
|
TestCase{R"(not not not a)", R"((not (_ "a")))"},
|
||||||
|
TestCase{R"(not not not not a)", R"((_ "a"))"},
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
for (auto&& test: cases) {
|
||||||
|
auto&& sexp{parse_query(test.first)};
|
||||||
|
assert_equal(sexp.to_string(), test.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
mu_test_init(&argc, &argv);
|
||||||
|
|
||||||
|
g_test_add_func("/query-parser/basic", test_parser_basic);
|
||||||
|
g_test_add_func("/query-parser/recover", test_parser_recover);
|
||||||
|
g_test_add_func("/query-parser/fields", test_parser_fields);
|
||||||
|
g_test_add_func("/query-parser/range", test_parser_range);
|
||||||
|
g_test_add_func("/query-parser/expand", test_parser_expand);
|
||||||
|
g_test_add_func("/query-parser/optimize", test_parser_optimize);
|
||||||
|
|
||||||
|
return g_test_run();
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /*BUILD_TESTS*/
|
|
@ -0,0 +1,116 @@
|
||||||
|
/*
|
||||||
|
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
|
**
|
||||||
|
** This program is free software; you can redistribute it and/or modify it
|
||||||
|
** under the terms of the GNU General Public License as published by the
|
||||||
|
** Free Software Foundation; either version 3, or (at your option) any
|
||||||
|
** later version.
|
||||||
|
**
|
||||||
|
** This program is distributed in the hope that it will be useful,
|
||||||
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
** GNU General Public License for more details.
|
||||||
|
**
|
||||||
|
** You should have received a copy of the GNU General Public License
|
||||||
|
** along with this program; if not, write to the Free Software Foundation,
|
||||||
|
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||||
|
**
|
||||||
|
*/
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <array>
|
||||||
|
|
||||||
|
#include <xapian.h>
|
||||||
|
|
||||||
|
#include "utils/mu-sexp.hh"
|
||||||
|
#include "utils/mu-result.hh"
|
||||||
|
#include "mu-store.hh"
|
||||||
|
|
||||||
|
namespace Mu {
|
||||||
|
/*
|
||||||
|
* Some useful symbol-sexps
|
||||||
|
*/
|
||||||
|
static inline const auto placeholder_sym = "_"_sym;
|
||||||
|
static inline const auto phrase_sym = "phrase"_sym;
|
||||||
|
static inline const auto regex_sym = "regex"_sym;
|
||||||
|
static inline const auto range_sym = "range"_sym;
|
||||||
|
static inline const auto wildcard_sym = "wildcard"_sym;
|
||||||
|
|
||||||
|
static inline const auto open_sym = "("_sym;
|
||||||
|
static inline const auto close_sym = ")"_sym;
|
||||||
|
|
||||||
|
static inline const auto and_sym = "and"_sym;
|
||||||
|
static inline const auto or_sym = "or"_sym;
|
||||||
|
static inline const auto xor_sym = "xor"_sym;
|
||||||
|
static inline const auto not_sym = "not"_sym;
|
||||||
|
static inline const auto and_not_sym = "and-not"_sym;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We take a query, then parse it into a human-readable s-expression and then
|
||||||
|
* turn that s-expression into a Xapian query
|
||||||
|
*
|
||||||
|
* some query:
|
||||||
|
* "from:hello or subject:world"
|
||||||
|
*
|
||||||
|
* 1. tokenize-query
|
||||||
|
* => ((from "hello") or (subject "world"))
|
||||||
|
*
|
||||||
|
* 2. parse-query
|
||||||
|
* => (or (from "hello") (subject "world"))
|
||||||
|
*
|
||||||
|
* 3. xapian-query
|
||||||
|
* => Query((Fhello OR Sworld))
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyze the query expression and express it as a Sexp-list with the sequence
|
||||||
|
* of elements.
|
||||||
|
*
|
||||||
|
* @param expr a search expression
|
||||||
|
*
|
||||||
|
* @return Sexp with the sequence of elements
|
||||||
|
*/
|
||||||
|
Sexp process_query(const std::string& expr);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse the query expression and create a parse-tree expressed as an Sexp
|
||||||
|
* object (tree).
|
||||||
|
*
|
||||||
|
* Internally, this processes the stream into element (see process_query()) and
|
||||||
|
* processes the tokens into a Sexp. This sexp is meant to be human-readable.
|
||||||
|
*
|
||||||
|
* @param expr a search expression
|
||||||
|
* @param expand whether to expand meta-fields (such as '_', 'recip', 'contacts')
|
||||||
|
*
|
||||||
|
* @return Sexp with the parse tree
|
||||||
|
*/
|
||||||
|
Sexp parse_query(const std::string& expr, bool expand=false);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Make a Xapian Query for the given string expression.
|
||||||
|
*
|
||||||
|
* This uses parse_query() and turns the S-expression into a Xapian::Query.
|
||||||
|
* Unlike mere parsing, this uses the information in the store to resolve
|
||||||
|
* wildcard / regex queries.
|
||||||
|
*
|
||||||
|
* @param store the message store
|
||||||
|
* @param expr a string expression
|
||||||
|
* @param flavor type of parser to use
|
||||||
|
*
|
||||||
|
* @return a Xapian query result or an error.
|
||||||
|
*/
|
||||||
|
enum struct ParserFlags {
|
||||||
|
None = 0 << 0,
|
||||||
|
SupportNgrams = 1 << 0, /**< Support Xapian's Ngrams for CJK etc. handling */
|
||||||
|
XapianParser = 1 << 1, /**< For testing only, use Xapian's
|
||||||
|
* built-in QueryParser; this is not
|
||||||
|
* fully compatible with mu, only useful
|
||||||
|
* for debugging. */
|
||||||
|
};
|
||||||
|
Result<Xapian::Query> make_xapian_query(const Store& store, const std::string& expr,
|
||||||
|
ParserFlags flag=ParserFlags::None) noexcept;
|
||||||
|
|
||||||
|
MU_ENABLE_BITOPS(ParserFlags);
|
||||||
|
} // namespace Mu
|
|
@ -0,0 +1,548 @@
|
||||||
|
/*
|
||||||
|
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
|
**
|
||||||
|
** This program is free software; you can redistribute it and/or modify it
|
||||||
|
** under the terms of the GNU General Public License as published by the
|
||||||
|
** Free Software Foundation; either version 3, or (at your option) any
|
||||||
|
** later version.
|
||||||
|
**
|
||||||
|
** This program is distributed in the hope that it will be useful,
|
||||||
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
** GNU General Public License for more details.
|
||||||
|
**
|
||||||
|
** You should have received a copy of the GNU General Public License
|
||||||
|
** along with this program; if not, write to the Free Software Foundation,
|
||||||
|
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||||
|
**
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "mu-query-parser.hh"
|
||||||
|
|
||||||
|
#include <string_view>
|
||||||
|
#include <variant>
|
||||||
|
#include <type_traits>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "utils/mu-option.hh"
|
||||||
|
#include <glib.h>
|
||||||
|
#include "utils/mu-utils-file.hh"
|
||||||
|
|
||||||
|
using namespace Mu;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An 'Element' here is a rather rich version of what is traditionally
|
||||||
|
* considered a (lexical) token.
|
||||||
|
*
|
||||||
|
* We try to determine as much as possible during the analysis phase; which is
|
||||||
|
* quite a bit (given the fairly simple query language), and the parsing phase
|
||||||
|
* only has to deal with the putting these elements in a tree.
|
||||||
|
*
|
||||||
|
* During analysis:
|
||||||
|
* 1) separate the query into a sequence strings
|
||||||
|
* 2) for each of these strings
|
||||||
|
* - Does it look like an Op? ('or', 'and' etc.) --> Op
|
||||||
|
* - Otherwise: treat as a Basic field ([field]:value)
|
||||||
|
* - Whitespace in value? -> promote to Phrase
|
||||||
|
* - otherwise:
|
||||||
|
* - Is value a regex (in /<regex>/) -> promote to Regex
|
||||||
|
* - Is value a wildcard (ends in '*') -> promote to Wildcard
|
||||||
|
* - is value a range (a..b) -> promote to Range
|
||||||
|
*
|
||||||
|
* After analysis, we have the sequence of element as a Sexp, which can then be
|
||||||
|
* fed to the parser. We attempt to make the Sexp as human-readable as possible.
|
||||||
|
*/
|
||||||
|
struct Element {
|
||||||
|
enum struct Bracket { Open, Close} ;
|
||||||
|
enum struct Op { And, Or, Xor, Not, AndNot };
|
||||||
|
|
||||||
|
template<typename ValueType>
|
||||||
|
struct FieldValue {
|
||||||
|
FieldValue(const ValueType& v): field{}, value{v}{}
|
||||||
|
|
||||||
|
template<typename StringType>
|
||||||
|
FieldValue(const StringType& fname, const ValueType& v):
|
||||||
|
field{std::string{fname}}, value{v}{}
|
||||||
|
template<typename StringType>
|
||||||
|
FieldValue(const Option<StringType>& fname, const ValueType& v) {
|
||||||
|
if (fname)
|
||||||
|
field = std::string{*fname};
|
||||||
|
value = v;
|
||||||
|
}
|
||||||
|
|
||||||
|
Option<std::string> field{};
|
||||||
|
ValueType value{};
|
||||||
|
};
|
||||||
|
struct Basic: public FieldValue<std::string> {using FieldValue::FieldValue;};
|
||||||
|
struct Phrase: public FieldValue<std::string> {using FieldValue::FieldValue;};
|
||||||
|
struct Regex: public FieldValue<std::string> {using FieldValue::FieldValue;};
|
||||||
|
struct Wildcard: public FieldValue<std::string> {using FieldValue::FieldValue;};
|
||||||
|
struct Range: public FieldValue<std::pair<std::string, std::string>> {
|
||||||
|
using FieldValue::FieldValue; };
|
||||||
|
|
||||||
|
using ValueType = std::variant<
|
||||||
|
/* */
|
||||||
|
Bracket,
|
||||||
|
/* op */
|
||||||
|
Op,
|
||||||
|
/* string values */
|
||||||
|
std::string,
|
||||||
|
/* value types */
|
||||||
|
Basic,
|
||||||
|
Phrase,
|
||||||
|
Regex,
|
||||||
|
Wildcard,
|
||||||
|
Range
|
||||||
|
>;
|
||||||
|
|
||||||
|
// helper
|
||||||
|
template <typename T, typename U>
|
||||||
|
struct decay_equiv:
|
||||||
|
std::is_same<typename std::decay<T>::type, U>::type {};
|
||||||
|
|
||||||
|
Element(Bracket b): value{b} {}
|
||||||
|
Element(Op op): value{op} {}
|
||||||
|
|
||||||
|
template<typename T,
|
||||||
|
typename std::enable_if<std::is_base_of<class FieldValue<T>, T>::value>::type = 0>
|
||||||
|
Element(const std::string& field, const T& val): value{T{field, val}} {}
|
||||||
|
|
||||||
|
Element(const std::string& val): value{val} {}
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
Option<T&> get_opt() {
|
||||||
|
if (std::holds_alternative<T>(value))
|
||||||
|
return std::get<T>(value);
|
||||||
|
else
|
||||||
|
return Nothing;
|
||||||
|
}
|
||||||
|
|
||||||
|
Sexp sexp() const {
|
||||||
|
return std::visit([](auto&& arg)->Sexp {
|
||||||
|
|
||||||
|
auto field_sym = [](const Option<std::string>& field) {
|
||||||
|
return field ? Sexp::Symbol{*field} : placeholder_sym;
|
||||||
|
};
|
||||||
|
|
||||||
|
using T = std::decay_t<decltype(arg)>;
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, Bracket>) {
|
||||||
|
switch(arg) {
|
||||||
|
case Bracket::Open:
|
||||||
|
return open_sym;
|
||||||
|
case Bracket::Close:
|
||||||
|
return close_sym;
|
||||||
|
default:
|
||||||
|
throw std::logic_error("invalid bracket type");
|
||||||
|
}
|
||||||
|
} else if constexpr (std::is_same_v<T, Op>) {
|
||||||
|
switch(arg) {
|
||||||
|
case Op::And:
|
||||||
|
return and_sym;
|
||||||
|
case Op::Or:
|
||||||
|
return or_sym;
|
||||||
|
case Op::Xor:
|
||||||
|
return xor_sym;
|
||||||
|
case Op::Not:
|
||||||
|
return not_sym;
|
||||||
|
case Op::AndNot:
|
||||||
|
return and_not_sym;
|
||||||
|
default:
|
||||||
|
throw std::logic_error("invalid op type");
|
||||||
|
}
|
||||||
|
} else if constexpr (std::is_same_v<T, Basic>) {
|
||||||
|
return Sexp { field_sym(arg.field), arg.value };
|
||||||
|
} else if constexpr (std::is_same_v<T, Phrase>) {
|
||||||
|
return Sexp {field_sym(arg.field),
|
||||||
|
Sexp{ phrase_sym, arg.value }};
|
||||||
|
} else if constexpr (std::is_same_v<T, Regex>) {
|
||||||
|
return Sexp { field_sym(arg.field), Sexp{ regex_sym, arg.value}};
|
||||||
|
} else if constexpr (std::is_same_v<T, Wildcard>) {
|
||||||
|
return Sexp { field_sym(arg.field), Sexp{ wildcard_sym, arg.value}};
|
||||||
|
} else if constexpr (std::is_same_v<T, Range>) {
|
||||||
|
return Sexp {field_sym(arg.field),
|
||||||
|
Sexp{ range_sym, arg.value.first, arg.value.second }};
|
||||||
|
} else if constexpr (std::is_same_v<T, std::string>) {
|
||||||
|
throw std::logic_error("no bare strings should be here");
|
||||||
|
} else
|
||||||
|
throw std::logic_error("uninvited visitor");
|
||||||
|
}, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
ValueType value;
|
||||||
|
};
|
||||||
|
|
||||||
|
using Elements = std::vector<Element>;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Pop the first character off a string.
 *
 * @param[in,out] str a string; shortened by one character unless empty
 * @param[in,out] pos position in the _original_ string; advanced by one
 * when a character was consumed
 *
 * @return the consumed character, or 0 if the string was empty.
 */
static char
read_char(std::string& str, size_t& pos)
{
	if (str.empty())
		return '\0';

	const char first = str.front();
	str.erase(str.begin());
	pos += 1;

	return first;
}
|
||||||
|
|
||||||
|
/**
 * Push a character back onto the front of the string; the inverse of
 * read_char().
 *
 * @param[in,out] str a string; gets @p kar prepended
 * @param[in,out] pos position in the _original_ string; moved back by one
 * @param kar the character to put back
 */
static void
unread_char(std::string& str, size_t& pos, char kar)
{
	str.insert(str.begin(), kar);
	pos -= 1;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove the the next element from the string and return it
|
||||||
|
*
|
||||||
|
* @param[in,out] str a string
|
||||||
|
* @param[in,out] pos position in _original_ string *
|
||||||
|
*
|
||||||
|
* @return an Element or Nothing
|
||||||
|
*/
|
||||||
|
static Option<Element>
|
||||||
|
next_element(std::string& str, size_t& pos)
|
||||||
|
{
|
||||||
|
bool quoted{}, escaped{};
|
||||||
|
std::string value{};
|
||||||
|
|
||||||
|
auto is_separator = [](char c) { return c == ' '|| c == '(' || c == ')'; };
|
||||||
|
|
||||||
|
while (!str.empty()) {
|
||||||
|
|
||||||
|
auto kar = read_char(str, pos);
|
||||||
|
|
||||||
|
if (kar == '\\') {
|
||||||
|
escaped = !escaped;
|
||||||
|
if (escaped)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (kar == '"' && !escaped) {
|
||||||
|
if (!escaped && quoted)
|
||||||
|
return Element{value};
|
||||||
|
else {
|
||||||
|
quoted = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!quoted && !escaped && is_separator(kar)) {
|
||||||
|
if (!value.empty()) {
|
||||||
|
unread_char(str, pos, kar);
|
||||||
|
return Element{value};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (quoted || kar == ' ')
|
||||||
|
continue;
|
||||||
|
|
||||||
|
switch (kar) {
|
||||||
|
case '(':
|
||||||
|
return Element{Element::Bracket::Open};
|
||||||
|
case ')':
|
||||||
|
return Element{Element::Bracket::Close};
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
value += kar;
|
||||||
|
escaped = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (value.empty())
|
||||||
|
return Nothing;
|
||||||
|
else
|
||||||
|
return Element{value};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Turn a bare-string element that spells a logical operator into an Op
 * element; anything else is passed through unchanged.
 */
static Option<Element>
opify(Element&& element)
{
	// AndNot only appears during parsing.
	static const std::unordered_map<std::string, Element::Op> known_ops = {
		{ "and", Element::Op::And },
		{ "or",  Element::Op::Or},
		{ "xor", Element::Op::Xor },
		{ "not", Element::Op::Not },
	};

	auto&& word{element.get_opt<std::string>()};
	if (word) {
		// look up the flattened form of the word
		const auto found = known_ops.find(utf8_flatten(*word));
		if (found != known_ops.end())
			element.value = found->second;
	}

	return element;
}
|
||||||
|
|
||||||
|
/**
 * Turn a bare-string element into a Basic element, splitting off a
 * "field:" prefix when it names a known field or combi-field.
 *
 * For the Flags and Priority fields, the value is replaced by the name of
 * the flag / priority it resolves to. When the prefix or such a value is not
 * recognized, the whole string becomes a field-less Basic value.
 *
 * @param element an element
 *
 * @return the element; it only changes when it held a bare string
 */
static Option<Element>
basify(Element&& element)
{
	auto&& str{element.get_opt<std::string>()};
	if (!str)
		return element;

	const auto pos = str->find(':');
	if (pos == std::string::npos) {
		element.value = Element::Basic{*str};
		return element;
	}

	const auto fname{str->substr(0, pos)};
	if (auto&& field{field_from_name(fname)}; field) {
		auto val{str->substr(pos + 1)};
		if (field == Field::Id::Flags) {
			if (auto&& finfo{flag_info(val)}; finfo)
				element.value = Element::Basic{field->name,
					std::string{finfo->name}};
			else // unknown flag: keep the full string, as for priorities
				element.value = Element::Basic{*str};
		} else if (field == Field::Id::Priority) {
			if (auto&& prio{priority_from_name(val)}; prio)
				element.value = Element::Basic{field->name,
					std::string{priority_name(*prio)}};
			else
				element.value = Element::Basic{*str};
		} else
			element.value = Element::Basic{std::string{field->name}, val};
	} else if (field_is_combi(fname))
		element.value = Element::Basic{fname, str->substr(pos + 1)};
	else
		element.value = Element::Basic{*str};

	return element;
}
|
||||||
|
|
||||||
|
/**
 * Promote a Basic element whose value contains a space to a Phrase; other
 * elements are passed through unchanged.
 */
static Option<Element>
phrasify(Element&& element)
{
	auto&& basic{element.get_opt<Element::Basic>()};
	if (basic) {
		const auto& text = basic->value;
		const bool has_space = text.find(' ') != std::string::npos;
		if (has_space)
			element.value = Element::Phrase{basic->field, text};
	}

	return element;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Promote a Basic element whose value ends in '*' (and has at least one
 * character before it) to a Wildcard holding the value without the '*'.
 */
static Option<Element>
wildcardify(Element&& element)
{
	auto&& basic{element.get_opt<Element::Basic>()};
	if (!basic)
		return element;

	auto& text = basic->value;
	const bool is_wildcard = text.size() >= 2 && text.back() == '*';
	if (is_wildcard) {
		text.pop_back(); // drop the '*'
		element.value = Element::Wildcard{basic->field, text};
	}

	return element;
}
|
||||||
|
|
||||||
|
/**
 * Promote a Basic element whose value looks like /<regex>/ (non-empty
 * between the slashes) to a Regex holding the part between the slashes.
 */
static Option<Element>
regexpify(Element&& element)
{
	auto&& basic{element.get_opt<Element::Basic>()};
	if (!basic)
		return element;

	auto& text = basic->value;
	const bool slashed = text.size() >= 3 && text.front() == '/' && text.back() == '/';
	if (!slashed)
		return element;

	// strip the enclosing slashes
	text.pop_back();
	text.erase(text.begin());
	element.value = Element::Regex{basic->field, std::move(text)};

	return element;
}
|
||||||
|
|
||||||
|
// handle range-fields: Size, Date, Changed
|
||||||
|
static Option<Element>
|
||||||
|
rangify(Element&& element)
|
||||||
|
{
|
||||||
|
auto&& str{element.get_opt<Element::Basic>()};
|
||||||
|
if (!str)
|
||||||
|
return element;
|
||||||
|
|
||||||
|
if (!str->field)
|
||||||
|
return element;
|
||||||
|
|
||||||
|
auto&& field = field_from_name(*str->field);
|
||||||
|
if (!field || !field->is_range())
|
||||||
|
return element;
|
||||||
|
|
||||||
|
/* yes: get the range */
|
||||||
|
auto&& range = std::invoke([&]()->std::pair<std::string, std::string> {
|
||||||
|
const auto val{str->value};
|
||||||
|
const auto pos{val.find("..")};
|
||||||
|
|
||||||
|
if (pos == std::string::npos)
|
||||||
|
return { val, val };
|
||||||
|
else
|
||||||
|
return {val.substr(0, pos), val.substr(pos + 2)};
|
||||||
|
});
|
||||||
|
|
||||||
|
if (field->id == Field::Id::Size) {
|
||||||
|
int64_t s1{range.first.empty() ? -1 :
|
||||||
|
parse_size(range.first, false/*first*/).value_or(-1)};
|
||||||
|
int64_t s2{range.second.empty() ? -1 :
|
||||||
|
parse_size(range.second, true/*last*/).value_or(-1)};
|
||||||
|
if (s2 >= 0 && s1 > s2)
|
||||||
|
std::swap(s1, s2);
|
||||||
|
element.value = Element::Range{str->field,
|
||||||
|
{s1 < 0 ? "" : std::to_string(s1),
|
||||||
|
s2 < 0 ? "" : std::to_string(s2)}};
|
||||||
|
|
||||||
|
} else if (field->id == Field::Id::Date || field->id == Field::Id::Changed) {
|
||||||
|
auto tstamp=[](auto&& str, auto&& first)->int64_t {
|
||||||
|
return str.empty() ? -1 :
|
||||||
|
parse_date_time(str, first ,false/*local*/).value_or(-1);
|
||||||
|
};
|
||||||
|
int64_t lower{tstamp(range.first, true/*lower*/)};
|
||||||
|
int64_t upper{tstamp(range.second, false/*upper*/)};
|
||||||
|
if (lower >= 0 && upper >= 0 && lower > upper) {
|
||||||
|
// can't simply swap due to rounding up/down
|
||||||
|
lower = tstamp(range.second, true/*lower*/);
|
||||||
|
upper = tstamp(range.first, false/*upper*/);
|
||||||
|
}
|
||||||
|
// use "Zulu" time.
|
||||||
|
element.value = Element::Range{
|
||||||
|
str->field,
|
||||||
|
{lower < 0 ? "" :
|
||||||
|
mu_format("{:%FT%TZ}",mu_time(lower, true/*utc*/)),
|
||||||
|
upper < 0 ? "" :
|
||||||
|
mu_format("{:%FT%TZ}", mu_time(upper, true/*utc*/))}};
|
||||||
|
}
|
||||||
|
|
||||||
|
return element;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Split a query expression into a sequence of Elements.
 *
 * Control characters are first replaced by spaces; then elements are taken
 * from the front of the string one by one, each passing through the chain of
 * promotions (Op, Basic, Regex, Phrase, Wildcard, Range).
 *
 * @param expr a search expression
 *
 * @return the elements, in order of appearance
 */
static Elements
process(const std::string& expr)
{
	Elements elements{};
	size_t offset{0};

	/* all control chars become SPC */
	std::string str{expr};
	for (auto& c: str) {
		// iscntrl() needs a value representable as unsigned char; a
		// plain (possibly negative) char, e.g. a UTF-8 byte, is
		// undefined behavior.
		if (::iscntrl(static_cast<unsigned char>(c)))
			c = ' ';
	}

	while(!str.empty()) {
		auto&& element = next_element(str, offset)
			.and_then(opify)
			.and_then(basify)
			.and_then(regexpify)
			.and_then(phrasify)
			.and_then(wildcardify)
			.and_then(rangify);
		if (element)
			elements.emplace_back(std::move(element.value()));
	}

	return elements;
}
|
||||||
|
|
||||||
|
/* Process the expression into elements and collect their s-expressions in a
 * single Sexp, in order. */
Sexp
Mu::process_query(const std::string& expr)
{
	Sexp result{};

	for (const auto& element: ::process(expr))
		result.add(element.sexp());

	return result;
}
|
||||||
|
|
||||||
|
#ifdef BUILD_PROCESS_QUERY
/* Small command-line tool: join all arguments into one query, process it and
 * print the resulting s-expression. */
int
main (int argc, char *argv[])
{
	if (argc < 2) {
		mu_printerrln("expected: process-query <query>");
		return 1;
	}

	// every argument is followed by a space, including the last one
	std::string expr;
	for (int idx = 1; idx != argc; ++idx)
		expr.append(argv[idx]).append(" ");

	mu_println("{}", process_query(expr).to_string());

	return 0;
}
#endif /*BUILD_PROCESS_QUERY*/
|
||||||
|
|
||||||
|
#if BUILD_TESTS
|
||||||
|
/*
|
||||||
|
* Tests.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "utils/mu-test-utils.hh"
|
||||||
|
|
||||||
|
using TestCase = std::pair<std::string, std::string>;
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_processor()
|
||||||
|
{
|
||||||
|
std::vector<TestCase> cases = {
|
||||||
|
TestCase{R"(hello world)", R"(((_ "hello") (_ "world")))"},
|
||||||
|
TestCase{R"("hello world")", R"(((_ (phrase "hello world"))))"},
|
||||||
|
TestCase{R"(subject:"hello world")", R"(((subject (phrase "hello world"))))"},
|
||||||
|
// TODO: add more...
|
||||||
|
};
|
||||||
|
|
||||||
|
for (auto&& test: cases) {
|
||||||
|
auto&& sexp{process_query(test.first)};
|
||||||
|
assert_equal(sexp.to_string(), test.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
mu_test_init(&argc, &argv);
|
||||||
|
|
||||||
|
g_test_add_func("/query-parser/processor", test_processor);
|
||||||
|
|
||||||
|
return g_test_run();
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /*BUILD_TESTS*/
|
|
@ -0,0 +1,468 @@
|
||||||
|
/*
|
||||||
|
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
|
**
|
||||||
|
** This program is free software; you can redistribute it and/or modify it
|
||||||
|
** under the terms of the GNU General Public License as published by the
|
||||||
|
** Free Software Foundation; either version 3, or (at your option) any
|
||||||
|
** later version.
|
||||||
|
**
|
||||||
|
** This program is distributed in the hope that it will be useful,
|
||||||
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
** GNU General Public License for more details.
|
||||||
|
**
|
||||||
|
** You should have received a copy of the GNU General Public License
|
||||||
|
** along with this program; if not, write to the Free Software Foundation,
|
||||||
|
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||||
|
**
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
|
#include "mu-query-parser.hh"
|
||||||
|
|
||||||
|
#include <string_view>
|
||||||
|
#include <variant>
|
||||||
|
#include <array>
|
||||||
|
#include <type_traits>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "utils/mu-option.hh"
|
||||||
|
#include <glib.h>
|
||||||
|
#include "utils/mu-utils-file.hh"
|
||||||
|
|
||||||
|
using namespace Mu;
|
||||||
|
|
||||||
|
// backward compat
|
||||||
|
#ifndef HAVE_XAPIAN_FLAG_NGRAMS
|
||||||
|
#define FLAG_NGRAMS FLAG_CJK_NGRAM
|
||||||
|
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expand terms for scripts without explicit word-breaks (e.g.
|
||||||
|
* Chinese/Japanese/Korean) in the way that Xapian expects it -
|
||||||
|
* use Xapian's built-in QueryParser just for that.
|
||||||
|
*/
|
||||||
|
static Result<Xapian::Query>
|
||||||
|
ngram_expand(const Field& field, const std::string& str)
|
||||||
|
{
|
||||||
|
Xapian::QueryParser qp;
|
||||||
|
const auto pfx{std::string(1U, field.xapian_prefix())};
|
||||||
|
|
||||||
|
qp.set_default_op(Xapian::Query::OP_OR);
|
||||||
|
|
||||||
|
return qp.parse_query(str, Xapian::QueryParser::FLAG_NGRAMS, pfx);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Get the list without its first element.
 *
 * @param s a Sexp
 *
 * @return the remaining list, or Nothing if s is not a non-empty list
 */
static Option<Sexp>
tail(Sexp&& s)
{
	const bool has_head = s.listp() && !s.empty();
	if (!has_head)
		return Nothing;

	s.list().erase(s.list().begin());

	return s;
}
|
||||||
|
|
||||||
|
/**
 * Get the name of the symbol at the head of a list.
 *
 * @param s a Sexp
 *
 * @return the symbol name, or Nothing if s is not a list that starts with
 * a symbol
 */
Option<std::string>
head_symbol(const Sexp& s)
{
	if (s.listp() && !s.empty() && s.head() && s.head()->symbolp())
		return s.head()->symbol().name;

	return Nothing;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Get the string at position n of a list.
 *
 * @param args a Sexp
 * @param n 0-based index
 *
 * @return the string, or Nothing if args is not a list, is too short, or
 * the item at n is not a string
 */
Option<std::string>
string_nth(const Sexp& args, size_t n)
{
	if (!args.listp() || args.size() < n + 1)
		return Nothing;

	const auto& item = args.list().at(n);
	if (item.stringp())
		return item.string();

	return Nothing;
}
|
||||||
|
|
||||||
|
/**
 * Make a phrase query for a field.
 *
 * @param field a field; must be an indexable-term field
 * @param s the matcher arguments; expected to be a single string with
 * space-separated words
 *
 * @return an OP_PHRASE query of the words, or an error
 */
static Result<Xapian::Query>
phrase(const Field& field, Sexp&& s)
{
	if (!field.is_indexable_term())
		return Err(Error::Code::InvalidArgument,
			   "field {} does not support phrases", field.name);

	const bool single_string = s.size() == 1 && s.front().stringp();
	if (!single_string)
		return Err(Error::Code::InvalidArgument,
			   "invalid phrase for field {}: '{}'", field.name, s.to_string());

	// one term per word
	auto&& words{split(s.front().string(), " ")};
	std::vector<Xapian::Query> terms;
	terms.reserve(words.size());
	for (auto&& word: words)
		terms.emplace_back(Xapian::Query{field.xapian_term(std::move(word))});

	return Xapian::Query{Xapian::Query::OP_PHRASE, terms.begin(), terms.end()};
}
|
||||||
|
|
||||||
|
/**
 * Make a query matching all terms of a field in the store that match a
 * regular expression.
 *
 * @param store the message store, used to enumerate the field's terms
 * @param field a field
 * @param rx_str the regular expression; it is flattened before use
 *
 * @return an OR-query of the matching terms; an invalid regexp logs a
 * warning and yields a match-nothing query.
 */
static Result<Xapian::Query>
regex(const Store& store, const Field& field, const std::string& rx_str)
{
	auto&& flat{utf8_flatten(rx_str)};
	auto&& rx{Regex::make(flat, G_REGEX_OPTIMIZE)};
	if (!rx) {
		mu_warning("invalid regexp: '{}': {}", flat, rx.error().what());
		return Xapian::Query::MatchNothing;
	}

	std::vector<Xapian::Query> matches;
	store.for_each_term(field.id, [&](auto&& term) {
		// skip the first character of the term (the prefix)
		auto&& unprefixed{term.data() + 1};
		if (rx->matches(unprefixed))
			matches.emplace_back(field.xapian_term(std::string_view{unprefixed}));
		return true; // continue with the next term
	});

	return Xapian::Query(Xapian::Query::OP_OR, matches.begin(), matches.end());
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Make a value-range query for a range field (Date, Changed or Size).
 *
 * @param field the field
 * @param s the matcher arguments: two strings, lower and upper bound; an
 * empty string means the range is open on that side
 *
 * @return a query (match-all, <=, >= or range, depending on which bounds are
 * non-empty), or an error for missing bounds, unparseable dates or an
 * unsupported field
 */
static Result<Xapian::Query>
range(const Field& field, Sexp&& s)
{
	auto&& r0{string_nth(s, 0)};
	auto&& r1{string_nth(s, 1)};
	if (!r0 || !r1)
		return Err(Error::Code::InvalidArgument, "expected 2 range values");

	// in the sexp, we use iso date/time for human readability; now convert to
	// time_t
	auto iso_to_lexnum=[](const std::string& iso)->Option<std::string> {
		if (iso.empty())
			return iso;
		if (auto&& t{parse_date_time(iso, true, true/*utc*/)}; !t)
			return Nothing;
		else
			return to_lexnum(*t);
	};

	if (field == Field::Id::Date || field == Field::Id::Changed) {
		// iso -> time_t; the conversion can fail, and we must not
		// dereference an empty Option below.
		auto lower{iso_to_lexnum(*r0)};
		auto upper{iso_to_lexnum(*r1)};
		if (!lower || !upper)
			return Err(Error::Code::InvalidArgument,
				   "invalid date/time in range for field {}",
				   field.name);
		r0 = std::move(lower);
		r1 = std::move(upper);
	} else if (field == Field::Id::Size) {
		if (!r0->empty())
			r0 = to_lexnum(::atoll(r0->c_str()));
		if (!r1->empty())
			r1 = to_lexnum(::atoll(r1->c_str()));
	} else
		return Err(Error::Code::InvalidArgument,
			   "unsupported range field {}", field.name);

	if (r0->empty() && r1->empty())
		return Xapian::Query::MatchAll;
	else if (r0->empty() && !r1->empty())
		return Xapian::Query(Xapian::Query::OP_VALUE_LE,
				     field.value_no(), *r1);
	else if (!r0->empty() && r1->empty())
		return Xapian::Query(Xapian::Query::OP_VALUE_GE,
				     field.value_no(), *r0);
	else
		return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
				     field.value_no(), *r0, *r1);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
using OpPair = std::pair<const std::string_view, Xapian::Query::op>;
|
||||||
|
static constexpr std::array<OpPair, 4> LogOpPairs = {{
|
||||||
|
{ "and", Xapian::Query::OP_AND },
|
||||||
|
{ "or", Xapian::Query::OP_OR },
|
||||||
|
{ "xor", Xapian::Query::OP_XOR },
|
||||||
|
{ "not", Xapian::Query::OP_AND_NOT }
|
||||||
|
}};
|
||||||
|
|
||||||
|
static Option<Xapian::Query::op>
|
||||||
|
find_log_op(const std::string& opname)
|
||||||
|
{
|
||||||
|
for (auto&& p: LogOpPairs)
|
||||||
|
if (p.first == opname)
|
||||||
|
return p.second;
|
||||||
|
|
||||||
|
return Nothing;
|
||||||
|
}
|
||||||
|
|
||||||
|
static Result<Xapian::Query> parse(const Store& store, Sexp&& s, Mu::ParserFlags flags);
|
||||||
|
|
||||||
|
/**
 * Make a query for a logical operator and its arguments.
 *
 * @param store the message store
 * @param op the Xapian operator; OP_AND_NOT (for "not") takes exactly one
 * argument and is turned into "everything AND NOT argument"
 * @param args non-empty list of sub-expressions
 * @param flags parser flags
 *
 * @return the query, or the first error seen
 */
static Result<Xapian::Query>
parse_logop(const Store& store, Xapian::Query::op op, Sexp&& args, Mu::ParserFlags flags)
{
	if (!args.listp() || args.empty())
		return Err(Error::Code::InvalidArgument,
			   "expected non-empty list but got {}", args.to_string());

	std::vector<Xapian::Query> qs;
	qs.reserve(args.size());
	for (auto&& elm: args.list()) {
		if (auto&& q{parse(store, std::move(elm), flags)}; !q)
			return Err(std::move(q.error()));
		else
			qs.emplace_back(std::move(*q));
	}

	switch(op) {
	case Xapian::Query::OP_AND_NOT:
		if (qs.size() != 1)
			return Err(Error::Code::InvalidArgument,
				   "expected single argument for NOT");
		else
			return Xapian::Query{op, Xapian::Query::MatchAll, qs.at(0)};

	case Xapian::Query::OP_AND:
	case Xapian::Query::OP_OR:
	case Xapian::Query::OP_XOR:
		return Xapian::Query(op, qs.begin(), qs.end());

	default:
		return Err(Error::Code::InvalidArgument, "unexpected xapian op");
	}
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Make a query for a field-matcher, i.e. the inner part of
 * (field (matcher args...)).
 *
 * @param store the message store (used for regex matchers)
 * @param field the field
 * @param match_sym the matcher symbol name: wildcard, range, regex or phrase
 * @param args the matcher arguments
 *
 * @return the query or an error
 */
static Result<Xapian::Query>
parse_field_matcher(const Store& store, const Field& field,
		    const std::string& match_sym, Sexp&& args)
{
	auto&& first_arg{string_nth(args, 0)};

	// wildcard, range and regex need a leading string argument
	if (first_arg) {
		if (match_sym == wildcard_sym.name)
			return Xapian::Query{Xapian::Query::OP_WILDCARD,
					     field.xapian_term(*first_arg)};
		if (match_sym == range_sym.name)
			return range(field, std::move(args));
		if (match_sym == regex_sym.name)
			return regex(store, field, *first_arg);
	}

	// phrase() validates its own arguments
	if (match_sym == phrase_sym.name)
		return phrase(field, std::move(args));

	return Err(Error::Code::InvalidArgument,
		   "invalid field '{}'/'{}' matcher: {}",
		   field.name, match_sym, args.to_string());
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Make a query for a plain field value, i.e. (field "value").
 *
 * @param field the field
 * @param vals the value; must be a string
 * @param flags parser flags; with SupportNgrams, values containing unbroken
 * script are expanded through ngram_expand() for indexable-term fields
 *
 * @return the query or an error
 */
static Result<Xapian::Query>
parse_basic(const Field &field, Sexp &&vals, Mu::ParserFlags flags)
{
	const auto with_ngrams = any_of(flags & ParserFlags::SupportNgrams);
	if (!vals.stringp())
		return Err(Error::Code::InvalidArgument, "expected string");

	auto&& text{vals.string()};

	if (field.id == Field::Id::Flags) {
		// flags are stored under their (lower-case) shortcut
		auto&& finfo{flag_info(text)};
		if (!finfo)
			return Err(Error::Code::InvalidArgument,
				   "invalid flag '{}'", text);
		return Xapian::Query{field.xapian_term(finfo->shortcut_lower())};
	}

	if (field.id == Field::Id::Priority) {
		auto&& prio{priority_from_name(text)};
		if (!prio)
			return Err(Error::Code::InvalidArgument,
				   "invalid priority '{}'", text);
		return Xapian::Query{field.xapian_term(to_char(*prio))};
	}

	auto plain{Xapian::Query{field.xapian_term(text)}};
	// special case: cjk; see if we can create an expanded query.
	if (with_ngrams && field.is_indexable_term() && contains_unbroken_script(text)) {
		if (auto&& expanded{ngram_expand(field, text)}; expanded)
			return expanded;
	}

	return plain;
}
|
||||||
|
|
||||||
|
/**
 * Turn a parsed s-expression into a Xapian query.
 *
 * Accepts either (logop expr...) or (field "value") or
 * (field (matcher args...)).
 *
 * @param store the message store
 * @param s the s-expression
 * @param flags parser flags
 *
 * @return the query or an error
 */
static Result<Xapian::Query>
parse(const Store& store, Sexp&& s, Mu::ParserFlags flags)
{
	auto&& headsym{head_symbol(s)};
	if (!headsym)
		return Err(Error::Code::InvalidArgument,
			   "expected (symbol ...) but got {}", s.to_string());

	// ie., something like (or|and| ... ....)
	if (auto&& logop{find_log_op(*headsym)}; logop) {
		auto&& args{tail(std::move(s))};
		if (args)
			return parse_logop(store, *logop, std::move(*args), flags);

		return Err(Error::Code::InvalidArgument,
			   "expected (logop ...) but got {}",
			   s.to_string());
	}

	// otherwise, it must be something like (field ...)
	auto&& field{field_from_name(*headsym)};
	if (!field)
		return Err(Error::Code::InvalidArgument,
			   "unexpected sexp {}", s.to_string());

	auto&& rest{tail(std::move(s))};
	if (!rest || rest->empty())
		return Err(Error::Code::InvalidArgument,
			   "expected field-value or field-matcher");

	auto&& matcher{rest->front()};
	// field-value: (field "value"); ensure "value" is there
	if (matcher.stringp())
		return parse_basic(*field, std::move(matcher), flags);

	// otherwise, we expect a field-matcher, e.g. (field (phrase "a b c"))
	// ensure the matcher is a list starting with a symbol
	auto&& match_sym{head_symbol(matcher)};
	if (!match_sym)
		return Err(Error::Code::InvalidArgument,
			   "expected field-matcher");

	auto&& args{tail(std::move(matcher))};
	if (!args)
		return Err(Error::Code::InvalidArgument, "expected matcher arguments");

	return parse_field_matcher(store, *field, *match_sym, std::move(*args));
}
|
||||||
|
|
||||||
|
|
||||||
|
// parse the way Xapian's internal parser does it; for testing.
|
||||||
|
static Xapian::Query
|
||||||
|
xapian_query_classic(const std::string& expr, Mu::ParserFlags flags)
|
||||||
|
{
|
||||||
|
Xapian::QueryParser xqp;
|
||||||
|
|
||||||
|
// add prefixes
|
||||||
|
field_for_each([&](auto&& field){
|
||||||
|
|
||||||
|
if (!field.is_searchable())
|
||||||
|
return;
|
||||||
|
|
||||||
|
const auto prefix{std::string(1U, field.xapian_prefix())};
|
||||||
|
std::vector<std::string> names = {
|
||||||
|
std::string{field.name},
|
||||||
|
std::string(1U, field.shortcut)
|
||||||
|
};
|
||||||
|
if (!field.alias.empty())
|
||||||
|
names.emplace_back(std::string{field.alias});
|
||||||
|
|
||||||
|
for (auto&& name: names)
|
||||||
|
xqp.add_prefix(name, prefix);
|
||||||
|
});
|
||||||
|
|
||||||
|
const auto xflags = std::invoke([&]() {
|
||||||
|
unsigned f = Xapian::QueryParser::FLAG_PHRASE |
|
||||||
|
Xapian::QueryParser::FLAG_BOOLEAN |
|
||||||
|
Xapian::QueryParser::FLAG_WILDCARD;
|
||||||
|
if (any_of(flags & ParserFlags::SupportNgrams)) {
|
||||||
|
#if HAVE_XAPIAN_FLAG_NGRAMS
|
||||||
|
f |= Xapian::QueryParser::FLAG_NGRAMS;
|
||||||
|
#else
|
||||||
|
f |= Xapian::QueryParser::FLAG_CJK_NGRAM;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
return f;
|
||||||
|
});
|
||||||
|
|
||||||
|
xqp.set_default_op(Xapian::Query::OP_AND);
|
||||||
|
return xqp.parse_query(expr, xflags);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Turn a query expression into a Xapian query.
 *
 * When the XapianParser flag is set, the expression is handed to Xapian's
 * own query parser; otherwise it is parsed with mu's parser (with expansion
 * enabled) and the result is converted into a Xapian query.
 *
 * This function is noexcept, but Xapian reports problems (e.g. a query-parser
 * error) by throwing; such exceptions are caught here and turned into an
 * error result rather than terminating the program.
 *
 * @param store the message store
 * @param expr  the query expression
 * @param flags parser flags
 *
 * @return the Xapian query, or an error
 */
Result<Xapian::Query>
Mu::make_xapian_query(const Store& store, const std::string& expr, Mu::ParserFlags flags) noexcept
{
	try {
		if (any_of(flags & Mu::ParserFlags::XapianParser))
			return xapian_query_classic(expr, flags);

		return parse(store, Mu::parse_query(expr, true/*expand*/), flags);

	} catch (const Xapian::Error& xerr) {
		return Err(Error::Code::InvalidArgument,
			   "failed to parse query '{}': {}", expr, xerr.get_msg());
	}
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef BUILD_XAPIANIZE_QUERY
|
||||||
|
int
|
||||||
|
main (int argc, char *argv[])
|
||||||
|
{
|
||||||
|
if (argc < 2) {
|
||||||
|
mu_printerrln("expected: parse-query <query>");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto store = Store::make(runtime_path(Mu::RuntimePath::XapianDb));
|
||||||
|
if (!store) {
|
||||||
|
mu_printerrln("error: {}", store.error());
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string expr;
|
||||||
|
for (auto i = 1; i < argc; ++i) {
|
||||||
|
expr += argv[i];
|
||||||
|
expr += " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (auto&& query{make_xapian_query(*store, expr)}; !query) {
|
||||||
|
mu_printerrln("error: {}", query.error());
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
mu_println("{}", query->get_description());
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif /*BUILD_XAPIANIZE_QUERY*/
|
||||||
|
|
||||||
|
#if BUILD_TESTS
|
||||||
|
/*
|
||||||
|
* Tests.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "utils/mu-test-utils.hh"
|
||||||
|
|
||||||
|
using TestCase = std::pair<std::string, std::string>;
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_xapian()
|
||||||
|
{
|
||||||
|
auto&& testhome{unwrap(make_temp_dir())};
|
||||||
|
auto&& dbpath{runtime_path(RuntimePath::XapianDb, testhome)};
|
||||||
|
auto&& store{unwrap(Store::make_new(dbpath, join_paths(testhome, "test-maildir")))};
|
||||||
|
|
||||||
|
std::vector<TestCase> cases = {
|
||||||
|
TestCase{R"(i:87h766tzzz.fsf@gnus.org)", R"(Query(I87h766tzzz.fsf@gnus.org))"},
|
||||||
|
TestCase{R"(subject:foo to:bar)", R"(Query((Sfoo AND Tbar)))"},
|
||||||
|
TestCase{R"(subject:"cuux*")", R"(Query(WILDCARD SYNONYM Scuux))"},
|
||||||
|
TestCase{R"(subject:"hello world")", R"(Query((Shello PHRASE 2 Sworld)))"},
|
||||||
|
TestCase{R"(subject:/boo/")", R"(Query())"},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (auto&& test: cases) {
|
||||||
|
auto&& xq{make_xapian_query(store, test.first)};
|
||||||
|
assert_valid_result(xq);
|
||||||
|
|
||||||
|
mu_println("'{}' <=> '{}'", xq->get_description(), test.second);
|
||||||
|
assert_equal(xq->get_description(), test.second);
|
||||||
|
}
|
||||||
|
|
||||||
|
remove_directory(testhome);
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
mu_test_init(&argc, &argv);
|
||||||
|
|
||||||
|
Xapian::QueryParser qp;
|
||||||
|
g_test_add_func("/query-parser/xapianizer", test_xapian);
|
||||||
|
|
||||||
|
return g_test_run();
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /*BUILD_TESTS*/
|
|
@ -32,15 +32,17 @@
|
||||||
#include "mu-query-results.hh"
|
#include "mu-query-results.hh"
|
||||||
#include "mu-query-match-deciders.hh"
|
#include "mu-query-match-deciders.hh"
|
||||||
#include "mu-query-threads.hh"
|
#include "mu-query-threads.hh"
|
||||||
#include <mu-xapian.hh>
|
|
||||||
#include "mu-xapian-db.hh"
|
#include "mu-xapian-db.hh"
|
||||||
|
|
||||||
|
#include "mu-query-parser.hh"
|
||||||
|
|
||||||
using namespace Mu;
|
using namespace Mu;
|
||||||
|
|
||||||
struct Query::Private {
|
struct Query::Private {
|
||||||
Private(const Store& store) : store_{store}, parser_{store_} {}
|
Private(const Store& store) :
|
||||||
// New
|
store_{store},
|
||||||
// bool calculate_threads (Xapian::Enquire& enq, size maxnum);
|
parser_flags_{any_of(store_.message_options() & Message::Options::SupportNgrams) ?
|
||||||
|
ParserFlags::SupportNgrams : ParserFlags::None} {}
|
||||||
|
|
||||||
Xapian::Enquire make_enquire(const std::string& expr, Field::Id sortfield_id,
|
Xapian::Enquire make_enquire(const std::string& expr, Field::Id sortfield_id,
|
||||||
QueryFlags qflags) const;
|
QueryFlags qflags) const;
|
||||||
|
@ -61,7 +63,7 @@ struct Query::Private {
|
||||||
Field::Id sortfield_id, QueryFlags qflags,
|
Field::Id sortfield_id, QueryFlags qflags,
|
||||||
size_t maxnum) const;
|
size_t maxnum) const;
|
||||||
const Store& store_;
|
const Store& store_;
|
||||||
const Parser parser_;
|
const ParserFlags parser_flags_;
|
||||||
};
|
};
|
||||||
|
|
||||||
Query::Query(const Store& store) : priv_{std::make_unique<Private>(store)} {}
|
Query::Query(const Store& store) : priv_{std::make_unique<Private>(store)} {}
|
||||||
|
@ -79,22 +81,27 @@ sort_enquire(Xapian::Enquire& enq, Field::Id sortfield_id, QueryFlags qflags)
|
||||||
return enq;
|
return enq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Make a Xapian query for an expression. An empty expression (or just a pair
 * of double-quotes) matches everything; an expression that cannot be turned
 * into a query is logged as a warning and matches nothing.
 */
static Xapian::Query
make_query(const Store& store, const std::string& expr, ParserFlags parser_flags)
{
	if (expr.empty() || expr == R"("")")
		return Xapian::Query::MatchAll;

	auto&& xq{make_xapian_query(store, expr, parser_flags)};
	if (xq)
		return xq.value();

	mu_warning("error in query '{}': {}", expr, xq.error().what());
	return Xapian::Query::MatchNothing;
}
|
||||||
|
|
||||||
Xapian::Enquire
|
Xapian::Enquire
|
||||||
Query::Private::make_enquire(const std::string& expr,
|
Query::Private::make_enquire(const std::string& expr,
|
||||||
Field::Id sortfield_id,
|
Field::Id sortfield_id,
|
||||||
QueryFlags qflags) const
|
QueryFlags qflags) const
|
||||||
{
|
{
|
||||||
auto enq{store_.xapian_db().enquire()};
|
auto enq{store_.xapian_db().enquire()};
|
||||||
if (expr.empty() || expr == R"("")")
|
enq.set_query(make_query(store_, expr, parser_flags_));
|
||||||
enq.set_query(Xapian::Query::MatchAll);
|
|
||||||
else {
|
|
||||||
WarningVec warns;
|
|
||||||
const auto tree{parser_.parse(expr, warns)};
|
|
||||||
for (auto&& w : warns)
|
|
||||||
mu_warning("query warning: {}", to_string(w));
|
|
||||||
enq.set_query(xapian_query(tree));
|
|
||||||
}
|
|
||||||
|
|
||||||
sort_enquire(enq, sortfield_id, qflags);
|
sort_enquire(enq, sortfield_id, qflags);
|
||||||
|
|
||||||
return enq;
|
return enq;
|
||||||
|
@ -122,8 +129,7 @@ Query::Private::make_related_enquire(const StringSet& thread_ids,
|
||||||
|
|
||||||
struct ThreadKeyMaker : public Xapian::KeyMaker {
|
struct ThreadKeyMaker : public Xapian::KeyMaker {
|
||||||
ThreadKeyMaker(const QueryMatches& matches) : match_info_(matches) {}
|
ThreadKeyMaker(const QueryMatches& matches) : match_info_(matches) {}
|
||||||
std::string operator()(const Xapian::Document& doc) const override
|
std::string operator()(const Xapian::Document& doc) const override {
|
||||||
{
|
|
||||||
const auto it{match_info_.find(doc.get_docid())};
|
const auto it{match_info_.find(doc.get_docid())};
|
||||||
return (it == match_info_.end()) ? "" : it->second.thread_path;
|
return (it == match_info_.end()) ? "" : it->second.thread_path;
|
||||||
}
|
}
|
||||||
|
@ -257,10 +263,13 @@ Query::run(const std::string& expr, Field::Id sortfield_id,
|
||||||
g_return_val_if_fail(none_of(qflags & QueryFlags::Leader),
|
g_return_val_if_fail(none_of(qflags & QueryFlags::Leader),
|
||||||
Err(Error::Code::InvalidArgument, "cannot pass Leader flag"));
|
Err(Error::Code::InvalidArgument, "cannot pass Leader flag"));
|
||||||
|
|
||||||
StopWatch sw{mu_format(
|
StopWatch sw{
|
||||||
"ran query '{}'; related: {}; threads: {}; max-size: {}", expr,
|
mu_format("query: '{}'; (related:{}; threads:{}; ngrams:{}; max-size:{})",
|
||||||
any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no",
|
expr,
|
||||||
any_of(qflags & QueryFlags::Threading) ? "yes" : "no", maxnum)};
|
any_of(qflags & QueryFlags::IncludeRelated) ? "yes" : "no",
|
||||||
|
any_of(qflags & QueryFlags::Threading) ? "yes" : "no",
|
||||||
|
any_of(priv_->parser_flags_ & ParserFlags::SupportNgrams) ? "yes" : "no",
|
||||||
|
maxnum == 0 ? std::string{"∞"} : std::to_string(maxnum))};
|
||||||
|
|
||||||
return xapian_try_result([&]{
|
return xapian_try_result([&]{
|
||||||
if (auto&& res = priv_->run(expr, sortfield_id, qflags, maxnum); res)
|
if (auto&& res = priv_->run(expr, sortfield_id, qflags, maxnum); res)
|
||||||
|
@ -288,14 +297,10 @@ Query::count(const std::string& expr) const
|
||||||
std::string
|
std::string
|
||||||
Query::parse(const std::string& expr, bool xapian) const
|
Query::parse(const std::string& expr, bool xapian) const
|
||||||
{
|
{
|
||||||
WarningVec warns;
|
|
||||||
const auto tree{priv_->parser_.parse(expr, warns)};
|
|
||||||
for (auto&& w : warns)
|
|
||||||
mu_warning("query warning: {}", to_string(w));
|
|
||||||
|
|
||||||
if (xapian)
|
if (xapian)
|
||||||
return xapian_query(tree).get_description();
|
return make_query(priv_->store_, expr,
|
||||||
|
priv_->parser_flags_).get_description();
|
||||||
else
|
else
|
||||||
return to_string(tree);
|
return parse_query(expr).to_string();
|
||||||
}
|
}
|
||||||
/* LCOV_EXCL_STOP*/
|
/* LCOV_EXCL_STOP*/
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
** Copyright (C) 2008-2021 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
** Copyright (C) 2008-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
**
|
**
|
||||||
** This program is free software; you can redistribute it and/or modify
|
** This program is free software; you can redistribute it and/or modify
|
||||||
** it under the terms of the GNU General Public License as published by
|
** it under the terms of the GNU General Public License as published by
|
||||||
|
|
|
@ -70,7 +70,8 @@ struct Store::Private {
|
||||||
: XapianDb::Flavor::Open)},
|
: XapianDb::Flavor::Open)},
|
||||||
config_{xapian_db_},
|
config_{xapian_db_},
|
||||||
contacts_cache_{config_},
|
contacts_cache_{config_},
|
||||||
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())}
|
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())},
|
||||||
|
message_opts_{make_message_options(config_)}
|
||||||
{}
|
{}
|
||||||
|
|
||||||
Private(const std::string& path, const std::string& root_maildir,
|
Private(const std::string& path, const std::string& root_maildir,
|
||||||
|
@ -78,7 +79,8 @@ struct Store::Private {
|
||||||
xapian_db_{XapianDb(path, XapianDb::Flavor::CreateOverwrite)},
|
xapian_db_{XapianDb(path, XapianDb::Flavor::CreateOverwrite)},
|
||||||
config_{make_config(xapian_db_, root_maildir, conf)},
|
config_{make_config(xapian_db_, root_maildir, conf)},
|
||||||
contacts_cache_{config_},
|
contacts_cache_{config_},
|
||||||
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())}
|
root_maildir_{remove_slash(config_.get<Config::Id::RootMaildir>())},
|
||||||
|
message_opts_{make_message_options(config_)}
|
||||||
{}
|
{}
|
||||||
|
|
||||||
~Private() try {
|
~Private() try {
|
||||||
|
@ -133,6 +135,13 @@ struct Store::Private {
|
||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Derive the message options from the configuration: n-gram support is
// enabled exactly when the SupportNgrams configuration item is set.
Message::Options make_message_options(const Config& conf) {
	return conf.get<Config::Id::SupportNgrams>() ?
		Message::Options::SupportNgrams :
		Message::Options::None;
}
|
||||||
|
|
||||||
Option<Message> find_message_unlocked(Store::Id docid) const;
|
Option<Message> find_message_unlocked(Store::Id docid) const;
|
||||||
Store::IdVec find_duplicates_unlocked(const Store& store,
|
Store::IdVec find_duplicates_unlocked(const Store& store,
|
||||||
const std::string& message_id) const;
|
const std::string& message_id) const;
|
||||||
|
@ -150,7 +159,8 @@ struct Store::Private {
|
||||||
ContactsCache contacts_cache_;
|
ContactsCache contacts_cache_;
|
||||||
std::unique_ptr<Indexer> indexer_;
|
std::unique_ptr<Indexer> indexer_;
|
||||||
|
|
||||||
const std::string root_maildir_;
|
const std::string root_maildir_;
|
||||||
|
const Message::Options message_opts_;
|
||||||
|
|
||||||
size_t transaction_size_{};
|
size_t transaction_size_{};
|
||||||
std::mutex lock_;
|
std::mutex lock_;
|
||||||
|
@ -340,6 +350,11 @@ Store::add_message(Message& msg, bool use_transaction, bool is_new)
|
||||||
if (auto&& res = msg.set_maildir(mdir.value()); !res)
|
if (auto&& res = msg.set_maildir(mdir.value()); !res)
|
||||||
return Err(res.error());
|
return Err(res.error());
|
||||||
|
|
||||||
|
// we shouldn't mix ngrams/non-ngrams messages.
|
||||||
|
if (any_of(msg.options() & Message::Options::SupportNgrams) !=
|
||||||
|
any_of(message_options() & Message::Options::SupportNgrams))
|
||||||
|
return Err(Error::Code::InvalidArgument, "incompatible message options");
|
||||||
|
|
||||||
/* add contacts from this message to cache; this cache
|
/* add contacts from this message to cache; this cache
|
||||||
* also determines whether those contacts are _personal_, i.e. match
|
* also determines whether those contacts are _personal_, i.e. match
|
||||||
* our personal addresses.
|
* our personal addresses.
|
||||||
|
@ -371,6 +386,16 @@ Store::add_message(Message& msg, bool use_transaction, bool is_new)
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Create a message from the file at @path, using this store's message
// options, and add it to the store; a failure to create the message is
// returned as the error.
Result<Store::Id>
Store::add_message(const std::string& path, bool use_transaction, bool is_new)
{
	auto msg{Message::make_from_path(path, priv_->message_opts_)};
	if (msg)
		return add_message(msg.value(), use_transaction, is_new);

	return Err(msg.error());
}
|
||||||
|
|
||||||
|
|
||||||
bool
|
bool
|
||||||
Store::remove_message(const std::string& path)
|
Store::remove_message(const std::string& path)
|
||||||
{
|
{
|
||||||
|
@ -649,3 +674,9 @@ Store::maildirs() const
|
||||||
|
|
||||||
return mdirs;
|
return mdirs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get the message options that were determined when the store was set up.
Message::Options
Store::message_options() const
{
	const auto opts{priv_->message_opts_};

	return opts;
}
|
||||||
|
|
|
@ -207,21 +207,7 @@ public:
|
||||||
Result<Id> add_message(Message& msg, bool use_transaction = false,
|
Result<Id> add_message(Message& msg, bool use_transaction = false,
|
||||||
bool is_new = false);
|
bool is_new = false);
|
||||||
Result<Id> add_message(const std::string& path, bool use_transaction = false,
|
Result<Id> add_message(const std::string& path, bool use_transaction = false,
|
||||||
bool is_new = false) {
|
bool is_new = false);
|
||||||
if (auto msg{Message::make_from_path(path)}; !msg)
|
|
||||||
return Err(msg.error());
|
|
||||||
else
|
|
||||||
return add_message(msg.value(), use_transaction, is_new);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Update a message in the store.
|
|
||||||
*
|
|
||||||
* @param msg a message
|
|
||||||
* @param id the id for this message
|
|
||||||
*
|
|
||||||
* @return Ok() or an error.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove a message from the store. It will _not_ remove the message
|
* Remove a message from the store. It will _not_ remove the message
|
||||||
|
@ -258,7 +244,6 @@ public:
|
||||||
*/
|
*/
|
||||||
Option<Message> find_message(Id id) const;
|
Option<Message> find_message(Id id) const;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find the messages for the given ids
|
* Find the messages for the given ids
|
||||||
*
|
*
|
||||||
|
@ -288,7 +273,6 @@ public:
|
||||||
*/
|
*/
|
||||||
bool contains_message(const std::string& path) const;
|
bool contains_message(const std::string& path) const;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Options for moving
|
* Options for moving
|
||||||
*
|
*
|
||||||
|
@ -437,6 +421,15 @@ public:
|
||||||
*/
|
*/
|
||||||
std::vector<std::string> maildirs() const;
|
std::vector<std::string> maildirs() const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compatible message-options for this store
|
||||||
|
*
|
||||||
|
* @return message-options.
|
||||||
|
*/
|
||||||
|
Message::Options message_options() const;
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* _almost_ private
|
* _almost_ private
|
||||||
*/
|
*/
|
||||||
|
@ -474,6 +467,13 @@ private:
|
||||||
MU_ENABLE_BITOPS(Store::Options);
|
MU_ENABLE_BITOPS(Store::Options);
|
||||||
MU_ENABLE_BITOPS(Store::MoveOptions);
|
MU_ENABLE_BITOPS(Store::MoveOptions);
|
||||||
|
|
||||||
|
static inline std::string
|
||||||
|
format_as(const Store& store)
|
||||||
|
{
|
||||||
|
return mu_format("store ({}/{})", format_as(store.xapian_db()),
|
||||||
|
store.root_maildir());
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace Mu
|
} // namespace Mu
|
||||||
|
|
||||||
#endif /* __MU_STORE_HH__ */
|
#endif /* __MU_STORE_HH__ */
|
||||||
|
|
|
@ -1,129 +0,0 @@
|
||||||
/*
|
|
||||||
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
|
||||||
**
|
|
||||||
** This library is free software; you can redistribute it and/or
|
|
||||||
** modify it under the terms of the GNU Lesser General Public License
|
|
||||||
** as published by the Free Software Foundation; either version 2.1
|
|
||||||
** of the License, or (at your option) any later version.
|
|
||||||
**
|
|
||||||
** This library is distributed in the hope that it will be useful,
|
|
||||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
** Lesser General Public License for more details.
|
|
||||||
**
|
|
||||||
** You should have received a copy of the GNU Lesser General Public
|
|
||||||
** License along with this library; if not, write to the Free
|
|
||||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
|
||||||
** 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "mu-tokenizer.hh"
|
|
||||||
#include "utils/mu-utils.hh"
|
|
||||||
|
|
||||||
#include <cctype>
|
|
||||||
#include <iostream>
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
using namespace Mu;
|
|
||||||
|
|
||||||
/**
 * Is this character a token separator? Separators are blanks (space, tab)
 * and the parentheses '(' and ')'.
 *
 * @param c a character
 *
 * @return true if c is a separator; false otherwise
 */
static bool
is_separator(char c)
{
	// the <cctype> functions require a value representable as unsigned
	// char; plain char may be negative for non-ASCII (UTF-8) bytes.
	if (std::isblank(static_cast<unsigned char>(c)))
		return true;

	return c == '(' || c == ')';
}
|
|
||||||
|
|
||||||
/**
 * Make a token for a bare word: "and", "or", "xor" and "not" (in any case)
 * become the corresponding operator tokens, anything else becomes a data
 * token. The token always carries the word as it was written.
 *
 * @param pos position for the token
 * @param val the word
 *
 * @return the token
 */
static Mu::Token
op_or_value(size_t pos, const std::string& val)
{
	// lowercase a copy for the comparison; go through unsigned char since
	// tolower is undefined for negative values (non-ASCII bytes).
	std::string lowered{val};
	std::transform(lowered.begin(), lowered.end(), lowered.begin(),
		       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

	auto type{Token::Type::Data};
	if (lowered == "and")
		type = Token::Type::And;
	else if (lowered == "or")
		type = Token::Type::Or;
	else if (lowered == "xor")
		type = Token::Type::Xor;
	else if (lowered == "not")
		type = Token::Type::Not;

	return Token{pos, type, val};
}
|
|
||||||
|
|
||||||
// Push a character back onto the front of the not-yet-consumed input, and
// move the position back by one.
static void
unread_char(std::string& food, char kar, size_t& pos)
{
	food.insert(food.begin(), kar);
	pos -= 1;
}
|
|
||||||
|
|
||||||
// Consume characters from the front of @food until one token is complete and
// return that token; @pos is advanced for every consumed character.
static Mu::Token
eat_token(std::string& food, size_t& pos)
{
	bool        in_quotes{false};
	bool        after_backslash{false};
	std::string data;

	while (!food.empty()) {
		// take the next character off the input
		const char ch{food.front()};
		food.erase(food.begin());
		++pos;

		// a backslash toggles the escape state; the backslash that
		// turns escaping on is itself not part of the value
		if (ch == '\\') {
			after_backslash = !after_backslash;
			if (after_backslash)
				continue;
		}

		if (ch == '"') {
			// an unescaped quote while quoting ends the token...
			if (in_quotes && !after_backslash)
				return Token{pos, Token::Type::Data, data};
			// ... any other quote switches quoting on
			in_quotes = true;
			continue;
		}

		if (!in_quotes && !after_backslash && is_separator(ch)) {
			// a separator ends a pending value; put the separator
			// back so the next call sees it
			if (!data.empty() && ch != ':') {
				unread_char(food, ch, pos);
				return op_or_value(pos, data);
			}

			if (ch == '(')
				return {pos, Token::Type::Open, "("};
			if (ch == ')')
				return {pos, Token::Type::Close, ")"};
			if (isblank(ch)) // skip blanks before a token
				continue;
		}

		data += ch;
		after_backslash = false;
	}

	// input exhausted; whatever was collected is data
	return {pos, Token::Type::Data, data};
}
|
|
||||||
|
|
||||||
// Split a string into tokens: the string is passed through utf8_clean, after
// which tokens are taken off the front until nothing is left.
Mu::Tokens
Mu::tokenize(const std::string& s)
{
	if (s.empty())
		return {};

	std::string remaining = utf8_clean(s);
	size_t      pos{0};
	Tokens      result{};

	while (!remaining.empty())
		result.emplace_back(eat_token(remaining, pos));

	return result;
}
|
|
|
@ -1,139 +0,0 @@
|
||||||
/*
|
|
||||||
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
|
||||||
**
|
|
||||||
** This library is free software; you can redistribute it and/or
|
|
||||||
** modify it under the terms of the GNU Lesser General Public License
|
|
||||||
** as published by the Free Software Foundation; either version 2.1
|
|
||||||
** of the License, or (at your option) any later version.
|
|
||||||
**
|
|
||||||
** This library is distributed in the hope that it will be useful,
|
|
||||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
** Lesser General Public License for more details.
|
|
||||||
**
|
|
||||||
** You should have received a copy of the GNU Lesser General Public
|
|
||||||
** License along with this library; if not, write to the Free
|
|
||||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
|
||||||
** 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef __TOKENIZER_HH__
|
|
||||||
#define __TOKENIZER_HH__
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <deque>
|
|
||||||
#include <ostream>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
// A simple tokenizer, which turns a string into a deque of tokens
|
|
||||||
//
|
|
||||||
// It recognizes '(', ')', '*' 'and', 'or', 'xor', 'not'
|
|
||||||
//
|
|
||||||
// Note that even if we recognizes those at the lexical level, they might be demoted to mere strings
|
|
||||||
// when we're creating the parse tree.
|
|
||||||
//
|
|
||||||
// Furthermore, we detect ranges ("a..b") and regexps (/../) at the parser level, since we need a
|
|
||||||
// bit more context to resolve ambiguities.
|
|
||||||
|
|
||||||
namespace Mu {
|
|
||||||
|
|
||||||
// A token: its position, its type and the string it was made from.
struct Token {
	enum class Type {
		Data, /**< e.g., banana or date:..456 */

		// Brackets
		Open,  /**< ( */
		Close, /**< ) */

		// Unops
		Not, /**< logical not */

		// Binops
		And, /**< logical and */
		Or,  /**< logical or */
		Xor, /**< logical xor */

		Empty, /**< nothing */
	};

	size_t            pos{};  /**< position in string */
	Type              type{}; /**< token type */
	const std::string str{};  /**< data for this token */

	/**
	 * operator==
	 *
	 * @param rhs right-hand side
	 *
	 * @return true if rhs is equal to this (same position, type and
	 * string); false otherwise
	 */
	bool operator==(const Token& rhs) const
	{
		return pos == rhs.pos && type == rhs.type && str == rhs.str;
	}
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* operator<<
|
|
||||||
*
|
|
||||||
* @param os an output stream
|
|
||||||
* @param t a token type
|
|
||||||
*
|
|
||||||
* @return the updated output stream
|
|
||||||
*/
|
|
||||||
inline std::ostream&
|
|
||||||
operator<<(std::ostream& os, Token::Type t)
|
|
||||||
{
|
|
||||||
switch (t) {
|
|
||||||
case Token::Type::Data: os << "<data>"; break;
|
|
||||||
|
|
||||||
case Token::Type::Open: os << "<open>"; break;
|
|
||||||
case Token::Type::Close: os << "<close>"; break;
|
|
||||||
|
|
||||||
case Token::Type::Not: os << "<not>"; break;
|
|
||||||
case Token::Type::And: os << "<and>"; break;
|
|
||||||
case Token::Type::Or: os << "<or>"; break;
|
|
||||||
case Token::Type::Xor: os << "<xor>"; break;
|
|
||||||
case Token::Type::Empty: os << "<empty>"; break;
|
|
||||||
default: // can't happen, but pacify compiler
|
|
||||||
throw std::runtime_error("<<bug>>");
|
|
||||||
}
|
|
||||||
|
|
||||||
return os;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* operator<<
|
|
||||||
*
|
|
||||||
* @param os an output stream
|
|
||||||
* @param t a token
|
|
||||||
*
|
|
||||||
* @return the updated output stream
|
|
||||||
*/
|
|
||||||
inline std::ostream&
|
|
||||||
operator<<(std::ostream& os, const Token& t)
|
|
||||||
{
|
|
||||||
os << t.pos << ": " << t.type;
|
|
||||||
|
|
||||||
if (!t.str.empty())
|
|
||||||
os << " [" << t.str << "]";
|
|
||||||
|
|
||||||
return os;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Tokenize a string into a deque of tokens. The tokenization always succeeds, i.e., it ignores
 * errors such as a missing end-".
 *
 * @param s a string
 *
 * @return a deque of tokens
 */
|
|
||||||
using Tokens = std::deque<Token>;
|
|
||||||
Tokens tokenize(const std::string& s);
|
|
||||||
|
|
||||||
} // namespace Mu
|
|
||||||
|
|
||||||
#endif /* __TOKENIZER_HH__ */
|
|
162
lib/mu-tree.hh
162
lib/mu-tree.hh
|
@ -1,162 +0,0 @@
|
||||||
/*
|
|
||||||
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
|
||||||
**
|
|
||||||
** This library is free software; you can redistribute it and/or
|
|
||||||
** modify it under the terms of the GNU Lesser General Public License
|
|
||||||
** as published by the Free Software Foundation; either version 2.1
|
|
||||||
** of the License, or (at your option) any later version.
|
|
||||||
**
|
|
||||||
** This library is distributed in the hope that it will be useful,
|
|
||||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
** Lesser General Public License for more details.
|
|
||||||
**
|
|
||||||
** You should have received a copy of the GNU Lesser General Public
|
|
||||||
** License along with this library; if not, write to the Free
|
|
||||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
|
||||||
** 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef TREE_HH__
|
|
||||||
#define TREE_HH__
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
#include <string_view>
|
|
||||||
#include <iostream>
|
|
||||||
#include <message/mu-fields.hh>
|
|
||||||
|
|
||||||
#include <utils/mu-option.hh>
|
|
||||||
#include <utils/mu-error.hh>
|
|
||||||
|
|
||||||
namespace Mu {
|
|
||||||
|
|
||||||
struct FieldValue {
|
|
||||||
FieldValue(Field::Id idarg, const std::string valarg):
|
|
||||||
field_id{idarg}, val1{valarg} {}
|
|
||||||
FieldValue(Field::Id idarg, const std::string valarg1, const std::string valarg2):
|
|
||||||
field_id{idarg}, val1{valarg1}, val2{valarg2} {}
|
|
||||||
|
|
||||||
const Field& field() const { return field_from_id(field_id); }
|
|
||||||
const std::string& value() const { return val1; }
|
|
||||||
const std::pair<std::string, std::string> range() const { return { val1, val2 }; }
|
|
||||||
|
|
||||||
const Field::Id field_id;
|
|
||||||
const std::string val1;
|
|
||||||
const std::string val2;
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* operator<<
|
|
||||||
*
|
|
||||||
* @param os an output stream
|
|
||||||
* @param fval a field value.
|
|
||||||
*
|
|
||||||
* @return the updated output stream
|
|
||||||
*/
|
|
||||||
inline std::ostream&
|
|
||||||
operator<<(std::ostream& os, const FieldValue& fval)
|
|
||||||
{
|
|
||||||
os << ' ' << quote(std::string{fval.field().name});
|
|
||||||
|
|
||||||
if (fval.field().is_range())
|
|
||||||
os << ' ' << quote(fval.range().first)
|
|
||||||
<< ' ' << quote(fval.range().second);
|
|
||||||
else
|
|
||||||
os << ' ' << quote(fval.value());
|
|
||||||
|
|
||||||
return os;
|
|
||||||
}
|
|
||||||
|
|
||||||
// A node in the parse tree
|
|
||||||
struct Node {
|
|
||||||
enum class Type {
|
|
||||||
Empty, // only for empty trees
|
|
||||||
OpAnd,
|
|
||||||
OpOr,
|
|
||||||
OpXor,
|
|
||||||
OpAndNot,
|
|
||||||
OpNot,
|
|
||||||
Value,
|
|
||||||
ValueAtomic,
|
|
||||||
Range,
|
|
||||||
Invalid
|
|
||||||
};
|
|
||||||
|
|
||||||
Node(Type _type, FieldValue&& fval) : type{_type}, field_val{std::move(fval)} {}
|
|
||||||
Node(Type _type) : type{_type} {}
|
|
||||||
Node(Node&& rhs) = default;
|
|
||||||
|
|
||||||
Type type;
|
|
||||||
Option<FieldValue> field_val;
|
|
||||||
|
|
||||||
static constexpr std::string_view type_name(Type t) {
|
|
||||||
switch (t) {
|
|
||||||
case Type::Empty:
|
|
||||||
return "";
|
|
||||||
case Type::OpAnd:
|
|
||||||
return "and";
|
|
||||||
case Type::OpOr:
|
|
||||||
return "or";
|
|
||||||
case Type::OpXor:
|
|
||||||
return "xor";
|
|
||||||
case Type::OpAndNot:
|
|
||||||
return "andnot";
|
|
||||||
case Type::OpNot:
|
|
||||||
return "not";
|
|
||||||
case Type::Value:
|
|
||||||
return "value";
|
|
||||||
case Type::ValueAtomic:
|
|
||||||
return "value_atomic";
|
|
||||||
case Type::Range:
|
|
||||||
return "range";
|
|
||||||
case Type::Invalid:
|
|
||||||
return "<invalid>";
|
|
||||||
default:
|
|
||||||
return "<error>";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static constexpr bool is_binop(Type t) {
|
|
||||||
return t == Type::OpAnd || t == Type::OpAndNot || t == Type::OpOr ||
|
|
||||||
t == Type::OpXor;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
inline std::ostream&
|
|
||||||
operator<<(std::ostream& os, const Node& t)
|
|
||||||
{
|
|
||||||
os << Node::type_name(t.type);
|
|
||||||
if (t.field_val)
|
|
||||||
os << t.field_val.value();
|
|
||||||
|
|
||||||
return os;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Tree {
|
|
||||||
Tree(Node&& _node) : node(std::move(_node)) {}
|
|
||||||
Tree(Tree&& rhs) = default;
|
|
||||||
|
|
||||||
void add_child(Tree&& child) { children.emplace_back(std::move(child)); }
|
|
||||||
bool empty() const { return node.type == Node::Type::Empty; }
|
|
||||||
|
|
||||||
Node node;
|
|
||||||
std::vector<Tree> children;
|
|
||||||
};
|
|
||||||
|
|
||||||
inline std::ostream&
|
|
||||||
operator<<(std::ostream& os, const Tree& tree)
|
|
||||||
{
|
|
||||||
os << '(' << tree.node;
|
|
||||||
for (const auto& subtree : tree.children)
|
|
||||||
os << subtree;
|
|
||||||
os << ')';
|
|
||||||
|
|
||||||
return os;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace Mu
|
|
||||||
|
|
||||||
#endif /* TREE_HH__ */
|
|
|
@ -101,4 +101,6 @@ XapianDb::XapianDb(const std::string& db_path, Flavor flavor) :
|
||||||
|
|
||||||
if (flavor == Flavor::CreateOverwrite)
|
if (flavor == Flavor::CreateOverwrite)
|
||||||
set_timestamp(MetadataIface::created_key);
|
set_timestamp(MetadataIface::created_key);
|
||||||
|
|
||||||
|
mu_debug("created {} / {}", flavor, *this);
|
||||||
}
|
}
|
||||||
|
|
|
@ -192,6 +192,16 @@ public:
|
||||||
*/
|
*/
|
||||||
const std::string& path() const;
|
const std::string& path() const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a description of the Xapian database
|
||||||
|
*
|
||||||
|
* @return description
|
||||||
|
*/
|
||||||
|
const std::string description() const {
|
||||||
|
return db().get_description();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the number of documents (messages) in the database
|
* Get the number of documents (messages) in the database
|
||||||
*
|
*
|
||||||
|
@ -399,6 +409,27 @@ private:
|
||||||
DbType db_;
|
DbType db_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
constexpr std::string_view
|
||||||
|
format_as(XapianDb::Flavor flavor)
|
||||||
|
{
|
||||||
|
switch(flavor) {
|
||||||
|
case XapianDb::Flavor::CreateOverwrite:
|
||||||
|
return "create-overwrite";
|
||||||
|
case XapianDb::Flavor::Open:
|
||||||
|
return "open";
|
||||||
|
case XapianDb::Flavor::ReadOnly:
|
||||||
|
return "read-only";
|
||||||
|
default:
|
||||||
|
return "??";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline std::string
|
||||||
|
format_as(const XapianDb& db)
|
||||||
|
{
|
||||||
|
return mu_format("{} @ {}", db.description(), db.path());
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace Mu
|
} // namespace Mu
|
||||||
|
|
||||||
#endif /* MU_XAPIAN_DB_HH__ */
|
#endif /* MU_XAPIAN_DB_HH__ */
|
||||||
|
|
139
lib/mu-xapian.cc
139
lib/mu-xapian.cc
|
@ -1,139 +0,0 @@
|
||||||
/*
|
|
||||||
** Copyright (C) 2017-2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
|
||||||
**
|
|
||||||
** This library is free software; you can redistribute it and/or
|
|
||||||
** modify it under the terms of the GNU Lesser General Public License
|
|
||||||
** as published by the Free Software Foundation; either version 2.1
|
|
||||||
** of the License, or (at your option) any later version.
|
|
||||||
**
|
|
||||||
** This library is distributed in the hope that it will be useful,
|
|
||||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
** Lesser General Public License for more details.
|
|
||||||
**
|
|
||||||
** You should have received a copy of the GNU Lesser General Public
|
|
||||||
** License along with this library; if not, write to the Free
|
|
||||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
|
||||||
** 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <config.h>
|
|
||||||
|
|
||||||
#include <xapian.h>
|
|
||||||
#include "mu-xapian.hh"
|
|
||||||
#include <utils/mu-error.hh>
|
|
||||||
|
|
||||||
using namespace Mu;
|
|
||||||
|
|
||||||
/**
 * Turn an operator (sub)tree into a Xapian query.
 *
 * A unary OpNot must have exactly one child and is expressed as
 * "<all> AND NOT child"; the other operators are mapped to the corresponding
 * Xapian operator and applied to the queries of all children.
 *
 * @param tree a tree with an operator node at the top
 *
 * @return a Xapian query; throws std::runtime_error for an OpNot with the
 * wrong number of children, Mu::Error for a non-operator node type.
 */
static Xapian::Query
xapian_query_op(const Mu::Tree& tree)
{
	const auto& kids = tree.children;

	// OpNot x ::= <all> AND NOT x
	if (tree.node.type == Node::Type::OpNot) {
		if (kids.size() != 1)
			throw std::runtime_error("invalid # of children");

		return Xapian::Query(Xapian::Query::OP_AND_NOT,
				     Xapian::Query::MatchAll,
				     xapian_query(kids.front()));
	}

	// map our operator to Xapian's; determined before looking at the
	// children, so an invalid operator is reported first.
	Xapian::Query::op xop{};
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-enum"
	switch (tree.node.type) {
	case Node::Type::OpAnd:
		xop = Xapian::Query::OP_AND;
		break;
	case Node::Type::OpOr:
		xop = Xapian::Query::OP_OR;
		break;
	case Node::Type::OpXor:
		xop = Xapian::Query::OP_XOR;
		break;
	case Node::Type::OpAndNot:
		xop = Xapian::Query::OP_AND_NOT;
		break;
	case Node::Type::OpNot: // already handled above
	default:
		throw Mu::Error(Error::Code::Internal, "invalid op"); // bug
	}
#pragma GCC diagnostic pop

	std::vector<Xapian::Query> subqueries;
	for (const auto& kid : kids)
		subqueries.emplace_back(xapian_query(kid));

	return Xapian::Query(xop, subqueries.begin(), subqueries.end());
}
|
|
||||||
|
|
||||||
/**
 * Make a Xapian term query for a field-value.
 *
 * When maybe_wildcard is true and the value is longer than one character
 * and ends in '*', a wildcard query is made for the value without that
 * trailing '*'; otherwise, a plain term query for the value as-is.
 *
 * @param fval a field-value
 * @param maybe_wildcard whether a trailing '*' is treated as a wildcard
 *
 * @return a Xapian query
 */
static Xapian::Query
make_query(const FieldValue& fval, bool maybe_wildcard)
{
	const auto len = fval.value().length();
	const bool is_wildcard =
		maybe_wildcard && len > 1 && fval.value()[len - 1] == '*';

	if (is_wildcard)
		return Xapian::Query(
			Xapian::Query::OP_WILDCARD,
			fval.field().xapian_term(fval.value().substr(0, len - 1)));

	return Xapian::Query(fval.field().xapian_term(fval.value()));
}
|
|
||||||
|
|
||||||
/**
 * Turn a Value or ValueAtomic node into a Xapian query.
 *
 * - non-indexable fields: a single term query (with wildcard support)
 * - atomic values: a single term query, without wildcard support
 * - values without spaces: a single term query (with wildcard support)
 * - otherwise: a phrase query of the space-separated parts
 *
 * @param tree a tree with a Value or ValueAtomic node at the top
 *
 * @return a Xapian query
 */
static Xapian::Query
xapian_query_value(const Mu::Tree& tree)
{
	const auto& fval = tree.node.field_val.value();

	// only indexable fields can be used with a phrase search; for the
	// others, no extra magic is needed.
	if (!fval.field().is_indexable_term())
		return make_query(fval, true /*maybe-wildcard*/);

	const bool atomic = tree.node.type == Node::Type::ValueAtomic;
	const auto words  = split(fval.value(), " ");

	if (words.empty())
		return Xapian::Query::MatchNothing; // shouldn't happen

	if (atomic) // atomic values are never split or wildcard-expanded
		return make_query(fval, false /*maybe-wildcard*/);

	if (words.size() == 1)
		return make_query(fval, true /*maybe-wildcard*/);

	// multiple words: match them as a phrase
	std::vector<Xapian::Query> phrase;
	for (const auto& word : words) {
		FieldValue word_val{fval.field_id, word};
		phrase.emplace_back(make_query(word_val, false /*no wildcards*/));
	}

	return Xapian::Query(Xapian::Query::OP_PHRASE, phrase.begin(), phrase.end());
}
|
|
||||||
|
|
||||||
/**
 * Turn a Range node into a Xapian value-range query on the field's value
 * slot, bounded by the first and second element of the field-value's range.
 *
 * @param tree a tree with a Range node at the top
 *
 * @return a Xapian query
 */
static Xapian::Query
xapian_query_range(const Mu::Tree& tree)
{
	const auto& fval   = tree.node.field_val.value();
	const auto  slot   = fval.field().value_no();
	const auto& bounds = fval.range();

	return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
			     slot, bounds.first, bounds.second);
}
|
|
||||||
|
|
||||||
/*
 * Dispatch on the type of the top node: empty trees give an empty query;
 * operators, values and ranges are handled by their respective helpers.
 * Any other node type is a bug, and raises a Mu::Error.
 */
Xapian::Query
Mu::xapian_query(const Mu::Tree& tree)
{
	const auto ntype = tree.node.type;

	if (ntype == Node::Type::Empty)
		return Xapian::Query();

	const bool is_op = ntype == Node::Type::OpNot ||
		ntype == Node::Type::OpAnd ||
		ntype == Node::Type::OpOr ||
		ntype == Node::Type::OpXor ||
		ntype == Node::Type::OpAndNot;
	if (is_op)
		return xapian_query_op(tree);

	if (ntype == Node::Type::Value || ntype == Node::Type::ValueAtomic)
		return xapian_query_value(tree);

	if (ntype == Node::Type::Range)
		return xapian_query_range(tree);

	throw Mu::Error(Error::Code::Internal, "invalid query"); // bug
}
|
|
|
@ -1,39 +0,0 @@
|
||||||
/*
|
|
||||||
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
|
||||||
**
|
|
||||||
** This library is free software; you can redistribute it and/or
|
|
||||||
** modify it under the terms of the GNU Lesser General Public License
|
|
||||||
** as published by the Free Software Foundation; either version 2.1
|
|
||||||
** of the License, or (at your option) any later version.
|
|
||||||
**
|
|
||||||
** This library is distributed in the hope that it will be useful,
|
|
||||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
** Lesser General Public License for more details.
|
|
||||||
**
|
|
||||||
** You should have received a copy of the GNU Lesser General Public
|
|
||||||
** License along with this library; if not, write to the Free
|
|
||||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
|
||||||
** 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MU_XAPIAN_HH__
|
|
||||||
#define MU_XAPIAN_HH__
|
|
||||||
|
|
||||||
#include <xapian.h>
|
|
||||||
#include <mu-parser.hh>
|
|
||||||
|
|
||||||
namespace Mu {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Transform a parse-tree into a Xapian query object
|
|
||||||
*
|
|
||||||
* @param tree a parse tree
|
|
||||||
*
|
|
||||||
* @return a Xapian query object
|
|
||||||
*/
|
|
||||||
Xapian::Query xapian_query(const Mu::Tree& tree);
|
|
||||||
|
|
||||||
} // namespace Mu
|
|
||||||
|
|
||||||
#endif /* MU_XAPIAN_HH__ */
|
|
|
@ -19,42 +19,30 @@
|
||||||
#
|
#
|
||||||
test('test-maildir',
|
test('test-maildir',
|
||||||
executable('test-maildir',
|
executable('test-maildir',
|
||||||
'test-mu-maildir.cc',
|
'test-mu-maildir.cc',
|
||||||
install: false,
|
install: false,
|
||||||
dependencies: [glib_dep, lib_mu_dep]))
|
dependencies: [glib_dep, lib_mu_dep]))
|
||||||
test('test-msg',
|
test('test-msg',
|
||||||
executable('test-msg',
|
executable('test-msg',
|
||||||
'test-mu-msg.cc',
|
'test-mu-msg.cc',
|
||||||
install: false,
|
install: false,
|
||||||
dependencies: [glib_dep, lib_mu_dep]))
|
dependencies: [glib_dep, lib_mu_dep]))
|
||||||
test('test-store',
|
test('test-store',
|
||||||
executable('test-store',
|
executable('test-store',
|
||||||
'test-mu-store.cc',
|
'test-mu-store.cc',
|
||||||
install: false,
|
install: false,
|
||||||
dependencies: [glib_dep, lib_mu_dep]))
|
dependencies: [glib_dep, lib_mu_dep]))
|
||||||
test('test-query',
|
test('test-query',
|
||||||
executable('test-query',
|
executable('test-query',
|
||||||
'test-query.cc',
|
'test-query.cc',
|
||||||
install: false,
|
install: false,
|
||||||
dependencies: [glib_dep, gmime_dep, lib_mu_dep]))
|
dependencies: [glib_dep, gmime_dep, lib_mu_dep]))
|
||||||
|
|
||||||
test('test-tokenizer',
|
|
||||||
executable('test-tokenizer',
|
|
||||||
'test-tokenizer.cc',
|
|
||||||
install: false,
|
|
||||||
dependencies: [glib_dep, lib_mu_dep]))
|
|
||||||
|
|
||||||
test('test-parser',
|
|
||||||
executable('test-parser',
|
|
||||||
'test-parser.cc',
|
|
||||||
install: false,
|
|
||||||
dependencies: [glib_dep, gmime_dep, lib_mu_dep]))
|
|
||||||
|
|
||||||
test('test-store-query',
|
test('test-store-query',
|
||||||
executable('test-store-query',
|
executable('test-store-query',
|
||||||
'test-mu-store-query.cc',
|
'test-mu-store-query.cc',
|
||||||
install: false,
|
install: false,
|
||||||
dependencies: [glib_dep, gmime_dep, lib_mu_dep]))
|
dependencies: [glib_dep, gmime_dep, lib_mu_dep]))
|
||||||
#
|
#
|
||||||
# benchmarks
|
# benchmarks
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||||
**
|
**
|
||||||
** This program is free software; you can redistribute it and/or modify it
|
** This program is free software; you can redistribute it and/or modify it
|
||||||
** under the terms of the GNU General Public License as published by the
|
** under the terms of the GNU General Public License as published by the
|
||||||
|
@ -29,9 +29,12 @@
|
||||||
#include <mu-store.hh>
|
#include <mu-store.hh>
|
||||||
#include <mu-maildir.hh>
|
#include <mu-maildir.hh>
|
||||||
#include <utils/mu-utils.hh>
|
#include <utils/mu-utils.hh>
|
||||||
|
#include <utils/mu-utils-file.hh>
|
||||||
#include <utils/mu-test-utils.hh>
|
#include <utils/mu-test-utils.hh>
|
||||||
#include <message/mu-message.hh>
|
#include <message/mu-message.hh>
|
||||||
|
|
||||||
|
#include "mu-query-parser.hh"
|
||||||
|
|
||||||
using namespace Mu;
|
using namespace Mu;
|
||||||
|
|
||||||
|
|
||||||
|
@ -40,7 +43,7 @@ using TestMap = std::unordered_map<std::string, std::string>;
|
||||||
|
|
||||||
static Store
|
static Store
|
||||||
make_test_store(const std::string& test_path, const TestMap& test_map,
|
make_test_store(const std::string& test_path, const TestMap& test_map,
|
||||||
const StringVec &personal_addresses)
|
Option<const Config&> conf={})
|
||||||
{
|
{
|
||||||
std::string maildir = test_path + "/Maildir/";
|
std::string maildir = test_path + "/Maildir/";
|
||||||
// note the trailing '/'
|
// note the trailing '/'
|
||||||
|
@ -49,12 +52,11 @@ make_test_store(const std::string& test_path, const TestMap& test_map,
|
||||||
/* write messages to disk */
|
/* write messages to disk */
|
||||||
for (auto&& item: test_map) {
|
for (auto&& item: test_map) {
|
||||||
|
|
||||||
const auto msgpath = maildir + "/" + item.first;
|
|
||||||
|
|
||||||
/* create the directory for the message */
|
/* create the directory for the message */
|
||||||
|
const auto msgpath{join_paths(maildir, item.first)};
|
||||||
auto dir = to_string_gchar(g_path_get_dirname(msgpath.c_str()));
|
auto dir = to_string_gchar(g_path_get_dirname(msgpath.c_str()));
|
||||||
if (g_test_verbose())
|
if (g_test_verbose())
|
||||||
g_message("create message dir %s", dir.c_str());
|
mu_message("create maildir {}", dir.c_str());
|
||||||
|
|
||||||
g_assert_cmpuint(g_mkdir_with_parents(dir.c_str(), 0700), ==, 0);
|
g_assert_cmpuint(g_mkdir_with_parents(dir.c_str(), 0700), ==, 0);
|
||||||
|
|
||||||
|
@ -65,11 +67,6 @@ make_test_store(const std::string& test_path, const TestMap& test_map,
|
||||||
stream.close();
|
stream.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* make the store */
|
|
||||||
MemDb mdb;
|
|
||||||
Config conf{mdb};
|
|
||||||
conf.set<Config::Id::PersonalAddresses>(personal_addresses);
|
|
||||||
|
|
||||||
auto store = Store::make_new(test_path, maildir, conf);
|
auto store = Store::make_new(test_path, maildir, conf);
|
||||||
assert_valid_result(store);
|
assert_valid_result(store);
|
||||||
|
|
||||||
|
@ -90,7 +87,6 @@ make_test_store(const std::string& test_path, const TestMap& test_map,
|
||||||
return std::move(store.value());
|
return std::move(store.value());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
test_simple()
|
test_simple()
|
||||||
{
|
{
|
||||||
|
@ -161,7 +157,9 @@ I said: "Aujourd'hui!"
|
||||||
}) {
|
}) {
|
||||||
|
|
||||||
if (g_test_verbose())
|
if (g_test_verbose())
|
||||||
g_message("query: '%s'", expr);
|
mu_message("query: '{}'\n", expr,
|
||||||
|
make_xapian_query(store, expr)->get_description());
|
||||||
|
|
||||||
auto qr = store.run_query(expr);
|
auto qr = store.run_query(expr);
|
||||||
assert_valid_result(qr);
|
assert_valid_result(qr);
|
||||||
g_assert_false(qr->empty());
|
g_assert_false(qr->empty());
|
||||||
|
@ -644,7 +642,8 @@ test_term_split()
|
||||||
// Note the fancy quote in "foo’s bar"
|
// Note the fancy quote in "foo’s bar"
|
||||||
const TestMap test_msgs = {{
|
const TestMap test_msgs = {{
|
||||||
"inbox/new/msg",
|
"inbox/new/msg",
|
||||||
{ R"(Message-Id: <abcde@foo.bar>
|
{
|
||||||
|
R"(Message-Id: <abcde@foo.bar>
|
||||||
From: "Foo Example" <bar@example.com>
|
From: "Foo Example" <bar@example.com>
|
||||||
Date: Wed, 26 Oct 2022 11:01:54 -0700
|
Date: Wed, 26 Oct 2022 11:01:54 -0700
|
||||||
To: example@example.com
|
To: example@example.com
|
||||||
|
@ -657,17 +656,57 @@ Boo!
|
||||||
TempDir tdir;
|
TempDir tdir;
|
||||||
auto store{make_test_store(tdir.path(), test_msgs, {})};
|
auto store{make_test_store(tdir.path(), test_msgs, {})};
|
||||||
/* true: match; false: no match */
|
/* true: match; false: no match */
|
||||||
const auto cases = std::array<std::pair<const char*, bool>, 6>{{
|
const auto cases = std::array<std::pair<const char*, bool>, 7>{{
|
||||||
{"subject:foo's", true},
|
{"subject:foo's", true},
|
||||||
{"subject:foo*", true},
|
{"subject:foo*", true},
|
||||||
{"subject:/foo/", true},
|
{"subject:/foo/", true},
|
||||||
{"subject:/foo’s/", true}, /* <-- breaks before PR #2365 */
|
{"subject:/foo’s/", true}, /* <-- breaks before PR #2365 */
|
||||||
{"subject:/foo.*bar/", true}, /* <-- breaks before PR #2365 */
|
{"subject:/foo.*bar/", true}, /* <-- breaks before PR #2365 */
|
||||||
{"subject:/foo’s bar/", false}, /* <-- no matching yet */
|
{"subject:/foo’s bar/", false}, /* <-- no matching, needs quoting */
|
||||||
|
{"subject:\"/foo’s bar/\"", true}, /* <-- this works, quote the regex */
|
||||||
}};
|
}};
|
||||||
|
|
||||||
for (auto&& test: cases) {
|
for (auto&& test: cases) {
|
||||||
g_debug("query: %s", test.first);
|
mu_debug("query: '{}'", test.first);
|
||||||
|
auto qr = store.run_query(test.first);
|
||||||
|
assert_valid_result(qr);
|
||||||
|
if (test.second)
|
||||||
|
g_assert_cmpuint(qr->size(), ==, 1);
|
||||||
|
else
|
||||||
|
g_assert_true(qr->empty());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_subject_kata_containers()
|
||||||
|
{
|
||||||
|
g_test_bug("2167");
|
||||||
|
|
||||||
|
// Note the fancy quote in "foo’s bar"
|
||||||
|
const TestMap test_msgs = {{
|
||||||
|
"inbox/new/msg",
|
||||||
|
{
|
||||||
|
R"(Message-Id: <abcde@foo.bar>
|
||||||
|
From: "Foo Example" <bar@example.com>
|
||||||
|
Date: Wed, 26 Oct 2022 11:01:54 -0700
|
||||||
|
To: example@example.com
|
||||||
|
Subject: kata-containers
|
||||||
|
|
||||||
|
Boo!
|
||||||
|
)"},
|
||||||
|
}};
|
||||||
|
|
||||||
|
TempDir tdir;
|
||||||
|
auto store{make_test_store(tdir.path(), test_msgs, {})};
|
||||||
|
/* true: match; false: no match */
|
||||||
|
const auto cases = std::array<std::pair<const char*, bool>, 3>{{
|
||||||
|
{"subject:kata", true},
|
||||||
|
{"subject:containers", true},
|
||||||
|
{"subject:kata-containers", true}
|
||||||
|
}};
|
||||||
|
|
||||||
|
for (auto&& test: cases) {
|
||||||
|
mu_debug("query: '{}'", test.first);
|
||||||
auto qr = store.run_query(test.first);
|
auto qr = store.run_query(test.first);
|
||||||
assert_valid_result(qr);
|
assert_valid_result(qr);
|
||||||
if (test.second)
|
if (test.second)
|
||||||
|
@ -776,15 +815,75 @@ html
|
||||||
assert_valid_result(qr);
|
assert_valid_result(qr);
|
||||||
g_assert_cmpuint(qr->size(), ==, 1);
|
g_assert_cmpuint(qr->size(), ==, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_cjk()
|
||||||
|
{
|
||||||
|
g_test_bug("2167");
|
||||||
|
|
||||||
|
// Note the fancy quote in "foo’s bar"
|
||||||
|
const TestMap test_msgs = {{
|
||||||
|
"inbox/new/msg",
|
||||||
|
{
|
||||||
|
R"(From: "Bob" <bob@builder.com>
|
||||||
|
Subject: スポンサーシップ募集
|
||||||
|
To: "Chase" <chase@ppatrol.org>
|
||||||
|
Message-Id: 112342343e9dfo.fsf@builder.com
|
||||||
|
|
||||||
|
中文
|
||||||
|
|
||||||
|
https://trac.xapian.org/ticket/719
|
||||||
|
|
||||||
|
サーバがダウンしました
|
||||||
|
)"}}};
|
||||||
|
|
||||||
|
MemDb mdb;
|
||||||
|
Config conf{mdb};
|
||||||
|
conf.set<Config::Id::SupportNgrams>(true);
|
||||||
|
|
||||||
|
TempDir tdir;
|
||||||
|
auto store{make_test_store(tdir.path(), test_msgs, conf)};
|
||||||
|
store.commit();
|
||||||
|
|
||||||
|
/* true: match; false: no match */
|
||||||
|
const auto cases = std::vector<std::pair<std::string_view, bool>>{{
|
||||||
|
{"body:中文", true},
|
||||||
|
{"body:中", true},
|
||||||
|
{"body:文", true},
|
||||||
|
{"body:し", true},
|
||||||
|
{"body:サー", true},
|
||||||
|
{"body:サーバがダウンしました", true}, // fail
|
||||||
|
{"中文", true},
|
||||||
|
{"中", true},
|
||||||
|
{"文", true},
|
||||||
|
{"subject:スポン", true },
|
||||||
|
{"subject:スポンサーシップ募集", true },
|
||||||
|
{"subject:シップ", true }, // XXX should match
|
||||||
|
{"サーバがダウンしました", true}, // okay
|
||||||
|
{"body:サーバがダウンしました", true}, // okay
|
||||||
|
{"subject:スポンサーシップ募集", true}, // okay
|
||||||
|
{"subject:シップx", true }, // XXX should match
|
||||||
|
}};
|
||||||
|
|
||||||
|
for (auto&& test: cases) {
|
||||||
|
auto qr = store.run_query(std::string{test.first});
|
||||||
|
assert_valid_result(qr);
|
||||||
|
if (test.second)
|
||||||
|
g_assert_cmpuint(qr->size(), ==, 1);
|
||||||
|
else
|
||||||
|
g_assert_true(qr->empty());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char* argv[])
|
main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
mu_test_init(&argc, &argv);
|
mu_test_init(&argc, &argv);
|
||||||
|
|
||||||
|
//_test_add_func("/store/query/cjk", test_cjk);
|
||||||
|
|
||||||
g_test_add_func("/store/query/simple", test_simple);
|
g_test_add_func("/store/query/simple", test_simple);
|
||||||
g_test_add_func("/store/query/spam-address-components",
|
g_test_add_func("/store/query/spam-address-components",
|
||||||
test_spam_address_components);
|
test_spam_address_components);
|
||||||
|
@ -800,9 +899,15 @@ main(int argc, char* argv[])
|
||||||
test_duplicate_refresh_rename);
|
test_duplicate_refresh_rename);
|
||||||
g_test_add_func("/store/query/term-split",
|
g_test_add_func("/store/query/term-split",
|
||||||
test_term_split);
|
test_term_split);
|
||||||
|
g_test_add_func("/store/query/kata_containers",
|
||||||
|
test_subject_kata_containers);
|
||||||
g_test_add_func("/store/query/related-dup-threaded",
|
g_test_add_func("/store/query/related-dup-threaded",
|
||||||
test_related_dup_threaded);
|
test_related_dup_threaded);
|
||||||
g_test_add_func("/store/query/html", test_html);
|
g_test_add_func("/store/query/html",
|
||||||
|
test_html);
|
||||||
|
|
||||||
|
g_test_add_func("/store/query/cjk-once-more", test_cjk);
|
||||||
|
|
||||||
|
|
||||||
return g_test_run();
|
return g_test_run();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,139 +0,0 @@
|
||||||
/*
|
|
||||||
** Copyright (C) 2017-2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
|
||||||
**
|
|
||||||
** This library is free software; you can redistribute it and/or
|
|
||||||
** modify it under the terms of the GNU Lesser General Public License
|
|
||||||
** as published by the Free Software Foundation; either version 2.1
|
|
||||||
** of the License, or (at your option) any later version.
|
|
||||||
**
|
|
||||||
** This library is distributed in the hope that it will be useful,
|
|
||||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
** Lesser General Public License for more details.
|
|
||||||
**
|
|
||||||
** You should have received a copy of the GNU Lesser General Public
|
|
||||||
** License along with this library; if not, write to the Free
|
|
||||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
|
||||||
** 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <glib.h>
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include "utils/mu-test-utils.hh"
|
|
||||||
|
|
||||||
#include "mu-parser.hh"
|
|
||||||
#include "utils/mu-result.hh"
|
|
||||||
#include "utils/mu-utils.hh"
|
|
||||||
using namespace Mu;
|
|
||||||
|
|
||||||
struct Case {
|
|
||||||
const std::string expr;
|
|
||||||
const std::string expected;
|
|
||||||
WarningVec warnings{};
|
|
||||||
};
|
|
||||||
|
|
||||||
using CaseVec = std::vector<Case>;
|
|
||||||
|
|
||||||
static void
|
|
||||||
test_cases(const CaseVec& cases)
|
|
||||||
{
|
|
||||||
char* tmpdir = test_mu_common_get_random_tmpdir();
|
|
||||||
g_assert(tmpdir);
|
|
||||||
auto dummy_store{Store::make_new(tmpdir, "/tmp")};
|
|
||||||
assert_valid_result(dummy_store);
|
|
||||||
|
|
||||||
g_free(tmpdir);
|
|
||||||
|
|
||||||
Parser parser{*dummy_store, Parser::Flags::UnitTest};
|
|
||||||
|
|
||||||
for (const auto& casus : cases) {
|
|
||||||
WarningVec warnings;
|
|
||||||
const auto tree = parser.parse(casus.expr, warnings);
|
|
||||||
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << tree;
|
|
||||||
|
|
||||||
if (g_test_verbose()) {
|
|
||||||
std::cout << "\n";
|
|
||||||
std::cout << casus.expr << std::endl;
|
|
||||||
std::cout << "exp:" << casus.expected << std::endl;
|
|
||||||
std::cout << "got:" << ss.str() << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
assert_equal(casus.expected, ss.str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
test_basic()
|
|
||||||
{
|
|
||||||
CaseVec cases = {
|
|
||||||
//{ "", R"#((atom :value ""))#"},
|
|
||||||
{
|
|
||||||
"foo",
|
|
||||||
R"#((value "message-id" "foo"))#",
|
|
||||||
},
|
|
||||||
{"foo or bar", R"#((or(value "message-id" "foo")(value "message-id" "bar")))#"},
|
|
||||||
{"foo and bar", R"#((and(value "message-id" "foo")(value "message-id" "bar")))#"},
|
|
||||||
};
|
|
||||||
|
|
||||||
test_cases(cases);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
test_complex()
|
|
||||||
{
|
|
||||||
CaseVec cases = {
|
|
||||||
{"foo and bar or cuux",
|
|
||||||
R"#((or(and(value "message-id" "foo")(value "message-id" "bar")))#" +
|
|
||||||
std::string(R"#((value "message-id" "cuux")))#")},
|
|
||||||
{"a and not b", R"#((and(value "message-id" "a")(not(value "message-id" "b"))))#"},
|
|
||||||
{"a and b and c",
|
|
||||||
R"#((and(value "message-id" "a")(and(value "message-id" "b")(value "message-id" "c"))))#"},
|
|
||||||
{"(a or b) and c",
|
|
||||||
R"#((and(or(value "message-id" "a")(value "message-id" "b"))(value "message-id" "c")))#"},
|
|
||||||
{"a b", // implicit and
|
|
||||||
R"#((and(value "message-id" "a")(value "message-id" "b")))#"},
|
|
||||||
{"a not b", // implicit and not
|
|
||||||
R"#((and(value "message-id" "a")(not(value "message-id" "b"))))#"},
|
|
||||||
{"not b", // implicit and not
|
|
||||||
R"#((not(value "message-id" "b")))#"}};
|
|
||||||
|
|
||||||
test_cases(cases);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* A range expression; marked G_GNUC_UNUSED since main() does not register it. */
G_GNUC_UNUSED static void
test_range()
{
	const CaseVec cases = {
		{"range:a..b", R"#((range "range" "a" "b"))#"},
	};

	test_cases(cases);
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
test_flatten()
|
|
||||||
{
|
|
||||||
CaseVec cases = {{" Mötørhęåđ", R"#((value "message-id" "motorhead"))#"}};
|
|
||||||
|
|
||||||
test_cases(cases);
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
main(int argc, char* argv[])
|
|
||||||
{
|
|
||||||
g_test_init(&argc, &argv, NULL);
|
|
||||||
|
|
||||||
g_test_add_func("/parser/basic", test_basic);
|
|
||||||
g_test_add_func("/parser/complex", test_complex);
|
|
||||||
// g_test_add_func ("/parser/range", test_range);
|
|
||||||
g_test_add_func("/parser/flatten", test_flatten);
|
|
||||||
|
|
||||||
return g_test_run();
|
|
||||||
}
|
|
|
@ -1,147 +0,0 @@
|
||||||
/*
|
|
||||||
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
|
||||||
**
|
|
||||||
** This library is free software; you can redistribute it and/or
|
|
||||||
** modify it under the terms of the GNU Lesser General Public License
|
|
||||||
** as published by the Free Software Foundation; either version 2.1
|
|
||||||
** of the License, or (at your option) any later version.
|
|
||||||
**
|
|
||||||
** This library is distributed in the hope that it will be useful,
|
|
||||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
** Lesser General Public License for more details.
|
|
||||||
**
|
|
||||||
** You should have received a copy of the GNU Lesser General Public
|
|
||||||
** License along with this library; if not, write to the Free
|
|
||||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
|
||||||
** 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <glib.h>
|
|
||||||
#include <iostream>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include "mu-tokenizer.hh"
|
|
||||||
|
|
||||||
struct Case {
|
|
||||||
const char* str;
|
|
||||||
const Mu::Tokens tokens;
|
|
||||||
};
|
|
||||||
|
|
||||||
using CaseVec = std::vector<Case>;
|
|
||||||
|
|
||||||
using namespace Mu;
|
|
||||||
using TT = Token::Type;
|
|
||||||
|
|
||||||
static void
|
|
||||||
test_cases(const CaseVec& cases)
|
|
||||||
{
|
|
||||||
for (const auto& casus : cases) {
|
|
||||||
const auto tokens = tokenize(casus.str);
|
|
||||||
|
|
||||||
g_assert_cmpuint((guint)tokens.size(), ==, (guint)casus.tokens.size());
|
|
||||||
for (size_t u = 0; u != tokens.size(); ++u) {
|
|
||||||
if (g_test_verbose()) {
|
|
||||||
std::cerr << "case " << u << " " << casus.str << std::endl;
|
|
||||||
std::cerr << "exp: '" << casus.tokens[u] << "'" << std::endl;
|
|
||||||
std::cerr << "got: '" << tokens[u] << "'" << std::endl;
|
|
||||||
}
|
|
||||||
g_assert_true(tokens[u] == casus.tokens[u]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
test_basic()
|
|
||||||
{
|
|
||||||
CaseVec cases = {
|
|
||||||
{"", {}},
|
|
||||||
|
|
||||||
{"foo", Tokens{Token{3, TT::Data, "foo"}}},
|
|
||||||
|
|
||||||
{"foo bar cuux",
|
|
||||||
Tokens{Token{3, TT::Data, "foo"},
|
|
||||||
Token{7, TT::Data, "bar"},
|
|
||||||
Token{12, TT::Data, "cuux"}}},
|
|
||||||
|
|
||||||
{"\"foo bar\"", Tokens{Token{9, TT::Data, "foo bar"}}},
|
|
||||||
|
|
||||||
// ie. ignore missing closing '"'
|
|
||||||
{"\"foo bar", Tokens{Token{8, TT::Data, "foo bar"}}},
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
test_cases(cases);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
test_specials()
|
|
||||||
{
|
|
||||||
CaseVec cases = {
|
|
||||||
{")*(",
|
|
||||||
Tokens{Token{1, TT::Close, ")"}, Token{2, TT::Data, "*"}, Token{3, TT::Open, "("}}},
|
|
||||||
{"\")*(\"", Tokens{Token{5, TT::Data, ")*("}}},
|
|
||||||
};
|
|
||||||
|
|
||||||
test_cases(cases);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
test_ops()
|
|
||||||
{
|
|
||||||
CaseVec cases = {{"foo and bar oR cuux XoR fnorb",
|
|
||||||
Tokens{Token{3, TT::Data, "foo"},
|
|
||||||
Token{7, TT::And, "and"},
|
|
||||||
Token{11, TT::Data, "bar"},
|
|
||||||
Token{14, TT::Or, "oR"},
|
|
||||||
Token{19, TT::Data, "cuux"},
|
|
||||||
Token{23, TT::Xor, "XoR"},
|
|
||||||
Token{29, TT::Data, "fnorb"}}},
|
|
||||||
{"NOT (aap or mies)",
|
|
||||||
Tokens{Token{3, TT::Not, "NOT"},
|
|
||||||
Token{5, TT::Open, "("},
|
|
||||||
Token{8, TT::Data, "aap"},
|
|
||||||
Token{11, TT::Or, "or"},
|
|
||||||
Token{16, TT::Data, "mies"},
|
|
||||||
Token{17, TT::Close, ")"}}}};
|
|
||||||
|
|
||||||
test_cases(cases);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
test_escape()
|
|
||||||
{
|
|
||||||
CaseVec cases = {{"foo\"bar\"", Tokens{Token{8, TT::Data, "foobar"}}},
|
|
||||||
{"\"fnorb\"", Tokens{Token{7, TT::Data, "fnorb"}}},
|
|
||||||
{"\\\"fnorb\\\"", Tokens{Token{9, TT::Data, "fnorb"}}},
|
|
||||||
{"foo\\\"bar\\\"", Tokens{Token{10, TT::Data, "foobar"}}}};
|
|
||||||
|
|
||||||
test_cases(cases);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
test_to_string()
|
|
||||||
{
|
|
||||||
std::stringstream ss;
|
|
||||||
for (auto&& t : tokenize("foo and bar xor not cuux or fnorb"))
|
|
||||||
ss << t << ' ';
|
|
||||||
|
|
||||||
g_assert_true(ss.str() == "3: <data> [foo] 7: <and> [and] 11: <data> [bar] "
|
|
||||||
"15: <xor> [xor] 19: <not> [not] 24: <data> [cuux] "
|
|
||||||
"27: <or> [or] 33: <data> [fnorb] ");
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
main(int argc, char* argv[])
|
|
||||||
{
|
|
||||||
g_test_init(&argc, &argv, NULL);
|
|
||||||
|
|
||||||
g_test_add_func("/tokens/basic", test_basic);
|
|
||||||
g_test_add_func("/tokens/specials", test_specials);
|
|
||||||
g_test_add_func("/tokens/ops", test_ops);
|
|
||||||
g_test_add_func("/tokens/escape", test_escape);
|
|
||||||
g_test_add_func("/tokens/to-string", test_to_string);
|
|
||||||
|
|
||||||
return g_test_run();
|
|
||||||
}
|
|
|
@ -1,38 +0,0 @@
|
||||||
/*
|
|
||||||
** Copyright (C) 2017-2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
|
||||||
**
|
|
||||||
** This library is free software; you can redistribute it and/or
|
|
||||||
** modify it under the terms of the GNU Lesser General Public License
|
|
||||||
** as published by the Free Software Foundation; either version 2.1
|
|
||||||
** of the License, or (at your option) any later version.
|
|
||||||
**
|
|
||||||
** This library is distributed in the hope that it will be useful,
|
|
||||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
** Lesser General Public License for more details.
|
|
||||||
**
|
|
||||||
** You should have received a copy of the GNU Lesser General Public
|
|
||||||
** License along with this library; if not, write to the Free
|
|
||||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
|
||||||
** 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
#include "mu-tokenizer.hh"
|
|
||||||
|
|
||||||
int
|
|
||||||
main(int argc, char* argv[])
|
|
||||||
{
|
|
||||||
std::string s;
|
|
||||||
|
|
||||||
for (auto i = 1; i < argc; ++i)
|
|
||||||
s += " " + std::string(argv[i]);
|
|
||||||
|
|
||||||
const auto tvec = Mu::tokenize(s);
|
|
||||||
for (const auto& t : tvec)
|
|
||||||
std::cout << t << std::endl;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@ -94,9 +94,12 @@ Mu::mu_test_init(int *argc, char ***argv)
|
||||||
{
|
{
|
||||||
const auto tmpdir{test_random_tmpdir()};
|
const auto tmpdir{test_random_tmpdir()};
|
||||||
|
|
||||||
|
g_unsetenv("XAPIAN_CJK_NGRAM");
|
||||||
g_setenv("MU_TEST", "yes", TRUE);
|
g_setenv("MU_TEST", "yes", TRUE);
|
||||||
g_setenv("XDG_CACHE_HOME", tmpdir.c_str(), TRUE);
|
g_setenv("XDG_CACHE_HOME", tmpdir.c_str(), TRUE);
|
||||||
|
|
||||||
|
setlocale(LC_ALL, "");
|
||||||
|
|
||||||
g_test_init(argc, argv, NULL);
|
g_test_init(argc, argv, NULL);
|
||||||
|
|
||||||
g_test_bug_base("https://github.com/djcb/mu/issues/");
|
g_test_bug_base("https://github.com/djcb/mu/issues/");
|
||||||
|
|
|
@ -0,0 +1,127 @@
|
||||||
|
// borrowed from Xapian; slightly adapted
|
||||||
|
|
||||||
|
/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
|
||||||
|
* Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
|
||||||
|
* Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
|
||||||
|
* Copyright (c) 2011,2018,2019,2023 Olly Betts
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to
|
||||||
|
* deal in the Software without restriction, including without limitation the
|
||||||
|
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||||
|
* sell copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||||
|
* IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MU_UNBROKEN_HH__
|
||||||
|
#define MU_UNBROKEN_HH__
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iterator>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Does unichar p belong to a script without explicit word separators?
|
||||||
|
*
|
||||||
|
* @param p
|
||||||
|
*
|
||||||
|
* @return true or false
|
||||||
|
*/
|
||||||
|
constexpr bool
|
||||||
|
is_unbroken_script(unsigned p)
|
||||||
|
{
|
||||||
|
// Array containing the last value in each range of codepoints which
|
||||||
|
// are either all in scripts which are written without explicit word
|
||||||
|
// breaks, or all not in such scripts.
|
||||||
|
//
|
||||||
|
// We only include scripts here which ICU has dictionaries for. The
|
||||||
|
// same list is currently also used to decide which languages to do
|
||||||
|
// ngrams for, though perhaps that should use a separate list.
|
||||||
|
constexpr unsigned splits[] = {
|
||||||
|
// 0E00..0E7F; Thai, Lanna Tai, Pali
|
||||||
|
// 0E80..0EFF; Lao
|
||||||
|
0x0E00 - 1, 0x0EFF,
|
||||||
|
// 1000..109F; Myanmar (Burmese)
|
||||||
|
0x1000 - 1, 0x109F,
|
||||||
|
// 1100..11FF; Hangul Jamo
|
||||||
|
0x1100 - 1, 0x11FF,
|
||||||
|
// 1780..17FF; Khmer
|
||||||
|
0x1780 - 1, 0x17FF,
|
||||||
|
// 19E0..19FF; Khmer Symbols
|
||||||
|
0x19E0 - 1, 0x19FF,
|
||||||
|
// 2E80..2EFF; CJK Radicals Supplement
|
||||||
|
// 2F00..2FDF; Kangxi Radicals
|
||||||
|
// 2FE0..2FFF; Ideographic Description Characters
|
||||||
|
// 3000..303F; CJK Symbols and Punctuation
|
||||||
|
// 3040..309F; Hiragana
|
||||||
|
// 30A0..30FF; Katakana
|
||||||
|
// 3100..312F; Bopomofo
|
||||||
|
// 3130..318F; Hangul Compatibility Jamo
|
||||||
|
// 3190..319F; Kanbun
|
||||||
|
// 31A0..31BF; Bopomofo Extended
|
||||||
|
// 31C0..31EF; CJK Strokes
|
||||||
|
// 31F0..31FF; Katakana Phonetic Extensions
|
||||||
|
// 3200..32FF; Enclosed CJK Letters and Months
|
||||||
|
// 3300..33FF; CJK Compatibility
|
||||||
|
// 3400..4DBF; CJK Unified Ideographs Extension A
|
||||||
|
// 4DC0..4DFF; Yijing Hexagram Symbols
|
||||||
|
// 4E00..9FFF; CJK Unified Ideographs
|
||||||
|
0x2E80 - 1, 0x9FFF,
|
||||||
|
// A700..A71F; Modifier Tone Letters
|
||||||
|
0xA700 - 1, 0xA71F,
|
||||||
|
// A960..A97F; Hangul Jamo Extended-A
|
||||||
|
0xA960 - 1, 0xA97F,
|
||||||
|
// A9E0..A9FF; Myanmar Extended-B (Burmese)
|
||||||
|
0xA9E0 - 1, 0xA9FF,
|
||||||
|
// AA60..AA7F; Myanmar Extended-A (Burmese)
|
||||||
|
0xAA60 - 1, 0xAA7F,
|
||||||
|
// AC00..D7AF; Hangul Syllables
|
||||||
|
// D7B0..D7FF; Hangul Jamo Extended-B
|
||||||
|
0xAC00 - 1, 0xD7FF,
|
||||||
|
// F900..FAFF; CJK Compatibility Ideographs
|
||||||
|
0xF900 - 1, 0xFAFF,
|
||||||
|
// FE30..FE4F; CJK Compatibility Forms
|
||||||
|
0xFE30 - 1, 0xFE4F,
|
||||||
|
// FF00..FFEF; Halfwidth and Fullwidth Forms
|
||||||
|
0xFF00 - 1, 0xFFEF,
|
||||||
|
// 1AFF0..1AFFF; Kana Extended-B
|
||||||
|
// 1B000..1B0FF; Kana Supplement
|
||||||
|
// 1B100..1B12F; Kana Extended-A
|
||||||
|
// 1B130..1B16F; Small Kana Extension
|
||||||
|
0x1AFF0 - 1, 0x1B16F,
|
||||||
|
// 1F200..1F2FF; Enclosed Ideographic Supplement
|
||||||
|
0x1F200 - 1, 0x1F2FF,
|
||||||
|
// 20000..2A6DF; CJK Unified Ideographs Extension B
|
||||||
|
0x20000 - 1, 0x2A6DF,
|
||||||
|
// 2A700..2B73F; CJK Unified Ideographs Extension C
|
||||||
|
// 2B740..2B81F; CJK Unified Ideographs Extension D
|
||||||
|
// 2B820..2CEAF; CJK Unified Ideographs Extension E
|
||||||
|
// 2CEB0..2EBEF; CJK Unified Ideographs Extension F
|
||||||
|
0x2A700 - 1, 0x2EBEF,
|
||||||
|
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
||||||
|
0x2F800 - 1, 0x2FA1F,
|
||||||
|
// 30000..3134F; CJK Unified Ideographs Extension G
|
||||||
|
// 31350..323AF; CJK Unified Ideographs Extension H
|
||||||
|
0x30000 - 1, 0x323AF
|
||||||
|
};
|
||||||
|
// Binary chop to find the first entry which is >= p. If it's an odd
|
||||||
|
// offset then the codepoint is in a script which needs splitting; if it's
|
||||||
|
// an even offset then it's not.
|
||||||
|
auto it = std::lower_bound(std::begin(splits),
|
||||||
|
std::end(splits), p);
|
||||||
|
|
||||||
|
return ((it - splits) & 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* MU_UNBROKEN_HH__ */
|
|
@ -44,6 +44,8 @@
|
||||||
#include <glib/gprintf.h>
|
#include <glib/gprintf.h>
|
||||||
|
|
||||||
#include "mu-utils.hh"
|
#include "mu-utils.hh"
|
||||||
|
#include "mu-unbroken.hh"
|
||||||
|
|
||||||
#include "mu-error.hh"
|
#include "mu-error.hh"
|
||||||
#include "mu-option.hh"
|
#include "mu-option.hh"
|
||||||
|
|
||||||
|
@ -112,12 +114,28 @@ gx_utf8_flatten(const gchar* str, gssize len)
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
bool
|
||||||
|
Mu::contains_unbroken_script(const char *str)
|
||||||
|
{
|
||||||
|
while (str && *str) {
|
||||||
|
auto uc = g_utf8_get_char(str);
|
||||||
|
if (is_unbroken_script(uc))
|
||||||
|
return true;
|
||||||
|
str = g_utf8_next_char(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
std::string // gx_utf8_flatten
|
std::string // gx_utf8_flatten
|
||||||
Mu::utf8_flatten(const char* str)
|
Mu::utf8_flatten(const char* str)
|
||||||
{
|
{
|
||||||
if (!str)
|
if (!str)
|
||||||
return {};
|
return {};
|
||||||
|
|
||||||
|
if (contains_unbroken_script(str))
|
||||||
|
return std::string{str};
|
||||||
|
|
||||||
// the pure-ascii case
|
// the pure-ascii case
|
||||||
if (g_str_is_ascii(str)) {
|
if (g_str_is_ascii(str)) {
|
||||||
auto l = g_ascii_strdown(str, -1);
|
auto l = g_ascii_strdown(str, -1);
|
||||||
|
|
|
@ -154,7 +154,19 @@ std::tm mu_time(T t={}, bool use_utc=false) {
|
||||||
using StringVec = std::vector<std::string>;
|
using StringVec = std::vector<std::string>;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Flatten a string -- downcase and fold diacritics etc.
|
* Does the string contain script without explicit word separators?
|
||||||
|
*
|
||||||
|
* @param str a string
|
||||||
|
*
|
||||||
|
* @return true or false
|
||||||
|
*/
|
||||||
|
bool contains_unbroken_script(const char* str);
|
||||||
|
static inline bool contains_unbroken_script(const std::string& str) {
|
||||||
|
return contains_unbroken_script(str.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Flatten a string -- down-case and fold diacritics.
|
||||||
*
|
*
|
||||||
* @param str a string
|
* @param str a string
|
||||||
*
|
*
|
||||||
|
|
|
@ -45,14 +45,8 @@ test_cases(const CaseVec& cases, ProcFunc proc)
|
||||||
{
|
{
|
||||||
for (const auto& casus : cases) {
|
for (const auto& casus : cases) {
|
||||||
const auto res = proc(casus.expr, casus.is_first);
|
const auto res = proc(casus.expr, casus.is_first);
|
||||||
if (g_test_verbose()) {
|
//mu_println("'{}'\n'{}'", casus.expected, res);
|
||||||
std::cout << "\n";
|
assert_equal(casus.expected, res);
|
||||||
std::cout << casus.expr << ' ' << casus.is_first << std::endl;
|
|
||||||
std::cout << "exp: '" << casus.expected << "'" << std::endl;
|
|
||||||
std::cout << "got: '" << res << "'" << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
g_assert_true(casus.expected == res);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -161,6 +155,8 @@ test_flatten()
|
||||||
{"Менделе́ев", true, "менделеев"},
|
{"Менделе́ев", true, "менделеев"},
|
||||||
{"", false, ""},
|
{"", false, ""},
|
||||||
{"Ångström", true, "angstrom"},
|
{"Ångström", true, "angstrom"},
|
||||||
|
// don't touch combining characters in CJK etc.
|
||||||
|
{"スポンサーシップ募集",true, "スポンサーシップ募集"}
|
||||||
};
|
};
|
||||||
|
|
||||||
test_cases(cases, [](auto s, auto f) { return utf8_flatten(s); });
|
test_cases(cases, [](auto s, auto f) { return utf8_flatten(s); });
|
||||||
|
|
|
@ -119,7 +119,7 @@ entries are displayed.
|
||||||
** --summary-len=<number>
|
** --summary-len=<number>
|
||||||
If > 0, use that number of lines of the message to provide a summary.
|
If > 0, use that number of lines of the message to provide a summary.
|
||||||
|
|
||||||
** --format=<plain|links|xquery|xml|sexp>
|
** --format=<plain|links|xml|sexp>
|
||||||
|
|
||||||
output results in the specified format:
|
output results in the specified format:
|
||||||
|
|
||||||
|
@ -129,9 +129,7 @@ output results in the specified format:
|
||||||
information).
|
information).
|
||||||
- *xml* formats the search results as XML.
|
- *xml* formats the search results as XML.
|
||||||
- *sexp* formats the search results as an s-expression as used in Lisp programming
|
- *sexp* formats the search results as an s-expression as used in Lisp programming
|
||||||
environments.
|
environments.
|
||||||
- *xquery* shows the Xapian query corresponding to your search terms. This is
|
|
||||||
meant for for debugging purposes.
|
|
||||||
|
|
||||||
** --linksdir=<dir> and -c, --clearlinks
|
** --linksdir=<dir> and -c, --clearlinks
|
||||||
when using ~--format=links~, output the results as a maildir with symbolic links to
|
when using ~--format=links~, output the results as a maildir with symbolic links to
|
||||||
|
@ -215,6 +213,14 @@ not this may not really be the same message, if the message-id was copied.
|
||||||
The algorithm used for determining the threads is based on Jamie Zawinski's
|
The algorithm used for determining the threads is based on Jamie Zawinski's
|
||||||
description: http://www.jwz.org/doc/threading.html
|
description: http://www.jwz.org/doc/threading.html
|
||||||
|
|
||||||
|
** -a,--analyze
|
||||||
|
instead of executing the query, analyze it by showing the parse-tree s-expression
|
||||||
|
and a stringified version of the Xapian query. This can help users to determine
|
||||||
|
how ~mu~ interprets some query.
|
||||||
|
|
||||||
|
The output of this command may differ between versions, but should be helpful
|
||||||
|
nevertheless.
|
||||||
|
|
||||||
#+include: "muhome.inc" :minlevel 2
|
#+include: "muhome.inc" :minlevel 2
|
||||||
|
|
||||||
#+include: "common-options.inc" :minlevel 1
|
#+include: "common-options.inc" :minlevel 1
|
||||||
|
|
|
@ -17,6 +17,7 @@ has completed, you can run *mu index*
|
||||||
* INIT OPTIONS
|
* INIT OPTIONS
|
||||||
|
|
||||||
** -m, --maildir=<maildir>
|
** -m, --maildir=<maildir>
|
||||||
|
|
||||||
starts searching at =<maildir>=. By default, *mu* uses whatever the *MAILDIR*
|
starts searching at =<maildir>=. By default, *mu* uses whatever the *MAILDIR*
|
||||||
environment variable is set to; if it is not set, it tries =~/Maildir= if it
|
environment variable is set to; if it is not set, it tries =~/Maildir= if it
|
||||||
already exists.
|
already exists.
|
||||||
|
@ -54,6 +55,13 @@ number of changes after which they are committed to the database; decreasing
|
||||||
this reduces the memory requirements, but make indexing substantially slows (and
|
this reduces the memory requirements, but make indexing substantially slows (and
|
||||||
vice-versa for increasing). Usually, the default of 250000 should be fine.
|
vice-versa for increasing). Usually, the default of 250000 should be fine.
|
||||||
|
|
||||||
|
** --support-ngrams
|
||||||
|
|
||||||
|
whether to enable support for using ngrams in indexing and query parsing; this
|
||||||
|
can be useful for languages without explicit word-breaks, such as
|
||||||
|
Chinese/Japanese/Korean. See *NGRAM SUPPORT* below.
|
||||||
|
|
||||||
|
|
||||||
** --reinit
|
** --reinit
|
||||||
|
|
||||||
reinitialize the database from an earlier version; that is, create a new empty
|
reinitialize the database from an earlier version; that is, create a new empty
|
||||||
|
@ -62,8 +70,20 @@ options.
|
||||||
|
|
||||||
#+include: "muhome.inc" :minlevel 2
|
#+include: "muhome.inc" :minlevel 2
|
||||||
|
|
||||||
|
* NGRAM SUPPORT
|
||||||
|
|
||||||
|
*mu*'s underlying Xapian database supports 'ngrams', which improve searching for
|
||||||
|
languages/scripts that do not have explicit word breaks, such as Chinese,
|
||||||
|
Japanese and Korean. It is fairly intrusive, and influences both indexing and
|
||||||
|
query-parsing; it is not enabled by default, and is recommended only if you need
|
||||||
|
to search in such languages.
|
||||||
|
|
||||||
|
When enabled, *mu* uses ngrams automatically. Xapian environment
|
||||||
|
variables such as ~XAPIAN_CJK_NGRAM~ are ignored.
|
||||||
|
|
||||||
#+include: "exit-code.inc" :minlevel 1
|
#+include: "exit-code.inc" :minlevel 1
|
||||||
|
|
||||||
|
|
||||||
* EXAMPLE
|
* EXAMPLE
|
||||||
#+begin_example
|
#+begin_example
|
||||||
$ mu init --maildir=~/Maildir --my-address=alice@example.com --my-address=bob@example.com --ignored-address='/.*reply.*/'
|
$ mu init --maildir=~/Maildir --my-address=alice@example.com --my-address=bob@example.com --ignored-address='/.*reply.*/'
|
||||||
|
|
|
@ -25,8 +25,8 @@ quote any characters that would otherwise be interpreted by the shell, such as
|
||||||
* TERMS
|
* TERMS
|
||||||
|
|
||||||
The basic building blocks of a query are *terms*; these are just normal words like
|
The basic building blocks of a query are *terms*; these are just normal words like
|
||||||
'banana' or 'hello', or words prefixed with a field-name which make them apply
|
'banana' or 'hello', or words prefixed with a field-name which makes them apply
|
||||||
to just that field. See *mu find* for all the available fields.
|
to just that field. See *mu info fields* for all the available fields.
|
||||||
|
|
||||||
Some example queries:
|
Some example queries:
|
||||||
#+begin_example
|
#+begin_example
|
||||||
|
@ -60,9 +60,8 @@ mu find subject:\\"hi there\\"
|
||||||
* LOGICAL OPERATORS
|
* LOGICAL OPERATORS
|
||||||
|
|
||||||
We can combine terms with logical operators -- binary ones: *and*, *or*, *xor* and the
|
We can combine terms with logical operators -- binary ones: *and*, *or*, *xor* and the
|
||||||
unary *not*, with the conventional rules for precedence and association, and are
|
unary *not*, with the conventional rules for precedence and association. The
|
||||||
case-insensitive.
|
operators are case-insensitive.
|
||||||
|
|
||||||
|
|
||||||
You can also group things with *(* and *)*, so you can do things like:
|
You can also group things with *(* and *)*, so you can do things like:
|
||||||
#+begin_example
|
#+begin_example
|
||||||
|
@ -86,6 +85,7 @@ Note that a =pure not= - e.g. searching for *not apples* is quite a 'heavy' quer
|
||||||
The language supports matching basic PCRE regular expressions, see *pcre(3)*.
|
The language supports matching basic PCRE regular expressions, see *pcre(3)*.
|
||||||
|
|
||||||
Regular expressions are enclosed in *//*. Some examples:
|
Regular expressions are enclosed in *//*. Some examples:
|
||||||
|
|
||||||
#+begin_example
|
#+begin_example
|
||||||
subject:/h.llo/ # match hallo, hello, ...
|
subject:/h.llo/ # match hallo, hello, ...
|
||||||
subject:/
|
subject:/
|
||||||
|
@ -96,10 +96,10 @@ matches messages in the '/foo' maildir, while the latter matches all messages in
|
||||||
all maildirs that match 'foo', such as '/foo', '/bar/cuux/foo', '/fooishbar'
|
all maildirs that match 'foo', such as '/foo', '/bar/cuux/foo', '/fooishbar'
|
||||||
etc.
|
etc.
|
||||||
|
|
||||||
Wildcards are an older mechanism for matching where a term with a rightmost ***
|
Wildcards are another mechanism for matching where a term with a rightmost ***
|
||||||
(and =only= in that position) matches any term that starts with the part before
|
(and =only= in that position) matches any term that starts with the part before
|
||||||
the ***; they are supported for backward compatibility and *mu* translates them to
|
the ***; they are therefore less powerful than regular expressions, but also much
|
||||||
regular expressions internally:
|
faster:
|
||||||
#+begin_example
|
#+begin_example
|
||||||
foo*
|
foo*
|
||||||
#+end_example
|
#+end_example
|
||||||
|
@ -108,8 +108,7 @@ is equivalent to
|
||||||
/foo.*/
|
/foo.*/
|
||||||
#+end_example
|
#+end_example
|
||||||
|
|
||||||
As a note of caution, certain wild-cards and regular expression can take quite a
|
Regular expressions can be useful, but are relatively slow.
|
||||||
bit longer than 'normal' queries.
|
|
||||||
|
|
||||||
* FIELDS
|
* FIELDS
|
||||||
|
|
||||||
|
@ -143,8 +142,8 @@ full table with all details, including single-char shortcuts, try the command:
|
||||||
| to | | Message recipient |
|
| to | | Message recipient |
|
||||||
|------------+-----------+--------------------------------|
|
|------------+-----------+--------------------------------|
|
||||||
|
|
||||||
(*) The language code for the text-body if found. This works only
|
(*) The language code for the text-body if found. This works only if ~mu~ was
|
||||||
if ~mu~ was built with CLD2 support.
|
built with CLD2 support.
|
||||||
|
|
||||||
There are also the special fields *contact:*, which matches all contact-fields
|
There are also the special fields *contact:*, which matches all contact-fields
|
||||||
(=from=, =to=, =cc= and =bcc=), and *recip*, which matches all recipient-fields (=to=, =cc=
|
(=from=, =to=, =cc= and =bcc=), and *recip*, which matches all recipient-fields (=to=, =cc=
|
||||||
|
@ -167,12 +166,12 @@ separated by *..*. Either lower or upper (but not both) can be omitted to create
|
||||||
an open range.
|
an open range.
|
||||||
|
|
||||||
Dates are expressed in local time and using ISO-8601 format (YYYY-MM-DD
|
Dates are expressed in local time and using ISO-8601 format (YYYY-MM-DD
|
||||||
HH:MM:SS); you can leave out the right part, and *mu* adds the rest, depending on
|
HH:MM:SS); you can leave out the right part and *mu* adds the rest, depending on
|
||||||
whether this is the beginning or end of the range (e.g., as a lower bound,
|
whether this is the beginning or end of the range (e.g., as a lower bound,
|
||||||
'2015' would be interpreted as the start of that year; as an upper bound as the
|
'2015' would be interpreted as the start of that year; as an upper bound as the
|
||||||
end of the year).
|
end of the year).
|
||||||
|
|
||||||
You can use '/' , '.', '-' and 'T' to make dates more human readable.
|
You can use '/' , '.', '-', ':' and 'T' to make dates more human-readable.
|
||||||
|
|
||||||
Some examples:
|
Some examples:
|
||||||
#+begin_example
|
#+begin_example
|
||||||
|
@ -274,6 +273,9 @@ Note that from the command-line, such queries must be quoted:
|
||||||
mu find 'maildir:"/Sent Items"'
|
mu find 'maildir:"/Sent Items"'
|
||||||
#+end_example
|
#+end_example
|
||||||
|
|
||||||
|
Also note that you should *not* end the maildir with a ~/~, or it can be
|
||||||
|
misinterpreted as a regular expression term; see the section on regular expressions above.
|
||||||
|
|
||||||
* MORE EXAMPLES
|
* MORE EXAMPLES
|
||||||
|
|
||||||
Here are some simple examples of *mu* queries; you can make many more complicated
|
Here are some simple examples of *mu* queries; you can make many more complicated
|
||||||
|
@ -321,16 +323,25 @@ Find all messages written in Dutch or German with the word 'hallo':
|
||||||
hallo and (lang:nl or lang:de)
|
hallo and (lang:nl or lang:de)
|
||||||
#+end_example
|
#+end_example
|
||||||
|
|
||||||
|
* ANALYZING QUERIES
|
||||||
|
|
||||||
* CAVEATS
|
Despite all the documentation, in some cases it can be non-obvious how ~mu~
|
||||||
|
interprets a certain query. For that, you can ask ~mu~ to analyze the query --
|
||||||
|
that is, show how ~mu~ interprets the query.
|
||||||
|
|
||||||
With current Xapian versions, the apostroph character is considered part of a
|
This uses the ~--analyze~ option to *mu find*.
|
||||||
word. Thus, you cannot find =D'Artagnan= by searching for =Artagnan=. So, include
|
#+begin_example
|
||||||
the apostrophe in search or use a regexp search.
|
$ mu find subject:wombat AND date:3m.. size:..2000 --analyze
|
||||||
|
* query:
|
||||||
|
subject:wombat AND date:3m.. size:..2000
|
||||||
|
* parsed query:
|
||||||
|
(and (subject "wombat") (date (range "2023-05-30T06:10:09Z" "")) (size (range "" "2000")))
|
||||||
|
* Xapian query:
|
||||||
|
Query((Swombat AND VALUE_GE 4 n64759341 AND VALUE_LE 17 i7d0))
|
||||||
|
#+end_example
|
||||||
|
|
||||||
Matching on spaces has changed compared to the old query-parser; this applies
|
The ~parsed query~ is usually the most interesting one to understand what's
|
||||||
e.g. to Maildirs that have spaces in their name, such as =Sent Items=. See *MAILDIR*
|
happening.
|
||||||
above.
|
|
||||||
|
|
||||||
#+include: "prefooter.inc" :minlevel 1
|
#+include: "prefooter.inc" :minlevel 1
|
||||||
|
|
||||||
|
|
10
meson.build
10
meson.build
|
@ -149,9 +149,17 @@ gobject_dep = dependency('gobject-2.0', version: '>= 2.60')
|
||||||
gio_dep = dependency('gio-2.0', version: '>= 2.60')
|
gio_dep = dependency('gio-2.0', version: '>= 2.60')
|
||||||
gio_unix_dep = dependency('gio-unix-2.0', version: '>= 2.60')
|
gio_unix_dep = dependency('gio-unix-2.0', version: '>= 2.60')
|
||||||
gmime_dep = dependency('gmime-3.0', version: '>= 3.2')
|
gmime_dep = dependency('gmime-3.0', version: '>= 3.2')
|
||||||
xapian_dep = dependency('xapian-core', version:'>= 1.4')
|
|
||||||
thread_dep = dependency('threads')
|
thread_dep = dependency('threads')
|
||||||
|
|
||||||
|
# we need Xapian 1.4; if we have 1.4.23, we have some newer APIs.
|
||||||
|
xapian_dep = dependency('xapian-core', version:'>= 1.4.23', required:false)
|
||||||
|
if xapian_dep.found()
|
||||||
|
config_h_data.set('HAVE_XAPIAN_FLAG_NGRAMS', 1)
|
||||||
|
else
|
||||||
|
xapian_dep = dependency('xapian-core', version:'>= 1.4')
|
||||||
|
message('Found xapian ' + xapian_dep.version())
|
||||||
|
endif
|
||||||
|
|
||||||
# optionally, use Compact Language Detector2 if we can find it.
|
# optionally, use Compact Language Detector2 if we can find it.
|
||||||
cld2_dep = meson.get_compiler('cpp').find_library('cld2', required: false)
|
cld2_dep = meson.get_compiler('cpp').find_library('cld2', required: false)
|
||||||
if cld2_dep.found()
|
if cld2_dep.found()
|
||||||
|
|
|
@ -33,6 +33,7 @@
|
||||||
#include "mu-query-match-deciders.hh"
|
#include "mu-query-match-deciders.hh"
|
||||||
#include "mu-query.hh"
|
#include "mu-query.hh"
|
||||||
#include "mu-bookmarks.hh"
|
#include "mu-bookmarks.hh"
|
||||||
|
#include "mu-query-parser.hh"
|
||||||
#include "message/mu-message.hh"
|
#include "message/mu-message.hh"
|
||||||
|
|
||||||
#include "utils/mu-option.hh"
|
#include "utils/mu-option.hh"
|
||||||
|
@ -61,12 +62,30 @@ using OutputFunc = std::function<Result<void>(const Option<Message>& msg, const
|
||||||
using Format = Options::Find::Format;
|
using Format = Options::Find::Format;
|
||||||
|
|
||||||
static Result<void>
|
static Result<void>
|
||||||
print_internal(const Store& store,
|
analyze_query_expr(const Store& store, const std::string& expr, const Options& opts)
|
||||||
const std::string& expr,
|
|
||||||
bool xapian,
|
|
||||||
bool warn)
|
|
||||||
{
|
{
|
||||||
mu_println("{}", store.parse_query(expr, xapian));
|
auto print_item=[&](auto&&title, auto&&val) {
|
||||||
|
const auto blue{opts.nocolor ? "" : MU_COLOR_BLUE};
|
||||||
|
const auto green{opts.nocolor ? "" : MU_COLOR_GREEN};
|
||||||
|
const auto reset{opts.nocolor ? "" : MU_COLOR_DEFAULT};
|
||||||
|
mu_println("* {}{}{}:\n {}{}{}", blue, title, reset, green, val, reset);
|
||||||
|
};
|
||||||
|
|
||||||
|
print_item("query", expr);
|
||||||
|
|
||||||
|
const auto pq{parse_query(expr, false/*don't expand*/).to_string()};
|
||||||
|
const auto pqx{parse_query(expr, true/*do expand*/).to_string()};
|
||||||
|
|
||||||
|
print_item("parsed query", pq);
|
||||||
|
if (pq != pqx)
|
||||||
|
print_item("parsed query (expanded)", pqx);
|
||||||
|
|
||||||
|
auto xq{make_xapian_query(store, expr)};
|
||||||
|
if (!xq)
|
||||||
|
return Err(std::move(xq.error()));
|
||||||
|
|
||||||
|
print_item("Xapian query", xq->get_description());
|
||||||
|
|
||||||
return Ok();
|
return Ok();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -473,7 +492,7 @@ output_query_results(const QueryResults& qres, const Options& opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
static Result<void>
|
static Result<void>
|
||||||
process_query(const Store& store, const std::string& expr, const Options& opts)
|
process_store_query(const Store& store, const std::string& expr, const Options& opts)
|
||||||
{
|
{
|
||||||
auto qres{run_query(store, expr, opts)};
|
auto qres{run_query(store, expr, opts)};
|
||||||
if (!qres)
|
if (!qres)
|
||||||
|
@ -492,18 +511,14 @@ Mu::mu_cmd_find(const Store& store, const Options& opts)
|
||||||
if (!expr)
|
if (!expr)
|
||||||
return Err(expr.error());
|
return Err(expr.error());
|
||||||
|
|
||||||
if (opts.find.format == Format::XQuery)
|
if (opts.find.analyze)
|
||||||
return print_internal(store, *expr, true, false);
|
return analyze_query_expr(store, *expr, opts);
|
||||||
else if (opts.find.format == Format::MQuery)
|
|
||||||
return print_internal(store, *expr, false, opts.verbose);
|
|
||||||
else
|
else
|
||||||
return process_query(store, *expr, opts);
|
return process_store_query(store, *expr, opts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef BUILD_TESTS
|
#ifdef BUILD_TESTS
|
||||||
/*
|
/*
|
||||||
* Tests.
|
* Tests.
|
||||||
|
|
|
@ -202,6 +202,8 @@ topic_store(const Mu::Store& store, const Options& opts)
|
||||||
info.add_row({"ignored-address", c});
|
info.add_row({"ignored-address", c});
|
||||||
|
|
||||||
info.add_row({"messages in store", mu_format("{}", store.size())});
|
info.add_row({"messages in store", mu_format("{}", store.size())});
|
||||||
|
info.add_row({"support-ngrams", conf.get<Config::Id::SupportNgrams>() ? "yes" : "no"});
|
||||||
|
|
||||||
info.add_row({"last-change", tstamp(store.statistics().last_change)});
|
info.add_row({"last-change", tstamp(store.statistics().last_change)});
|
||||||
info.add_row({"last-index", tstamp(store.statistics().last_index)});
|
info.add_row({"last-index", tstamp(store.statistics().last_index)});
|
||||||
|
|
||||||
|
|
|
@ -55,6 +55,8 @@ Mu::mu_cmd_init(const Options& opts)
|
||||||
conf.set<Config::Id::PersonalAddresses>(opts.init.my_addresses);
|
conf.set<Config::Id::PersonalAddresses>(opts.init.my_addresses);
|
||||||
if (!opts.init.ignored_addresses.empty())
|
if (!opts.init.ignored_addresses.empty())
|
||||||
conf.set<Config::Id::IgnoredAddresses>(opts.init.ignored_addresses);
|
conf.set<Config::Id::IgnoredAddresses>(opts.init.ignored_addresses);
|
||||||
|
if (opts.init.support_ngrams)
|
||||||
|
conf.set<Config::Id::SupportNgrams>(true);
|
||||||
|
|
||||||
return Store::make_new(opts.runtime_path(RuntimePath::XapianDb),
|
return Store::make_new(opts.runtime_path(RuntimePath::XapianDb),
|
||||||
opts.init.maildir, conf);
|
opts.init.maildir, conf);
|
||||||
|
|
|
@ -337,12 +337,6 @@ sub_find(CLI::App& sub, Options& opts)
|
||||||
{ Format::Json,
|
{ Format::Json,
|
||||||
{"json", "JSON"}
|
{"json", "JSON"}
|
||||||
},
|
},
|
||||||
{ Format::XQuery,
|
|
||||||
{"xquery", "Show Xapian query (for debugging)"}
|
|
||||||
},
|
|
||||||
{ Format::MQuery,
|
|
||||||
{"mquery", "Show mu query for (for debugging)"}
|
|
||||||
},
|
|
||||||
}};
|
}};
|
||||||
|
|
||||||
sub.add_flag("--threads,-t", opts.find.threads,
|
sub.add_flag("--threads,-t", opts.find.threads,
|
||||||
|
@ -351,6 +345,8 @@ sub_find(CLI::App& sub, Options& opts)
|
||||||
"Show only one of messages with same message-id");
|
"Show only one of messages with same message-id");
|
||||||
sub.add_flag("--include-related,-r", opts.find.include_related,
|
sub.add_flag("--include-related,-r", opts.find.include_related,
|
||||||
"Include related messages in results");
|
"Include related messages in results");
|
||||||
|
sub.add_flag("--analyze,-a", opts.find.analyze,
|
||||||
|
"Analyze the query");
|
||||||
|
|
||||||
const auto fhelp = options_help(FormatInfos, Format::Plain);
|
const auto fhelp = options_help(FormatInfos, Format::Plain);
|
||||||
const auto fmap = options_map(FormatInfos);
|
const auto fmap = options_map(FormatInfos);
|
||||||
|
@ -461,13 +457,16 @@ sub_init(CLI::App& sub, Options& opts)
|
||||||
"Maximum allowed message size in bytes");
|
"Maximum allowed message size in bytes");
|
||||||
sub.add_option("--batch-size", opts.init.batch_size,
|
sub.add_option("--batch-size", opts.init.batch_size,
|
||||||
"Maximum size of database transaction");
|
"Maximum size of database transaction");
|
||||||
|
sub.add_option("--support-ngrams", opts.init.support_ngrams,
|
||||||
|
"Support CJK n-grams if for querying/indexing");
|
||||||
sub.add_flag("--reinit", opts.init.reinit,
|
sub.add_flag("--reinit", opts.init.reinit,
|
||||||
"Re-initialize database with current settings")
|
"Re-initialize database with current settings")
|
||||||
->excludes("--maildir")
|
->excludes("--maildir")
|
||||||
->excludes("--my-address")
|
->excludes("--my-address")
|
||||||
->excludes("--ignored-address")
|
->excludes("--ignored-address")
|
||||||
->excludes("--max-message-size")
|
->excludes("--max-message-size")
|
||||||
->excludes("--batch-size");
|
->excludes("--batch-size")
|
||||||
|
->excludes("--support-ngrams");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
|
@ -143,11 +143,12 @@ struct Options {
|
||||||
bool reverse; /**< sort in revers order (z->a) */
|
bool reverse; /**< sort in revers order (z->a) */
|
||||||
bool threads; /**< show message threads */
|
bool threads; /**< show message threads */
|
||||||
bool clearlinks; /**< clear linksdir first */
|
bool clearlinks; /**< clear linksdir first */
|
||||||
std::string linksdir; /**< directory for links */
|
std::string linksdir; /**< directory for links */
|
||||||
OptSize summary_len; /**< max # of lines for summary */
|
OptSize summary_len; /**< max # of lines for summary */
|
||||||
std::string bookmark; /**< use bookmark */
|
std::string bookmark; /**< use bookmark */
|
||||||
|
bool analyze; /**< analyze query */
|
||||||
|
|
||||||
enum struct Format { Plain, Links, Xml, Json, Sexp, XQuery, MQuery, Exec };
|
enum struct Format { Plain, Links, Xml, Json, Sexp, Exec };
|
||||||
Format format; /**< Output format */
|
Format format; /**< Output format */
|
||||||
std::string exec; /**< cmd to execute on matches */
|
std::string exec; /**< cmd to execute on matches */
|
||||||
bool skip_dups; /**< show only first with msg id */
|
bool skip_dups; /**< show only first with msg id */
|
||||||
|
@ -184,13 +185,15 @@ struct Options {
|
||||||
* Init
|
* Init
|
||||||
*/
|
*/
|
||||||
struct Init {
|
struct Init {
|
||||||
std::string maildir; /**< where the mails are */
|
std::string maildir; /**< where the mails are */
|
||||||
StringVec my_addresses; /**< personal e-mail addresses */
|
StringVec my_addresses; /**< personal e-mail addresses */
|
||||||
StringVec ignored_addresses; /**< addresses to be ignored for
|
StringVec ignored_addresses; /**< addresses to be ignored for
|
||||||
* the contacts-cache */
|
* the contacts-cache */
|
||||||
OptSize max_msg_size; /**< max size for message files */
|
OptSize max_msg_size; /**< max size for message files */
|
||||||
OptSize batch_size; /**< db transaction batch size */
|
OptSize batch_size; /**< db transaction batch size */
|
||||||
bool reinit; /**< re-initialize */
|
bool reinit; /**< re-initialize */
|
||||||
|
bool support_ngrams; /**< support CJK etc. ngrams */
|
||||||
|
|
||||||
} init;
|
} init;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
8
mu/mu.cc
8
mu/mu.cc
|
@ -90,6 +90,14 @@ handle_result(const Result<void>& res, const Mu::Options& opts)
|
||||||
int
|
int
|
||||||
main(int argc, char* argv[])
|
main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* We handle this through explicit options
|
||||||
|
*/
|
||||||
|
g_unsetenv("XAPIAN_CJK_NGRAM");
|
||||||
|
|
||||||
|
/*
|
||||||
|
* set up locale
|
||||||
|
*/
|
||||||
setlocale(LC_ALL, "");
|
setlocale(LC_ALL, "");
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -578,48 +578,11 @@ test_mu_query_threads_compilation_error(void)
|
||||||
3);
|
3);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* https://github.com/djcb/mu/issues/1428 */
|
|
||||||
static void
|
|
||||||
test_mu_query_cjk(void)
|
|
||||||
{
|
|
||||||
/* XXX: this doesn't pass yet; return for now */
|
|
||||||
g_test_skip("skip CJK tests");
|
|
||||||
return;
|
|
||||||
|
|
||||||
{
|
|
||||||
g_unsetenv("XAPIAN_CJK_NGRAM");
|
|
||||||
const auto xpath = make_database(MU_TESTMAILDIR_CJK);
|
|
||||||
g_assert_cmpuint(run_and_count_matches(xpath,
|
|
||||||
"サーバがダウンしました",
|
|
||||||
QueryFlags::None),
|
|
||||||
==, 1);
|
|
||||||
g_assert_cmpuint(run_and_count_matches(xpath,
|
|
||||||
"サーバ",
|
|
||||||
QueryFlags::None),
|
|
||||||
==, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
g_setenv("XAPIAN_CJK_NGRAM", "1", TRUE);
|
|
||||||
const auto xpath = make_database(MU_TESTMAILDIR_CJK);
|
|
||||||
g_assert_cmpuint(run_and_count_matches(xpath,
|
|
||||||
"サーバがダウンしました",
|
|
||||||
QueryFlags::None),
|
|
||||||
==, 0);
|
|
||||||
g_assert_cmpuint(run_and_count_matches(xpath,
|
|
||||||
"サーバ",
|
|
||||||
QueryFlags::None),
|
|
||||||
==, 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char* argv[])
|
main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
int rv;
|
int rv;
|
||||||
|
|
||||||
setlocale(LC_ALL, "");
|
|
||||||
|
|
||||||
mu_test_init(&argc, &argv);
|
mu_test_init(&argc, &argv);
|
||||||
DB_PATH1 = make_database(MU_TESTMAILDIR);
|
DB_PATH1 = make_database(MU_TESTMAILDIR);
|
||||||
g_assert_false(DB_PATH1.empty());
|
g_assert_false(DB_PATH1.empty());
|
||||||
|
@ -661,8 +624,6 @@ main(int argc, char* argv[])
|
||||||
g_test_add_func("/mu-query/test-mu-query-threads-compilation-error",
|
g_test_add_func("/mu-query/test-mu-query-threads-compilation-error",
|
||||||
test_mu_query_threads_compilation_error);
|
test_mu_query_threads_compilation_error);
|
||||||
|
|
||||||
g_test_add_func("/mu-query/test-mu-query-cjk",
|
|
||||||
test_mu_query_cjk);
|
|
||||||
rv = g_test_run();
|
rv = g_test_run();
|
||||||
|
|
||||||
return rv;
|
return rv;
|
||||||
|
|
Loading…
Reference in New Issue