message:document/fields: update and tie down

Update many of the field flags; remove obsolete ones.

Ensure they are handled correctly in mu-document
This commit is contained in:
Dirk-Jan C. Binnema 2022-04-28 22:53:31 +03:00
parent b7a30c0a36
commit 9a8741f0dd
5 changed files with 157 additions and 150 deletions

View File

@ -36,23 +36,24 @@ constexpr char SepaChar1 = 0xfe;
constexpr char SepaChar2 = 0xff;
static void
add_index_term(Xapian::Document& doc, const Field& field, const std::string& val)
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val)
{
std::string flatval{utf8_flatten(val)};
Xapian::TermGenerator termgen;
termgen.set_document(doc);
termgen.index_text(flatval);
}
static void
maybe_add_term(Xapian::Document& doc, const Field& field, const std::string& val)
{
if (field.is_normal_term())
doc.add_term(field.xapian_term());
else if (field.is_indexable_term()) {
add_index_term(doc, field, val);
} else if (field.is_boolean_term())
if (field.is_normal_term()) {
doc.add_term(field.xapian_term(val));
} else if (field.is_boolean_term()) {
doc.add_boolean_term(field.xapian_term(val));
} else if (field.is_indexable_term()) {
Xapian::TermGenerator termgen;
termgen.set_document(doc);
termgen.index_text(utf8_flatten(val),1,field.xapian_term());
} else
throw std::logic_error("not a search term");
if (field.id == Field::Id::Tags)
for (auto tag = doc.termlist_begin(); tag != doc.termlist_end(); ++tag)
if ((*tag)[0] == 'X')
g_message("%u: %s", doc.get_docid(), (*tag).c_str());
}
void
@ -63,7 +64,8 @@ Document::add(Field::Id id, const std::string& val)
if (field.is_value())
xdoc_.add_value(field.value_no(), val);
maybe_add_term(xdoc_, field, val);
if (field.is_searchable())
add_search_term(xdoc_, field, val);
}
void
@ -76,8 +78,10 @@ Document::add(Field::Id id, const std::vector<std::string>& vals)
if (field.is_value())
xdoc_.add_value(field.value_no(), Mu::join(vals, SepaChar1));
std::for_each(vals.begin(), vals.end(),
[&](const auto& val) { maybe_add_term(xdoc_, field, val); });
if (field.is_searchable())
std::for_each(vals.begin(), vals.end(),
[&](const auto& val) {
add_search_term(xdoc_, field, val); });
}
@ -98,13 +102,19 @@ Document::add(Field::Id id, const Contacts& contacts)
const std::string sepa2(1, SepaChar2);
Xapian::TermGenerator termgen;
termgen.set_document(xdoc_);
for (auto&& contact: contacts) {
if (!contact.field_id || *contact.field_id != id)
continue;
xdoc_.add_term(contact.email);
xdoc_.add_term(field.xapian_term(contact.email));
if (!contact.name.empty())
add_index_term(xdoc_, field, contact.name);
termgen.index_text(utf8_flatten(contact.name), 1,
field.xapian_term());
cvec.emplace_back(contact.email + sepa2 + contact.name);
}
@ -134,26 +144,6 @@ Document::contacts_value(Field::Id id) const noexcept
return contacts;
}
/**
 * Encode an integer as a string: a one-character length marker followed
 * by the value in lowercase hexadecimal.
 *
 * The marker is 'f' plus the number of hex digits, i.e. 'g' for a single
 * digit up to 'v' for sixteen digits.
 *
 * @param val the integer to encode; a negative value is encoded through
 * its 64-bit unsigned bit pattern (sixteen hex digits)
 *
 * @return the encoded string
 */
static std::string
integer_to_string(int64_t val)
{
	char buf[18]; /* 1 marker + at most 16 hex digits + terminating NUL */

	/* PRIx64 expects an unsigned 64-bit argument; convert explicitly
	 * instead of passing a signed value through the varargs. */
	const int ndigits = ::snprintf(buf + 1, sizeof(buf) - 1, "%" PRIx64,
				       static_cast<uint64_t>(val));
	buf[0] = static_cast<char>('f' + ndigits);

	return buf;
}
/**
 * Decode a string of the form <marker-char><hex-digits> (as produced by
 * integer_to_string) back into an integer.
 *
 * @param str the encoded string; its first character (the length marker)
 * is skipped, the remainder is parsed as hexadecimal
 *
 * @return the decoded value; 0 if str is empty or cannot be parsed
 */
static int64_t
string_to_integer(const std::string& str)
{
	if (str.empty())
		return 0;

	const char* const first = str.c_str() + 1; /* skip the marker */
	const char* const last  = str.c_str() + str.size();

	int64_t val{};
	if (std::from_chars(first, last, val, 16).ec ==
	    std::errc::result_out_of_range) {
		/* Negative numbers are encoded as their unsigned 64-bit
		 * pattern (e.g. -1 => ffffffffffffffff), which does not fit
		 * a signed parse; retry unsigned and convert back. */
		uint64_t bits{};
		if (std::from_chars(first, last, bits, 16).ec == std::errc{})
			val = static_cast<int64_t>(bits);
	}

	return val;
}
void
Document::add(Field::Id id, int64_t val)
{
@ -167,15 +157,16 @@ Document::add(Field::Id id, int64_t val)
const auto field{field_from_id(id)};
if (field.is_value())
xdoc_.add_value(field.value_no(), integer_to_string(val));
/* terms are not supported for numerical fields */
xdoc_.add_value(field.value_no(), to_lexnum(val));
}
int64_t
Document::integer_value(Field::Id field_id) const noexcept
{
return string_to_integer(string_value(field_id));
if (auto&& v{string_value(field_id)}; v.empty())
return 0;
else
return from_lexnum(v);
}
void
@ -200,10 +191,11 @@ Document::add(Flags flags)
{
constexpr auto field{field_from_id(Field::Id::Flags)};
xdoc_.add_value(field.value_no(), integer_to_string(static_cast<int64_t>(flags)));
xdoc_.add_value(field.value_no(), to_lexnum(static_cast<int64_t>(flags)));
flag_infos_for_each([&](auto&& flag_info) {
if (any_of(flag_info.flag & flags))
xdoc_.add_boolean_term(field.xapian_term(flag_info.shortcut_lower()));
xdoc_.add_boolean_term(field.xapian_term(
flag_info.shortcut_lower()));
});
}
@ -263,10 +255,13 @@ test_bcc()
Contact{"ringo@example.com", "Ringo", Field::Id::Bcc},
}};
doc.add(Field::Id::Bcc, contacts);
auto db = Xapian::InMemory::open();
TempDir tempdir;
auto db = Xapian::WritableDatabase(tempdir.path());
db.add_document(doc.xapian_document());
auto contacts2 = doc.contacts_value(Field::Id::Bcc);
assert_same_contacts(contacts, contacts2);
}
}

View File

@ -199,7 +199,7 @@ public:
*
* @param field_id id of the contacts field to get
*
* @return an integer or 0 if not found.
* @return a contacts list
*/
Contacts contacts_value(Field::Id id) const noexcept;

View File

@ -22,24 +22,32 @@
using namespace Mu;
// Xapian does not like terms much longer than this
constexpr auto MaxTermLength = 240;
std::string
Field::xapian_term(const std::string& s) const
{
return std::string(1U, xapian_prefix()) + s;
}
const auto start{std::string(1U, xapian_prefix())};
if (const auto& size = s.size(); size == 0)
return start;
std::string
Field::xapian_term(std::string_view sv) const
{
return std::string(1U, xapian_prefix()) + std::string{sv};
}
std::string
Field::xapian_term(char c) const
{
return std::string(1U, xapian_prefix()) + c;
}
std::string res{start};
res.reserve(s.size() + 10);
/* slightly optimized common pure-ascii. */
if (G_LIKELY(g_str_is_ascii(s.c_str()))) {
res += s;
for (auto i = 1; res[i]; ++i)
res[i] = g_ascii_tolower(res[i]);
} else
res += utf8_flatten(s);
if (G_UNLIKELY(res.size() > MaxTermLength))
res.erase(MaxTermLength);
return res;
}
/**
* compile-time checks
@ -58,13 +66,26 @@ validate_field_ids()
constexpr bool
validate_field_shortcuts()
{
#ifdef BUILD_TESTS
std::array<size_t, 26> no_dups = {0};
#endif /*BUILD_TESTS*/
for (auto id = 0U; id != Field::id_size(); ++id) {
const auto field_id = static_cast<Field::Id>(id);
const auto shortcut = field_from_id(field_id).shortcut;
if (shortcut != 0 &&
(shortcut < 'a' || shortcut > 'z'))
return false;
#ifdef BUILD_TESTS
if (shortcut != 0) {
if (++no_dups[static_cast<size_t>(shortcut-'a')] > 1) {
g_critical("shortcut '%c' is duplicated",
shortcut);
return false;
}
}
#endif
}
return true;
}
@ -94,8 +115,6 @@ validate_field_flags()
return true;
}
/*
* tests... also build as runtime-tests, so we can get coverage info
*/
@ -135,6 +154,20 @@ test_field_flags()
#ifdef BUILD_TESTS
/*
 * Unit test for field_from_name(): lookups that should succeed, lookups
 * that should yield no field, and the fallback for a failed lookup.
 */
static void
test_field_from_name()
{
/* both "s" and "subject" must resolve to the Subject field */
g_assert_true(field_from_name("s")->id == Field::Id::Subject);
g_assert_true(field_from_name("subject")->id == Field::Id::Subject);
/* "8" and the empty string must not resolve to any field */
g_assert_false(!!field_from_name("8"));
g_assert_false(!!field_from_name(""));
/* when the lookup fails, value_or() must hand back the given default */
g_assert_true(field_from_name("").value_or(field_from_id(Field::Id::Bcc)).id ==
Field::Id::Bcc);
}
static void
test_xapian_term()
{
@ -146,6 +179,10 @@ test_xapian_term()
assert_equal(field_from_id(Field::Id::From).xapian_term('x'), "Fx");
assert_equal(field_from_id(Field::Id::To).xapian_term("boo"sv), "Tboo");
auto s1 = field_from_id(Field::Id::Subject).xapian_term(std::string(MaxTermLength - 1, 'x'));
auto s2 = field_from_id(Field::Id::Subject).xapian_term(std::string(MaxTermLength, 'x'));
g_assert_cmpuint(s1.length(), ==, s2.length());
}
int
@ -155,6 +192,7 @@ main(int argc, char* argv[])
g_test_add_func("/message/fields/ids", test_ids);
g_test_add_func("/message/fields/shortcuts", test_shortcuts);
g_test_add_func("/message/fields/from-name", test_field_from_name);
g_test_add_func("/message/fields/prefix", test_prefix);
g_test_add_func("/message/fields/xapian-term", test_xapian_term);
g_test_add_func("/message/fields/flags", test_field_flags);

View File

@ -55,7 +55,6 @@ struct Field {
Path, /**< File-system Path */
Subject, /**< Message subject */
To, /**< To: recipient */
Uid, /**< Unique id for message (based on path) */
/*
* string list items...
*/
@ -98,11 +97,12 @@ struct Field {
*
*/
enum struct Type {
String, /**< String */
StringList, /**< List of strings */
ByteSize, /**< Size in bytes */
TimeT, /**< A time_t value */
Integer, /**< An integer */
String, /**< String */
StringList, /**< List of strings */
ContactList, /**< List of contacts */
ByteSize, /**< Size in bytes */
TimeT, /**< A time_t value */
Integer, /**< An integer */
};
constexpr bool is_string() const { return type == Type::String; }
@ -126,33 +126,29 @@ struct Field {
*/
enum struct Flag {
GMime = 1 << 0,
/**< Field retrieved through gmime */
/*
* Different kind of terms; at most one is true,
* and cannot be combined with IsContact. Compile-time enforced.
*/
NormalTerm = 1 << 2,
NormalTerm = 1 << 0,
/**< Field is a searchable term */
BooleanTerm = 1 << 5,
/**< Field is a boolean search-term; wildcards do not work */
IndexableTerm = 1 << 1,
BooleanTerm = 1 << 1,
/**< Field is a boolean search-term (i.e. at most one per message);
* wildcards do not work */
IndexableTerm = 1 << 2,
/**< Field has indexable text as term */
/*
* Contact flag cannot be combined with any of the term flags.
* This is compile-time enforced.
*/
Contact = 1 << 4,
Contact = 1 << 10,
/**< field contains one or more e-mail-addresses */
Value = 1 << 3,
Value = 1 << 11,
/**< Field value is stored (so the literal value can be retrieved) */
DoNotCache = 1 << 6,
DoNotCache = 1 << 20,
/**< don't cache this field in the MuMsg cache */
Range = 1 << 7
Range = 1 << 21
/**< whether this is a range field (e.g., date, size)*/
};
@ -160,12 +156,9 @@ struct Field {
return (static_cast<int>(some_flag) & static_cast<int>(flags)) != 0;
}
constexpr bool is_gmime() const { return any_of(Flag::GMime); }
constexpr bool is_indexable_term() const { return any_of(Flag::IndexableTerm); }
constexpr bool is_boolean_term() const { return any_of(Flag::BooleanTerm); }
constexpr bool is_normal_term() const { return any_of(Flag::NormalTerm); }
constexpr bool is_searchable() const { return is_indexable_term() ||
is_boolean_term() ||
is_normal_term(); }
@ -174,9 +167,6 @@ struct Field {
constexpr bool is_contact() const { return any_of(Flag::Contact); }
constexpr bool is_range() const { return any_of(Flag::Range); }
constexpr bool do_not_cache() const { return any_of(Flag::DoNotCache); }
/**
* Field members
@ -200,9 +190,21 @@ struct Field {
return shortcut == 0 ? 0 : shortcut - ('a' - 'A');
}
/**
* Get the xapian term; truncated to MaxTermLength and
* utf8-flattened.
*
* @param s the value to append to the field's xapian prefix (may be empty)
*
* @return the xapian term
*/
std::string xapian_term(const std::string& s="") const;
std::string xapian_term(std::string_view sv) const;
std::string xapian_term(char c) const;
std::string xapian_term(std::string_view sv) const {
return xapian_term(std::string{sv});
}
std::string xapian_term(char c) const {
return xapian_term(std::string(1, c));
}
};
MU_ENABLE_BITOPS(Field::Flag);
@ -216,12 +218,11 @@ static constexpr std::array<Field, Field::id_size()>
// Bcc
{
Field::Id::Bcc,
Field::Type::String,
Field::Type::ContactList,
"bcc",
"Blind carbon-copy recipient",
"bcc:foo@example.com",
'h',
Field::Flag::GMime |
Field::Flag::Contact |
Field::Flag::Value
},
@ -233,8 +234,7 @@ static constexpr std::array<Field, Field::id_size()>
"Message html body",
{},
{},
Field::Flag::GMime |
Field::Flag::DoNotCache
{}
},
// Body
{
@ -244,22 +244,19 @@ static constexpr std::array<Field, Field::id_size()>
"Message plain-text body",
"body:capybara", // example
'b',
Field::Flag::GMime |
Field::Flag::IndexableTerm |
Field::Flag::DoNotCache
Field::Flag::IndexableTerm,
},
// Cc
{
Field::Id::Cc,
Field::Type::String,
Field::Type::ContactList,
"cc",
"Carbon-copy recipient",
"cc:quinn@example.com",
'c',
Field::Flag::GMime |
Field::Flag::Contact |
Field::Flag::Value},
Field::Flag::Value
},
// Embed
{
Field::Id::EmbeddedText,
@ -268,9 +265,8 @@ static constexpr std::array<Field, Field::id_size()>
"Embedded text",
"embed:war OR embed:peace",
'e',
Field::Flag::GMime |
Field::Flag::IndexableTerm |
Field::Flag::DoNotCache},
Field::Flag::IndexableTerm
},
// File
{
Field::Id::File,
@ -279,21 +275,19 @@ static constexpr std::array<Field, Field::id_size()>
"Attachment file name",
"file:/image\\.*.jpg/",
'j',
Field::Flag::GMime |
Field::Flag::NormalTerm |
Field::Flag::DoNotCache},
Field::Flag::NormalTerm
},
// From
{
Field::Id::From,
Field::Type::String,
Field::Type::ContactList,
"from",
"Message sender",
"from:jimbo",
'f',
Field::Flag::GMime |
Field::Flag::Contact |
Field::Flag::Value},
Field::Flag::Value
},
// Maildir
{
Field::Id::Maildir,
@ -302,9 +296,9 @@ static constexpr std::array<Field, Field::id_size()>
"Maildir path for message",
"maildir:/private/archive",
'm',
Field::Flag::GMime |
Field::Flag::NormalTerm |
Field::Flag::Value},
Field::Flag::BooleanTerm |
Field::Flag::Value
},
// MIME
{
Field::Id::Mime,
@ -313,7 +307,8 @@ static constexpr std::array<Field, Field::id_size()>
"Attachment MIME-type",
"mime:image/jpeg",
'y',
Field::Flag::NormalTerm},
Field::Flag::NormalTerm
},
// Message-ID
{
Field::Id::MessageId,
@ -322,9 +317,9 @@ static constexpr std::array<Field, Field::id_size()>
"Attachment MIME-type",
"msgid:abc@123",
'i',
Field::Flag::GMime |
Field::Flag::NormalTerm |
Field::Flag::Value},
Field::Flag::BooleanTerm |
Field::Flag::Value
},
// Path
{
Field::Id::Path,
@ -332,11 +327,10 @@ static constexpr std::array<Field, Field::id_size()>
"path",
"File system path to message",
{},
'p',
Field::Flag::GMime |
'l',
Field::Flag::BooleanTerm |
Field::Flag::Value},
Field::Flag::Value
},
// Subject
{
Field::Id::Subject,
@ -345,32 +339,20 @@ static constexpr std::array<Field, Field::id_size()>
"Message subject",
"subject:wombat",
's',
Field::Flag::GMime |
Field::Flag::Value |
Field::Flag::IndexableTerm},
Field::Flag::IndexableTerm
},
// To
{
Field::Id::To,
Field::Type::String,
Field::Type::ContactList,
"to",
"Message recipient",
"to:flimflam@example.com",
't',
Field::Flag::GMime |
Field::Flag::Contact |
Field::Flag::Value
},
// UID (internal)
{
Field::Id::Uid,
Field::Type::String,
"uid",
"Message recipient",
{},
'u',
Field::Flag::NormalTerm},
// References
{
Field::Id::References,
@ -379,7 +361,6 @@ static constexpr std::array<Field, Field::id_size()>
"Message references to other messages",
{},
'r',
Field::Flag::GMime |
Field::Flag::Value
},
// Tags
@ -390,7 +371,6 @@ static constexpr std::array<Field, Field::id_size()>
"Message tags",
"tag:projectx",
'x',
Field::Flag::GMime |
Field::Flag::NormalTerm |
Field::Flag::Value
},
@ -402,7 +382,6 @@ static constexpr std::array<Field, Field::id_size()>
"Message date",
"date:20220101..20220505",
'd',
Field::Flag::GMime |
Field::Flag::Value |
Field::Flag::Range
},
@ -414,7 +393,6 @@ static constexpr std::array<Field, Field::id_size()>
"Message properties",
"flag:unread",
'g',
Field::Flag::GMime |
Field::Flag::NormalTerm |
Field::Flag::Value
},
@ -426,8 +404,7 @@ static constexpr std::array<Field, Field::id_size()>
"Priority",
"prio:high",
'p',
Field::Flag::GMime |
Field::Flag::NormalTerm |
Field::Flag::BooleanTerm |
Field::Flag::Value
},
// Size
@ -438,7 +415,6 @@ static constexpr std::array<Field, Field::id_size()>
"Message size in bytes",
"size:1M..5M",
'z',
Field::Flag::GMime |
Field::Flag::Value |
Field::Flag::Range
},
@ -450,8 +426,7 @@ static constexpr std::array<Field, Field::id_size()>
"Mailing list (List-Id:)",
"list:mu-discuss.googlegroups.com",
'v',
Field::Flag::GMime |
Field::Flag::NormalTerm |
Field::Flag::BooleanTerm |
Field::Flag::Value
},
// ThreadId
@ -462,7 +437,8 @@ static constexpr std::array<Field, Field::id_size()>
"Thread a message belongs to",
{},
'w',
Field::Flag::NormalTerm
Field::Flag::BooleanTerm |
Field::Flag::Value
},
}};
@ -489,7 +465,7 @@ field_from_id(Field::Id id)
* @param func some callable
*/
template <typename Func>
void field_for_each(Func&& func) {
constexpr void field_for_each(Func&& func) {
for (const auto& field: Fields)
func(field);
}
@ -502,7 +478,7 @@ void field_for_each(Func&& func) {
* @return a message-field id, or nullopt if not found.
*/
template <typename Pred>
Option<Field> field_find_if(Pred&& pred) {
constexpr Option<Field> field_find_if(Pred&& pred) {
for (auto&& field: Fields)
if (pred(field))
return field;
@ -548,6 +524,5 @@ Option<Field> field_from_number(size_t id)
return field_from_id(static_cast<Field::Id>(id));
}
} // namespace Mu
#endif /* MU_FIELDS_HH__ */

View File

@ -21,7 +21,6 @@
#define MU_FLAGS_HH__
#include <algorithm>
#include <optional>
#include <string_view>
#include <array>
#include <utils/mu-utils.hh>