From 9a8741f0ddc6d299b63720643d517edd20031af4 Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Thu, 28 Apr 2022 22:53:31 +0300 Subject: [PATCH] message:document/fields: update and tie down Update many of the field flags; remove obsolete ones. Ensure they are handled correctly in mu-document --- lib/message/mu-document.cc | 89 +++++++++++----------- lib/message/mu-document.hh | 2 +- lib/message/mu-fields.cc | 66 ++++++++++++---- lib/message/mu-fields.hh | 149 +++++++++++++++---------------------- lib/message/mu-flags.hh | 1 - 5 files changed, 157 insertions(+), 150 deletions(-) diff --git a/lib/message/mu-document.cc b/lib/message/mu-document.cc index c8ab7e29..a2832207 100644 --- a/lib/message/mu-document.cc +++ b/lib/message/mu-document.cc @@ -36,23 +36,24 @@ constexpr char SepaChar1 = 0xfe; constexpr char SepaChar2 = 0xff; static void -add_index_term(Xapian::Document& doc, const Field& field, const std::string& val) +add_search_term(Xapian::Document& doc, const Field& field, const std::string& val) { - std::string flatval{utf8_flatten(val)}; - Xapian::TermGenerator termgen; - termgen.set_document(doc); - termgen.index_text(flatval); -} - -static void -maybe_add_term(Xapian::Document& doc, const Field& field, const std::string& val) -{ - if (field.is_normal_term()) - doc.add_term(field.xapian_term()); - else if (field.is_indexable_term()) { - add_index_term(doc, field, val); - } else if (field.is_boolean_term()) + if (field.is_normal_term()) { + doc.add_term(field.xapian_term(val)); + } else if (field.is_boolean_term()) { doc.add_boolean_term(field.xapian_term(val)); + } else if (field.is_indexable_term()) { + Xapian::TermGenerator termgen; + termgen.set_document(doc); + termgen.index_text(utf8_flatten(val),1,field.xapian_term()); + } else + throw std::logic_error("not a search term"); + + + if (field.id == Field::Id::Tags) + for (auto tag = doc.termlist_begin(); tag != doc.termlist_end(); ++tag) + if ((*tag)[0] == 'X') + g_message("%u: %s", doc.get_docid(), (*tag).c_str()); } void @@ -63,7 +64,8 @@ Document::add(Field::Id id, const std::string& val) if (field.is_value()) xdoc_.add_value(field.value_no(), val); - maybe_add_term(xdoc_, field, val); + if (field.is_searchable()) + add_search_term(xdoc_, field, val); } void @@ -76,8 +78,10 @@ Document::add(Field::Id id, const std::vector& vals) if (field.is_value()) xdoc_.add_value(field.value_no(), Mu::join(vals, SepaChar1)); - std::for_each(vals.begin(), vals.end(), - [&](const auto& val) { maybe_add_term(xdoc_, field, val); }); + if (field.is_searchable()) + std::for_each(vals.begin(), vals.end(), + [&](const auto& val) { + add_search_term(xdoc_, field, val); }); } @@ -98,13 +102,19 @@ Document::add(Field::Id id, const Contacts& contacts) const std::string sepa2(1, SepaChar2); + Xapian::TermGenerator termgen; + termgen.set_document(xdoc_); + for (auto&& contact: contacts) { + if (!contact.field_id || *contact.field_id != id) continue; - xdoc_.add_term(contact.email); + xdoc_.add_term(field.xapian_term(contact.email)); + if (!contact.name.empty()) - add_index_term(xdoc_, field, contact.name); + termgen.index_text(utf8_flatten(contact.name), 1, + field.xapian_term()); cvec.emplace_back(contact.email + sepa2 + contact.name); } @@ -134,26 +144,6 @@ Document::contacts_value(Field::Id id) const noexcept return contacts; } -static std::string -integer_to_string(int64_t val) -{ - char buf[18]; - buf[0] = 'f' + ::snprintf(buf + 1, sizeof(buf) - 1, "%" PRIx64, val); - return buf; -} - -static int64_t -string_to_integer(const std::string& str) -{ - if (str.empty()) - return 0; - - int64_t val{}; - std::from_chars(str.c_str() + 1, str.c_str() + str.size(), val, 16); - - return val; -} - void Document::add(Field::Id id, int64_t val) { @@ -167,15 +157,16 @@ Document::add(Field::Id id, int64_t val) const auto field{field_from_id(id)}; if (field.is_value()) - xdoc_.add_value(field.value_no(), integer_to_string(val)); - - /* terms are not supported for numerical fields */ + xdoc_.add_value(field.value_no(), to_lexnum(val)); } int64_t Document::integer_value(Field::Id field_id) const noexcept { - return string_to_integer(string_value(field_id)); + if (auto&& v{string_value(field_id)}; v.empty()) + return 0; + else + return from_lexnum(v); } void @@ -200,10 +191,11 @@ Document::add(Flags flags) { constexpr auto field{field_from_id(Field::Id::Flags)}; - xdoc_.add_value(field.value_no(), integer_to_string(static_cast(flags))); + xdoc_.add_value(field.value_no(), to_lexnum(static_cast(flags))); flag_infos_for_each([&](auto&& flag_info) { if (any_of(flag_info.flag & flags)) - xdoc_.add_boolean_term(field.xapian_term(flag_info.shortcut_lower())); + xdoc_.add_boolean_term(field.xapian_term( + flag_info.shortcut_lower())); }); } @@ -263,10 +255,13 @@ test_bcc() Contact{"ringo@example.com", "Ringo", Field::Id::Bcc}, }}; doc.add(Field::Id::Bcc, contacts); - auto db = Xapian::InMemory::open(); + TempDir tempdir; + auto db = Xapian::WritableDatabase(tempdir.path()); db.add_document(doc.xapian_document()); + auto contacts2 = doc.contacts_value(Field::Id::Bcc); + assert_same_contacts(contacts, contacts2); } } diff --git a/lib/message/mu-document.hh b/lib/message/mu-document.hh index aa9319bf..d1078e0e 100644 --- a/lib/message/mu-document.hh +++ b/lib/message/mu-document.hh @@ -199,7 +199,7 @@ public: * * @param field_id id of the contacts field to get * - * @return an integer or 0 if not found. + * @return a contacts list */ Contacts contacts_value(Field::Id id) const noexcept; diff --git a/lib/message/mu-fields.cc b/lib/message/mu-fields.cc index 3d8e5bb9..6fb73ce7 100644 --- a/lib/message/mu-fields.cc +++ b/lib/message/mu-fields.cc @@ -22,24 +22,32 @@ using namespace Mu; +// Xapian does not like terms much longer than this +constexpr auto MaxTermLength = 240; + std::string Field::xapian_term(const std::string& s) const { - return std::string(1U, xapian_prefix()) + s; -} + const auto start{std::string(1U, xapian_prefix())}; + if (const auto& size = s.size(); size == 0) + return start; -std::string -Field::xapian_term(std::string_view sv) const -{ - return std::string(1U, xapian_prefix()) + std::string{sv}; -} -std::string -Field::xapian_term(char c) const -{ - return std::string(1U, xapian_prefix()) + c; -} + std::string res{start}; + res.reserve(s.size() + 10); + /* slightly optimized common pure-ascii. */ + if (G_LIKELY(g_str_is_ascii(s.c_str()))) { + res += s; + for (auto i = 1; res[i]; ++i) + res[i] = g_ascii_tolower(res[i]); + } else + res += utf8_flatten(s); + if (G_UNLIKELY(res.size() > MaxTermLength)) + res.erase(MaxTermLength); + + return res; +} /** * compile-time checks @@ -58,13 +66,26 @@ validate_field_ids() constexpr bool validate_field_shortcuts() { +#ifdef BUILD_TESTS + std::array no_dups = {0}; +#endif /*BUILD_TESTS*/ for (auto id = 0U; id != Field::id_size(); ++id) { const auto field_id = static_cast(id); const auto shortcut = field_from_id(field_id).shortcut; if (shortcut != 0 && (shortcut < 'a' || shortcut > 'z')) return false; +#ifdef BUILD_TESTS + if (shortcut != 0) { + if (++no_dups[static_cast(shortcut-'a')] > 1) { + g_critical("shortcut '%c' is duplicated", + shortcut); + return false; + } + } +#endif } + return true; } @@ -94,8 +115,6 @@ validate_field_flags() return true; } - - /* * tests... also build as runtime-tests, so we can get coverage info */ @@ -135,6 +154,20 @@ test_field_flags() #ifdef BUILD_TESTS + +static void +test_field_from_name() +{ + g_assert_true(field_from_name("s")->id == Field::Id::Subject); + g_assert_true(field_from_name("subject")->id == Field::Id::Subject); + g_assert_false(!!field_from_name("8")); + g_assert_false(!!field_from_name("")); + + g_assert_true(field_from_name("").value_or(field_from_id(Field::Id::Bcc)).id == + Field::Id::Bcc); +} + + static void test_xapian_term() { @@ -146,6 +179,10 @@ test_xapian_term() assert_equal(field_from_id(Field::Id::From).xapian_term('x'), "Fx"); assert_equal(field_from_id(Field::Id::To).xapian_term("boo"sv), "Tboo"); + + auto s1 = field_from_id(Field::Id::Subject).xapian_term(std::string(MaxTermLength - 1, 'x')); + auto s2 = field_from_id(Field::Id::Subject).xapian_term(std::string(MaxTermLength, 'x')); + g_assert_cmpuint(s1.length(), ==, s2.length()); } int @@ -155,6 +192,7 @@ main(int argc, char* argv[]) g_test_add_func("/message/fields/ids", test_ids); g_test_add_func("/message/fields/shortcuts", test_shortcuts); + g_test_add_func("/message/fields/from-name", test_field_from_name); g_test_add_func("/message/fields/prefix", test_prefix); g_test_add_func("/message/fields/xapian-term", test_xapian_term); g_test_add_func("/message/fields/flags", test_field_flags); diff --git a/lib/message/mu-fields.hh b/lib/message/mu-fields.hh index a2fb5138..6a768b80 100644 --- a/lib/message/mu-fields.hh +++ b/lib/message/mu-fields.hh @@ -55,7 +55,6 @@ struct Field { Path, /**< File-system Path */ Subject, /**< Message subject */ To, /**< To: recipient */ - Uid, /**< Unique id for message (based on path) */ /* * string list items... */ @@ -98,11 +97,12 @@ struct Field { * */ enum struct Type { - String, /**< String */ - StringList, /**< List of strings */ - ByteSize, /**< Size in bytes */ - TimeT, /**< A time_t value */ - Integer, /**< An integer */ + String, /**< String */ + StringList, /**< List of strings */ + ContactList, /**< List of contacts */ + ByteSize, /**< Size in bytes */ + TimeT, /**< A time_t value */ + Integer, /**< An integer */ }; constexpr bool is_string() const { return type == Type::String; } @@ -126,33 +126,29 @@ struct Field { */ enum struct Flag { - GMime = 1 << 0, - /**< Field retrieved through gmime */ - /* * Different kind of terms; at most one is true, * and cannot be combined with IsContact. Compile-time enforced. */ - NormalTerm = 1 << 2, + NormalTerm = 1 << 0, /**< Field is a searchable term */ - BooleanTerm = 1 << 5, - /**< Field is a boolean search-term; wildcards do not work */ - IndexableTerm = 1 << 1, + BooleanTerm = 1 << 1, + /**< Field is a boolean search-term (i.e. at most one per message); + * wildcards do not work */ + IndexableTerm = 1 << 2, /**< Field has indexable text as term */ - /* * Contact flag cannot be combined with any of the term flags. * This is compile-time enforced. */ - Contact = 1 << 4, + Contact = 1 << 10, /**< field contains one or more e-mail-addresses */ - - Value = 1 << 3, + Value = 1 << 11, /**< Field value is stored (so the literal value can be retrieved) */ - DoNotCache = 1 << 6, + DoNotCache = 1 << 20, /**< don't cache this field in * the MuMsg cache */ - Range = 1 << 7 + Range = 1 << 21 /**< whether this is a range field (e.g., date, size)*/ }; @@ -160,12 +156,9 @@ struct Field { return (static_cast(some_flag) & static_cast(flags)) != 0; } - constexpr bool is_gmime() const { return any_of(Flag::GMime); } - constexpr bool is_indexable_term() const { return any_of(Flag::IndexableTerm); } constexpr bool is_boolean_term() const { return any_of(Flag::BooleanTerm); } constexpr bool is_normal_term() const { return any_of(Flag::NormalTerm); } - constexpr bool is_searchable() const { return is_indexable_term() || is_boolean_term() || is_normal_term(); } @@ -174,9 +167,6 @@ struct Field { constexpr bool is_contact() const { return any_of(Flag::Contact); } constexpr bool is_range() const { return any_of(Flag::Range); } - constexpr bool do_not_cache() const { return any_of(Flag::DoNotCache); } - - /** * Field members @@ -200,9 +190,21 @@ struct Field { return shortcut == 0 ? 0 : shortcut - ('a' - 'A'); } + /** + * Get the xapian term; truncated to MaxTermLength and + * utf8-flattened. + * + * @param s + * + * @return the xapian term + */ std::string xapian_term(const std::string& s="") const; - std::string xapian_term(std::string_view sv) const; - std::string xapian_term(char c) const; + std::string xapian_term(std::string_view sv) const { + return xapian_term(std::string{sv}); + } + std::string xapian_term(char c) const { + return xapian_term(std::string(1, c)); + } }; MU_ENABLE_BITOPS(Field::Flag); @@ -216,12 +218,11 @@ static constexpr std::array // Bcc { Field::Id::Bcc, - Field::Type::String, + Field::Type::ContactList, "bcc", "Blind carbon-copy recipient", "bcc:foo@example.com", 'h', - Field::Flag::GMime | Field::Flag::Contact | Field::Flag::Value }, @@ -233,8 +234,7 @@ static constexpr std::array "Message html body", {}, {}, - Field::Flag::GMime | - Field::Flag::DoNotCache + {} }, // Body { @@ -244,22 +244,19 @@ static constexpr std::array "Message plain-text body", "body:capybara", // example 'b', - Field::Flag::GMime | - Field::Flag::IndexableTerm | - Field::Flag::DoNotCache + Field::Flag::IndexableTerm, }, // Cc { Field::Id::Cc, - Field::Type::String, + Field::Type::ContactList, "cc", "Carbon-copy recipient", "cc:quinn@example.com", 'c', - Field::Flag::GMime | Field::Flag::Contact | - Field::Flag::Value}, - + Field::Flag::Value + }, // Embed { Field::Id::EmbeddedText, @@ -268,9 +265,8 @@ static constexpr std::array "Embedded text", "embed:war OR embed:peace", 'e', - Field::Flag::GMime | - Field::Flag::IndexableTerm | - Field::Flag::DoNotCache}, + Field::Flag::IndexableTerm + }, // File { Field::Id::File, @@ -279,21 +275,19 @@ static constexpr std::array "Attachment file name", "file:/image\\.*.jpg/", 'j', - Field::Flag::GMime | - Field::Flag::NormalTerm | - Field::Flag::DoNotCache}, - + Field::Flag::NormalTerm + }, // From { Field::Id::From, - Field::Type::String, + Field::Type::ContactList, "from", "Message sender", "from:jimbo", 'f', - Field::Flag::GMime | Field::Flag::Contact | - Field::Flag::Value}, + Field::Flag::Value + }, // Maildir { Field::Id::Maildir, @@ -302,9 +296,9 @@ static constexpr std::array "Maildir path for message", "maildir:/private/archive", 'm', - Field::Flag::GMime | - Field::Flag::NormalTerm | - Field::Flag::Value}, + Field::Flag::BooleanTerm | + Field::Flag::Value + }, // MIME { Field::Id::Mime, @@ -313,7 +307,8 @@ static constexpr std::array "Attachment MIME-type", "mime:image/jpeg", 'y', - Field::Flag::NormalTerm}, + Field::Flag::NormalTerm + }, // Message-ID { Field::Id::MessageId, @@ -322,9 +317,9 @@ static constexpr std::array "Attachment MIME-type", "msgid:abc@123", 'i', - Field::Flag::GMime | - Field::Flag::NormalTerm | - Field::Flag::Value}, + Field::Flag::BooleanTerm | + Field::Flag::Value + }, // Path { Field::Id::Path, @@ -332,11 +327,10 @@ static constexpr std::array "path", "File system path to message", {}, - 'p', - Field::Flag::GMime | + 'l', Field::Flag::BooleanTerm | - Field::Flag::Value}, - + Field::Flag::Value + }, // Subject { Field::Id::Subject, @@ -345,32 +339,20 @@ static constexpr std::array "Message subject", "subject:wombat", 's', - Field::Flag::GMime | Field::Flag::Value | - Field::Flag::IndexableTerm}, - + Field::Flag::IndexableTerm + }, // To { Field::Id::To, - Field::Type::String, + Field::Type::ContactList, "to", "Message recipient", "to:flimflam@example.com", 't', - Field::Flag::GMime | Field::Flag::Contact | Field::Flag::Value }, - // UID (internal) - { - Field::Id::Uid, - Field::Type::String, - "uid", - "Message recipient", - {}, - 'u', - Field::Flag::NormalTerm}, - // References { Field::Id::References, @@ -379,7 +361,6 @@ static constexpr std::array "Message references to other messages", {}, 'r', - Field::Flag::GMime | Field::Flag::Value }, // Tags @@ -390,7 +371,6 @@ static constexpr std::array "Message tags", "tag:projectx", 'x', - Field::Flag::GMime | Field::Flag::NormalTerm | Field::Flag::Value }, @@ -402,7 +382,6 @@ static constexpr std::array "Message date", "date:20220101..20220505", 'd', - Field::Flag::GMime | Field::Flag::Value | Field::Flag::Range }, @@ -414,7 +393,6 @@ static constexpr std::array "Message properties", "flag:unread", 'g', - Field::Flag::GMime | Field::Flag::NormalTerm | Field::Flag::Value }, @@ -426,8 +404,7 @@ static constexpr std::array "Priority", "prio:high", 'p', - Field::Flag::GMime | - Field::Flag::NormalTerm | + Field::Flag::BooleanTerm | Field::Flag::Value }, // Size @@ -438,7 +415,6 @@ static constexpr std::array "Message size in bytes", "size:1M..5M", 'z', - Field::Flag::GMime | Field::Flag::Value | Field::Flag::Range }, @@ -450,8 +426,7 @@ static constexpr std::array "Mailing list (List-Id:)", "list:mu-discuss.googlegroups.com", 'v', - Field::Flag::GMime | - Field::Flag::NormalTerm | + Field::Flag::BooleanTerm | Field::Flag::Value }, // ThreadId @@ -462,7 +437,8 @@ static constexpr std::array "Thread a message belongs to", {}, 'w', - Field::Flag::NormalTerm + Field::Flag::BooleanTerm | + Field::Flag::Value }, }}; @@ -489,7 +465,7 @@ field_from_id(Field::Id id) * @param func some callable */ template -void field_for_each(Func&& func) { +constexpr void field_for_each(Func&& func) { for (const auto& field: Fields) func(field); } @@ -502,7 +478,7 @@ void field_for_each(Func&& func) { * @return a message-field id, or nullopt if not found. */ template -Option field_find_if(Pred&& pred) { +constexpr Option field_find_if(Pred&& pred) { for (auto&& field: Fields) if (pred(field)) return field; @@ -548,6 +524,5 @@ Option field_from_number(size_t id) return field_from_id(static_cast(id)); } - } // namespace Mu #endif /* MU_FIELDS_HH__ */ diff --git a/lib/message/mu-flags.hh b/lib/message/mu-flags.hh index dfc68afb..d4841406 100644 --- a/lib/message/mu-flags.hh +++ b/lib/message/mu-flags.hh @@ -21,7 +21,6 @@ #define MU_FLAGS_HH__ #include -#include #include #include #include