From b795242d5acb193475926aec1a1cb987443cff52 Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Sun, 23 Jul 2023 14:46:11 +0300 Subject: [PATCH] message: use html-to-text scraper for html parts We were dumping the HTML-parts as-is in the Xapian indexer; however, it's better to remove the html decoration first, and just pass the text. We use the new built-in html->text scraper for that. --- NEWS.org | 11 ++++++++--- lib/message/mu-document.cc | 17 ++--------------- lib/message/mu-fields.cc | 1 - lib/message/mu-fields.hh | 19 ++----------------- lib/message/mu-message.cc | 31 +++++++++++++++---------------- lib/message/mu-mime-object.cc | 3 +-- mu/tests/test-mu-cmd.cc | 18 +++--------------- 7 files changed, 31 insertions(+), 69 deletions(-) diff --git a/NEWS.org b/NEWS.org index 2a29aa2c..08e26a24 100644 --- a/NEWS.org +++ b/NEWS.org @@ -19,9 +19,14 @@ - what used to be the ~mu fields~ command has been merged into ~mu info~; i.e., ~mu fields~ is now ~mu info fields~. - - ~mu view~ gained ~--format=html~ for it to output the HTML body of the message - rather than the (default) plain-text body. See its updated manpage for - details. + - ~mu view~ gained ~--format=html~ which compels it to output the HTML body of + the message rather than the (default) plain-text body. See its updated + manpage for details. + + - when encountering an HTML message part during indexing, previously (i.e., + ~mu 1.10~) we would attempt to process that as-is, with HTML-tags etc.; this + is now improved by employing a html->text scraper which extracts the + human-readable text from the html. 
- experimental: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux distros), ~mu~ will try to detect the language of the body of e-mail diff --git a/lib/message/mu-document.cc b/lib/message/mu-document.cc index 3108ddf6..4e4d9d24 100644 --- a/lib/message/mu-document.cc +++ b/lib/message/mu-document.cc @@ -79,17 +79,6 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va throw std::logic_error("not a search term"); } -/* hack... import html text as if it were plain text. */ -static void -add_body_html(Xapian::Document& doc, const Field& field, const std::string& val) -{ - static Field body_field = field_from_id(Field::Id::BodyText); - - Xapian::TermGenerator termgen; - termgen.set_document(doc); - termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term()); -} - void Document::add(Field::Id id, const std::string& val) { @@ -100,11 +89,9 @@ Document::add(Field::Id id, const std::string& val) if (field.is_searchable()) add_search_term(xdoc_, field, val); - else if (id == Field::Id::XBodyHtml) - add_body_html(xdoc_, field, val); - if (field.include_in_sexp()) { + + if (field.include_in_sexp()) put_prop(field, val); - } } void diff --git a/lib/message/mu-fields.cc b/lib/message/mu-fields.cc index 8515636e..a8872101 100644 --- a/lib/message/mu-fields.cc +++ b/lib/message/mu-fields.cc @@ -139,7 +139,6 @@ static void test_prefix() { static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S'); - static_assert(field_from_id(Field::Id::XBodyHtml).xapian_prefix() == 0); } [[maybe_unused]] diff --git a/lib/message/mu-fields.hh b/lib/message/mu-fields.hh index 234b4000..5aebb9e2 100644 --- a/lib/message/mu-fields.hh +++ b/lib/message/mu-fields.hh @@ -65,12 +65,8 @@ struct Field { Tags, /**< Message Tags */ ThreadId, /**< Thread Id */ To, /**< To: recipient */ - /* - * - */ - XBodyHtml, /**< HTML Body */ - - _count_ /**< Number of FieldIds */ + // + _count_ /**< Number of Ids */ }; 
/** @@ -458,17 +454,6 @@ static constexpr std::array Field::Flag::IncludeInSexp | Field::Flag::IndexableTerm, }, - - /* internal */ - { - Field::Id::XBodyHtml, - Field::Type::String, - "htmlbody", {}, - "Message html body", - {}, - {}, - Field::Flag::Internal - }, }}; /* diff --git a/lib/message/mu-message.cc b/lib/message/mu-message.cc index 114ad9c7..c026dfcb 100644 --- a/lib/message/mu-message.cc +++ b/lib/message/mu-message.cc @@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg) } static void -append_text(Option& str, Option app) +append_text(Option& str, Option&& app) { - if (!str) - str = app; - else if (app) + if (!str && app) + str = std::move(*app); + else if (str && app) str.value() += app.value(); } @@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part, return; submsg->for_each([&](auto&& parent, auto&& child_obj) { - - /* XXX: we only handle one level */ - + /* NOTE: we only handle one level; ideally, we'd apply the whole + parsing machinery recursively; so this is a little crude. 
*/ if (!child_obj.is_part()) return; - - const auto ctype{child_obj.content_type()}; - if (!ctype || !ctype->is_type("text", "*")) + if (const auto ctype{child_obj.content_type()}; !ctype) return; - - append_text(info.embedded, MimePart{child_obj}.to_string()); + else if (ctype->is_type("text", "plain")) + append_text(info.embedded, MimePart{child_obj}.to_string()); + else if (ctype->is_type("text", "html")) { + if (auto&& str{MimePart{child_obj}.to_string()}; str) + append_text(info.embedded, html_to_text(*str)); + } }); } @@ -662,6 +663,8 @@ fill_document(Message::Private& priv) break; case Field::Id::BodyText: doc.add(field.id, priv.body_txt); + if (priv.body_html) + doc.add(field.id, html_to_text(*priv.body_html)); break; case Field::Id::Cc: doc.add(field.id, mime_msg.contacts(Contact::Type::Cc)); @@ -725,10 +728,6 @@ fill_document(Message::Private& priv) case Field::Id::To: doc.add(field.id, mime_msg.contacts(Contact::Type::To)); break; - /* internal fields */ - case Field::Id::XBodyHtml: - doc.add(field.id, priv.body_html); - break; /* LCOV_EXCL_START */ case Field::Id::_count_: default: diff --git a/lib/message/mu-mime-object.cc b/lib/message/mu-mime-object.cc index f6f4bcef..a75da5be 100644 --- a/lib/message/mu-mime-object.cc +++ b/lib/message/mu-mime-object.cc @@ -535,8 +535,7 @@ MimePart::to_string() const noexcept if (bytes < 0) return Nothing; - buffer.data()[bytes]='\0'; - buffer.resize(buflen); + buffer.resize(bytes + 1); return buffer; } diff --git a/mu/tests/test-mu-cmd.cc b/mu/tests/test-mu-cmd.cc index b5d2621d..8dd09112 100644 --- a/mu/tests/test-mu-cmd.cc +++ b/mu/tests/test-mu-cmd.cc @@ -153,22 +153,10 @@ static void test_mu_find_02(void) { /* when matching html as if it were text, - * 'bull' is also matched in arto.eml, &bull; - */ - // search("bull", 1); - // search("bull m:foo", 0); - // search("bull m:/foo", 1); - // search("bull m:/Foo", 1); - // search("bull flag:attach", 1); - // search("bull flag:a", 1); - - search("bull", 2); - search("bull 
m:foo", 0); - search("bull m:/foo", 2); - search("bull m:/Foo", 2); - search("bull flag:attach", 1); - search("bull flag:a", 1); + * 'bull' is also matched in arto.eml, &bull; however, + * we don't do that anymore! */ + search("bull", 1); search("g:x", 0); search("flag:encrypted", 0);