message: use html-to-text scraper for html parts

We were passing HTML parts as-is to the Xapian indexer; however,
it's better to strip the HTML markup first and index only the text.

We use the new built-in html->text scraper for that.
This commit is contained in:
Dirk-Jan C. Binnema 2023-07-23 14:46:11 +03:00
parent 56b8fad89e
commit b795242d5a
7 changed files with 31 additions and 69 deletions

View File

@ -19,9 +19,14 @@
- what used to be the ~mu fields~ command has been merged into ~mu info~; i.e.,
~mu fields~ is now ~mu info fields~.
- ~mu view~ gained ~--format=html~ for it to output the HTML body of the message
rather than the (default) plain-text body. See its updated manpage for
details.
- ~mu view~ gained ~--format=html~ which compels it to output the HTML body of
the message rather than the (default) plain-text body. See its updated
manpage for details.
- when encountering an HTML message part during indexing, previously (i.e.,
~mu 1.10~) we would attempt to process that as-is, with HTML-tags etc.; this
is now improved by employing an html->text scraper which extracts the
human-readable text from the HTML.
- experimental: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux
distros), ~mu~ will try to detect the language of the body of e-mail

View File

@ -79,17 +79,6 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va
throw std::logic_error("not a search term");
}
/* hack... import html text as if it were plain text. */
static void
add_body_html(Xapian::Document& doc, const Field& field, const std::string& val)
{
static Field body_field = field_from_id(Field::Id::BodyText);
Xapian::TermGenerator termgen;
termgen.set_document(doc);
termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term());
}
void
Document::add(Field::Id id, const std::string& val)
{
@ -100,11 +89,9 @@ Document::add(Field::Id id, const std::string& val)
if (field.is_searchable())
add_search_term(xdoc_, field, val);
else if (id == Field::Id::XBodyHtml)
add_body_html(xdoc_, field, val);
if (field.include_in_sexp()) {
if (field.include_in_sexp())
put_prop(field, val);
}
}
void

View File

@ -139,7 +139,6 @@ static void
test_prefix()
{
static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S');
static_assert(field_from_id(Field::Id::XBodyHtml).xapian_prefix() == 0);
}
[[maybe_unused]]

View File

@ -65,12 +65,8 @@ struct Field {
Tags, /**< Message Tags */
ThreadId, /**< Thread Id */
To, /**< To: recipient */
/*
* <private>
*/
XBodyHtml, /**< HTML Body */
_count_ /**< Number of FieldIds */
//
_count_ /**< Number of Ids */
};
/**
@ -458,17 +454,6 @@ static constexpr std::array<Field, Field::id_size()>
Field::Flag::IncludeInSexp |
Field::Flag::IndexableTerm,
},
/* internal */
{
Field::Id::XBodyHtml,
Field::Type::String,
"htmlbody", {},
"Message html body",
{},
{},
Field::Flag::Internal
},
}};
/*

View File

@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg)
}
static void
append_text(Option<std::string>& str, Option<std::string> app)
append_text(Option<std::string>& str, Option<std::string>&& app)
{
if (!str)
str = app;
else if (app)
if (!str && app)
str = std::move(*app);
else if (str && app)
str.value() += app.value();
}
@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part,
return;
submsg->for_each([&](auto&& parent, auto&& child_obj) {
/* XXX: we only handle one level */
/* NOTE: we only handle one level; ideally, we'd apply the whole
parsing machinery recursively; so this is a little crude. */
if (!child_obj.is_part())
return;
const auto ctype{child_obj.content_type()};
if (!ctype || !ctype->is_type("text", "*"))
if (const auto ctype{child_obj.content_type()}; !ctype)
return;
append_text(info.embedded, MimePart{child_obj}.to_string());
else if (ctype->is_type("text", "plain"))
append_text(info.embedded, MimePart{child_obj}.to_string());
else if (ctype->is_type("text", "html")) {
if (auto&& str{MimePart{child_obj}.to_string()}; str)
append_text(info.embedded, html_to_text(*str));
}
});
}
@ -662,6 +663,8 @@ fill_document(Message::Private& priv)
break;
case Field::Id::BodyText:
doc.add(field.id, priv.body_txt);
if (priv.body_html)
doc.add(field.id, html_to_text(*priv.body_html));
break;
case Field::Id::Cc:
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
@ -725,10 +728,6 @@ fill_document(Message::Private& priv)
case Field::Id::To:
doc.add(field.id, mime_msg.contacts(Contact::Type::To));
break;
/* internal fields */
case Field::Id::XBodyHtml:
doc.add(field.id, priv.body_html);
break;
/* LCOV_EXCL_START */
case Field::Id::_count_:
default:

View File

@ -535,8 +535,7 @@ MimePart::to_string() const noexcept
if (bytes < 0)
return Nothing;
buffer.data()[bytes]='\0';
buffer.resize(buflen);
buffer.resize(bytes + 1);
return buffer;
}

View File

@ -153,22 +153,10 @@ static void
test_mu_find_02(void)
{
/* when matching html as if it were text,
* 'bull' is also matched in arto.eml, &bull;
*/
// search("bull", 1);
// search("bull m:foo", 0);
// search("bull m:/foo", 1);
// search("bull m:/Foo", 1);
// search("bull flag:attach", 1);
// search("bull flag:a", 1);
search("bull", 2);
search("bull m:foo", 0);
search("bull m:/foo", 2);
search("bull m:/Foo", 2);
search("bull flag:attach", 1);
search("bull flag:a", 1);
* 'bull' is also matched in arto.eml, &bull; however,
* we don't do that anymore! */
search("bull", 1);
search("g:x", 0);
search("flag:encrypted", 0);