mirror of https://github.com/djcb/mu.git
message: use html-to-text scraper for html parts
We were dumping the HTML-parts as-is in the Xapian indexer; however, it's better to remove the html decoration first, and just pass the text. We use the new built-in html->text scraper for that.
This commit is contained in:
parent
56b8fad89e
commit
b795242d5a
11
NEWS.org
11
NEWS.org
|
@ -19,9 +19,14 @@
|
|||
- what used to be the ~mu fields~ command has been merged into ~mu info~; i.e.,
|
||||
~mu fields~ is now ~mu info fields~.
|
||||
|
||||
- ~mu view~ gained ~--format=html~ for it to output the HTML body of the message
|
||||
rather than the (default) plain-text body. See its updated manpage for
|
||||
details.
|
||||
- ~mu view~ gained ~--format=html~ which compels it to output the HTML body of
|
||||
the message rather than the (default) plain-text body. See its updated
|
||||
manpage for details.
|
||||
|
||||
- when encountering an HTML message part during indexing, previously (i.e.,
|
||||
~mu 1.10~) we would attempt to process that as-is, with HTML-tags etc.; this
|
||||
is now improved by employing a html->text scraper which extracts the
|
||||
human-readable text from the html.
|
||||
|
||||
- experimental: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux
|
||||
distros), ~mu~ will try to detect the language of the body of e-mail
|
||||
|
|
|
@ -79,17 +79,6 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va
|
|||
throw std::logic_error("not a search term");
|
||||
}
|
||||
|
||||
/* hack... import html text as if it were plain text. */
|
||||
static void
|
||||
add_body_html(Xapian::Document& doc, const Field& field, const std::string& val)
|
||||
{
|
||||
static Field body_field = field_from_id(Field::Id::BodyText);
|
||||
|
||||
Xapian::TermGenerator termgen;
|
||||
termgen.set_document(doc);
|
||||
termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term());
|
||||
}
|
||||
|
||||
void
|
||||
Document::add(Field::Id id, const std::string& val)
|
||||
{
|
||||
|
@ -100,12 +89,10 @@ Document::add(Field::Id id, const std::string& val)
|
|||
|
||||
if (field.is_searchable())
|
||||
add_search_term(xdoc_, field, val);
|
||||
else if (id == Field::Id::XBodyHtml)
|
||||
add_body_html(xdoc_, field, val);
|
||||
if (field.include_in_sexp()) {
|
||||
|
||||
if (field.include_in_sexp())
|
||||
put_prop(field, val);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Document::add(Field::Id id, const std::vector<std::string>& vals)
|
||||
|
|
|
@ -139,7 +139,6 @@ static void
|
|||
test_prefix()
|
||||
{
|
||||
static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S');
|
||||
static_assert(field_from_id(Field::Id::XBodyHtml).xapian_prefix() == 0);
|
||||
}
|
||||
|
||||
[[maybe_unused]]
|
||||
|
|
|
@ -65,12 +65,8 @@ struct Field {
|
|||
Tags, /**< Message Tags */
|
||||
ThreadId, /**< Thread Id */
|
||||
To, /**< To: recipient */
|
||||
/*
|
||||
* <private>
|
||||
*/
|
||||
XBodyHtml, /**< HTML Body */
|
||||
|
||||
_count_ /**< Number of FieldIds */
|
||||
//
|
||||
_count_ /**< Number of Ids */
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -458,17 +454,6 @@ static constexpr std::array<Field, Field::id_size()>
|
|||
Field::Flag::IncludeInSexp |
|
||||
Field::Flag::IndexableTerm,
|
||||
},
|
||||
|
||||
/* internal */
|
||||
{
|
||||
Field::Id::XBodyHtml,
|
||||
Field::Type::String,
|
||||
"htmlbody", {},
|
||||
"Message html body",
|
||||
{},
|
||||
{},
|
||||
Field::Flag::Internal
|
||||
},
|
||||
}};
|
||||
|
||||
/*
|
||||
|
|
|
@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg)
|
|||
}
|
||||
|
||||
static void
|
||||
append_text(Option<std::string>& str, Option<std::string> app)
|
||||
append_text(Option<std::string>& str, Option<std::string>&& app)
|
||||
{
|
||||
if (!str)
|
||||
str = app;
|
||||
else if (app)
|
||||
if (!str && app)
|
||||
str = std::move(*app);
|
||||
else if (str && app)
|
||||
str.value() += app.value();
|
||||
}
|
||||
|
||||
|
@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part,
|
|||
return;
|
||||
|
||||
submsg->for_each([&](auto&& parent, auto&& child_obj) {
|
||||
|
||||
/* XXX: we only handle one level */
|
||||
|
||||
/* NOTE: we only handle one level; ideally, we'd apply the whole
|
||||
parsing machinery recursively; so this is a little crude. */
|
||||
if (!child_obj.is_part())
|
||||
return;
|
||||
|
||||
const auto ctype{child_obj.content_type()};
|
||||
if (!ctype || !ctype->is_type("text", "*"))
|
||||
if (const auto ctype{child_obj.content_type()}; !ctype)
|
||||
return;
|
||||
|
||||
else if (ctype->is_type("text", "plain"))
|
||||
append_text(info.embedded, MimePart{child_obj}.to_string());
|
||||
else if (ctype->is_type("text", "html")) {
|
||||
if (auto&& str{MimePart{child_obj}.to_string()}; str)
|
||||
append_text(info.embedded, html_to_text(*str));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -662,6 +663,8 @@ fill_document(Message::Private& priv)
|
|||
break;
|
||||
case Field::Id::BodyText:
|
||||
doc.add(field.id, priv.body_txt);
|
||||
if (priv.body_html)
|
||||
doc.add(field.id, html_to_text(*priv.body_html));
|
||||
break;
|
||||
case Field::Id::Cc:
|
||||
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
|
||||
|
@ -725,10 +728,6 @@ fill_document(Message::Private& priv)
|
|||
case Field::Id::To:
|
||||
doc.add(field.id, mime_msg.contacts(Contact::Type::To));
|
||||
break;
|
||||
/* internal fields */
|
||||
case Field::Id::XBodyHtml:
|
||||
doc.add(field.id, priv.body_html);
|
||||
break;
|
||||
/* LCOV_EXCL_START */
|
||||
case Field::Id::_count_:
|
||||
default:
|
||||
|
|
|
@ -535,8 +535,7 @@ MimePart::to_string() const noexcept
|
|||
if (bytes < 0)
|
||||
return Nothing;
|
||||
|
||||
buffer.data()[bytes]='\0';
|
||||
buffer.resize(buflen);
|
||||
buffer.resize(bytes + 1);
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
|
|
@ -153,22 +153,10 @@ static void
|
|||
test_mu_find_02(void)
|
||||
{
|
||||
/* when matching html as if it were text,
|
||||
* 'bull' is also matched in arto.eml, &bull;
|
||||
*/
|
||||
// search("bull", 1);
|
||||
// search("bull m:foo", 0);
|
||||
// search("bull m:/foo", 1);
|
||||
// search("bull m:/Foo", 1);
|
||||
// search("bull flag:attach", 1);
|
||||
// search("bull flag:a", 1);
|
||||
|
||||
search("bull", 2);
|
||||
search("bull m:foo", 0);
|
||||
search("bull m:/foo", 2);
|
||||
search("bull m:/Foo", 2);
|
||||
search("bull flag:attach", 1);
|
||||
search("bull flag:a", 1);
|
||||
* 'bull' is also matched in arto.eml, &bull; however,
|
||||
* we don't do that anymore! */
|
||||
|
||||
search("bull", 1);
|
||||
|
||||
search("g:x", 0);
|
||||
search("flag:encrypted", 0);
|
||||
|
|
Loading…
Reference in New Issue