mirror of https://github.com/djcb/mu.git
message: use html-to-text scraper for html parts
We were dumping the HTML-parts as-is in the Xapian indexer; however, it's better to remove the html decoration first, and just pass the text. We use the new built-in html->text scraper for that.
This commit is contained in:
parent
56b8fad89e
commit
b795242d5a
11
NEWS.org
11
NEWS.org
|
@ -19,9 +19,14 @@
|
||||||
- what used to be the ~mu fields~ command has been merged into ~mu info~; i.e.,
|
- what used to be the ~mu fields~ command has been merged into ~mu info~; i.e.,
|
||||||
~mu fields~ is now ~mu info fields~.
|
~mu fields~ is now ~mu info fields~.
|
||||||
|
|
||||||
- ~mu view~ gained ~--format=html~ for it to output the HTML body of the message
|
- ~mu view~ gained ~--format=html~ which compels it to output the HTML body of
|
||||||
rather than the (default) plain-text body. See its updated manpage for
|
the message rather than the (default) plain-text body. See its updated
|
||||||
details.
|
manpage for details.
|
||||||
|
|
||||||
|
- when encountering an HTML message part during indexing, previously (i.e.,
|
||||||
|
~mu 1.10~) we would attempt to process that as-is, with HTML-tags etc.; this
|
||||||
|
is now improved by employing an html->text scraper which extracts the
|
||||||
|
human-readable text from the html.
|
||||||
|
|
||||||
- experimental: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux
|
- experimental: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux
|
||||||
distros), ~mu~ will try to detect the language of the body of e-mail
|
distros), ~mu~ will try to detect the language of the body of e-mail
|
||||||
|
|
|
@ -79,17 +79,6 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va
|
||||||
throw std::logic_error("not a search term");
|
throw std::logic_error("not a search term");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* hack... import html text as if it were plain text. */
|
|
||||||
static void
|
|
||||||
add_body_html(Xapian::Document& doc, const Field& field, const std::string& val)
|
|
||||||
{
|
|
||||||
static Field body_field = field_from_id(Field::Id::BodyText);
|
|
||||||
|
|
||||||
Xapian::TermGenerator termgen;
|
|
||||||
termgen.set_document(doc);
|
|
||||||
termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term());
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
Document::add(Field::Id id, const std::string& val)
|
Document::add(Field::Id id, const std::string& val)
|
||||||
{
|
{
|
||||||
|
@ -100,11 +89,9 @@ Document::add(Field::Id id, const std::string& val)
|
||||||
|
|
||||||
if (field.is_searchable())
|
if (field.is_searchable())
|
||||||
add_search_term(xdoc_, field, val);
|
add_search_term(xdoc_, field, val);
|
||||||
else if (id == Field::Id::XBodyHtml)
|
|
||||||
add_body_html(xdoc_, field, val);
|
if (field.include_in_sexp())
|
||||||
if (field.include_in_sexp()) {
|
|
||||||
put_prop(field, val);
|
put_prop(field, val);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
|
@ -139,7 +139,6 @@ static void
|
||||||
test_prefix()
|
test_prefix()
|
||||||
{
|
{
|
||||||
static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S');
|
static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S');
|
||||||
static_assert(field_from_id(Field::Id::XBodyHtml).xapian_prefix() == 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
[[maybe_unused]]
|
[[maybe_unused]]
|
||||||
|
|
|
@ -65,12 +65,8 @@ struct Field {
|
||||||
Tags, /**< Message Tags */
|
Tags, /**< Message Tags */
|
||||||
ThreadId, /**< Thread Id */
|
ThreadId, /**< Thread Id */
|
||||||
To, /**< To: recipient */
|
To, /**< To: recipient */
|
||||||
/*
|
//
|
||||||
* <private>
|
_count_ /**< Number of Ids */
|
||||||
*/
|
|
||||||
XBodyHtml, /**< HTML Body */
|
|
||||||
|
|
||||||
_count_ /**< Number of FieldIds */
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -458,17 +454,6 @@ static constexpr std::array<Field, Field::id_size()>
|
||||||
Field::Flag::IncludeInSexp |
|
Field::Flag::IncludeInSexp |
|
||||||
Field::Flag::IndexableTerm,
|
Field::Flag::IndexableTerm,
|
||||||
},
|
},
|
||||||
|
|
||||||
/* internal */
|
|
||||||
{
|
|
||||||
Field::Id::XBodyHtml,
|
|
||||||
Field::Type::String,
|
|
||||||
"htmlbody", {},
|
|
||||||
"Message html body",
|
|
||||||
{},
|
|
||||||
{},
|
|
||||||
Field::Flag::Internal
|
|
||||||
},
|
|
||||||
}};
|
}};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg)
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
append_text(Option<std::string>& str, Option<std::string> app)
|
append_text(Option<std::string>& str, Option<std::string>&& app)
|
||||||
{
|
{
|
||||||
if (!str)
|
if (!str && app)
|
||||||
str = app;
|
str = std::move(*app);
|
||||||
else if (app)
|
else if (str && app)
|
||||||
str.value() += app.value();
|
str.value() += app.value();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part,
|
||||||
return;
|
return;
|
||||||
|
|
||||||
submsg->for_each([&](auto&& parent, auto&& child_obj) {
|
submsg->for_each([&](auto&& parent, auto&& child_obj) {
|
||||||
|
/* NOTE: we only handle one level; ideally, we'd apply the whole
|
||||||
/* XXX: we only handle one level */
|
parsing machinery recursively; so this is a little crude. */
|
||||||
|
|
||||||
if (!child_obj.is_part())
|
if (!child_obj.is_part())
|
||||||
return;
|
return;
|
||||||
|
if (const auto ctype{child_obj.content_type()}; !ctype)
|
||||||
const auto ctype{child_obj.content_type()};
|
|
||||||
if (!ctype || !ctype->is_type("text", "*"))
|
|
||||||
return;
|
return;
|
||||||
|
else if (ctype->is_type("text", "plain"))
|
||||||
append_text(info.embedded, MimePart{child_obj}.to_string());
|
append_text(info.embedded, MimePart{child_obj}.to_string());
|
||||||
|
else if (ctype->is_type("text", "html")) {
|
||||||
|
if (auto&& str{MimePart{child_obj}.to_string()}; str)
|
||||||
|
append_text(info.embedded, html_to_text(*str));
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -662,6 +663,8 @@ fill_document(Message::Private& priv)
|
||||||
break;
|
break;
|
||||||
case Field::Id::BodyText:
|
case Field::Id::BodyText:
|
||||||
doc.add(field.id, priv.body_txt);
|
doc.add(field.id, priv.body_txt);
|
||||||
|
if (priv.body_html)
|
||||||
|
doc.add(field.id, html_to_text(*priv.body_html));
|
||||||
break;
|
break;
|
||||||
case Field::Id::Cc:
|
case Field::Id::Cc:
|
||||||
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
|
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
|
||||||
|
@ -725,10 +728,6 @@ fill_document(Message::Private& priv)
|
||||||
case Field::Id::To:
|
case Field::Id::To:
|
||||||
doc.add(field.id, mime_msg.contacts(Contact::Type::To));
|
doc.add(field.id, mime_msg.contacts(Contact::Type::To));
|
||||||
break;
|
break;
|
||||||
/* internal fields */
|
|
||||||
case Field::Id::XBodyHtml:
|
|
||||||
doc.add(field.id, priv.body_html);
|
|
||||||
break;
|
|
||||||
/* LCOV_EXCL_START */
|
/* LCOV_EXCL_START */
|
||||||
case Field::Id::_count_:
|
case Field::Id::_count_:
|
||||||
default:
|
default:
|
||||||
|
|
|
@ -535,8 +535,7 @@ MimePart::to_string() const noexcept
|
||||||
if (bytes < 0)
|
if (bytes < 0)
|
||||||
return Nothing;
|
return Nothing;
|
||||||
|
|
||||||
buffer.data()[bytes]='\0';
|
buffer.resize(bytes + 1);
|
||||||
buffer.resize(buflen);
|
|
||||||
|
|
||||||
return buffer;
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
|
@ -153,22 +153,10 @@ static void
|
||||||
test_mu_find_02(void)
|
test_mu_find_02(void)
|
||||||
{
|
{
|
||||||
/* when matching html as if it were text,
|
/* when matching html as if it were text,
|
||||||
* 'bull' is also matched in arto.eml, &bull;
|
* 'bull' is also matched in arto.eml, &bull; however,
|
||||||
*/
|
* we don't do that anymore! */
|
||||||
// search("bull", 1);
|
|
||||||
// search("bull m:foo", 0);
|
|
||||||
// search("bull m:/foo", 1);
|
|
||||||
// search("bull m:/Foo", 1);
|
|
||||||
// search("bull flag:attach", 1);
|
|
||||||
// search("bull flag:a", 1);
|
|
||||||
|
|
||||||
search("bull", 2);
|
|
||||||
search("bull m:foo", 0);
|
|
||||||
search("bull m:/foo", 2);
|
|
||||||
search("bull m:/Foo", 2);
|
|
||||||
search("bull flag:attach", 1);
|
|
||||||
search("bull flag:a", 1);
|
|
||||||
|
|
||||||
|
search("bull", 1);
|
||||||
|
|
||||||
search("g:x", 0);
|
search("g:x", 0);
|
||||||
search("flag:encrypted", 0);
|
search("flag:encrypted", 0);
|
||||||
|
|
Loading…
Reference in New Issue