Merge branch 'wip/djcb/html-to-text'

This commit is contained in:
Dirk-Jan C. Binnema 2023-07-26 19:11:41 +03:00
commit 455119f695
10 changed files with 655 additions and 69 deletions

View File

@ -19,9 +19,14 @@
- what used to be the ~mu fields~ command has been merged into ~mu info~; i.e.,
~mu fields~ is now ~mu info fields~.
- ~mu view~ gained ~--format=html~ for it to output the HTML body of the message
(if any) rather than the plain-text body. See its updated manpage for
details.
- ~mu view~ gained ~--format=html~ which compels it to output the HTML body of
the message rather than the (default) plain-text body. See its updated
manpage for details.
- when encountering an HTML message part during indexing, previously (i.e.,
~mu 1.10~) we would attempt to process that as-is, with HTML-tags etc.; this
is now improved by employing a html->text scraper which extracts the
human-readable text from the html.
- /experimental/: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux
distros), ~mu~ will try to detect the language of the body of e-mail

View File

@ -79,17 +79,6 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va
throw std::logic_error("not a search term");
}
/* hack... import html text as if it were plain text. */
static void
add_body_html(Xapian::Document& doc, const Field& field, const std::string& val)
{
static Field body_field = field_from_id(Field::Id::BodyText);
Xapian::TermGenerator termgen;
termgen.set_document(doc);
termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term());
}
void
Document::add(Field::Id id, const std::string& val)
{
@ -100,11 +89,9 @@ Document::add(Field::Id id, const std::string& val)
if (field.is_searchable())
add_search_term(xdoc_, field, val);
else if (id == Field::Id::XBodyHtml)
add_body_html(xdoc_, field, val);
if (field.include_in_sexp()) {
if (field.include_in_sexp())
put_prop(field, val);
}
}
void

View File

@ -139,7 +139,6 @@ static void
test_prefix()
{
static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S');
static_assert(field_from_id(Field::Id::XBodyHtml).xapian_prefix() == 0);
}
[[maybe_unused]]

View File

@ -65,12 +65,8 @@ struct Field {
Tags, /**< Message Tags */
ThreadId, /**< Thread Id */
To, /**< To: recipient */
/*
* <private>
*/
XBodyHtml, /**< HTML Body */
_count_ /**< Number of FieldIds */
//
_count_ /**< Number of Ids */
};
/**
@ -458,17 +454,6 @@ static constexpr std::array<Field, Field::id_size()>
Field::Flag::IncludeInSexp |
Field::Flag::IndexableTerm,
},
/* internal */
{
Field::Id::XBodyHtml,
Field::Type::String,
"htmlbody", {},
"Message html body",
{},
{},
Field::Flag::Internal
},
}};
/*

View File

@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg)
}
static void
append_text(Option<std::string>& str, Option<std::string> app)
append_text(Option<std::string>& str, Option<std::string>&& app)
{
if (!str)
str = app;
else if (app)
if (!str && app)
str = std::move(*app);
else if (str && app)
str.value() += app.value();
}
@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part,
return;
submsg->for_each([&](auto&& parent, auto&& child_obj) {
/* XXX: we only handle one level */
/* NOTE: we only handle one level; ideally, we'd apply the whole
parsing machinery recursively, so this is a little crude. */
if (!child_obj.is_part())
return;
const auto ctype{child_obj.content_type()};
if (!ctype || !ctype->is_type("text", "*"))
if (const auto ctype{child_obj.content_type()}; !ctype)
return;
append_text(info.embedded, MimePart{child_obj}.to_string());
else if (ctype->is_type("text", "plain"))
append_text(info.embedded, MimePart{child_obj}.to_string());
else if (ctype->is_type("text", "html")) {
if (auto&& str{MimePart{child_obj}.to_string()}; str)
append_text(info.embedded, html_to_text(*str));
}
});
}
@ -662,6 +663,8 @@ fill_document(Message::Private& priv)
break;
case Field::Id::BodyText:
doc.add(field.id, priv.body_txt);
if (priv.body_html)
doc.add(field.id, html_to_text(*priv.body_html));
break;
case Field::Id::Cc:
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
@ -725,10 +728,6 @@ fill_document(Message::Private& priv)
case Field::Id::To:
doc.add(field.id, mime_msg.contacts(Contact::Type::To));
break;
/* internal fields */
case Field::Id::XBodyHtml:
doc.add(field.id, priv.body_html);
break;
/* LCOV_EXCL_START */
case Field::Id::_count_:
default:

View File

@ -535,8 +535,7 @@ MimePart::to_string() const noexcept
if (bytes < 0)
return Nothing;
buffer.data()[bytes]='\0';
buffer.resize(buflen);
buffer.resize(bytes + 1);
return buffer;
}

View File

@ -17,6 +17,7 @@
lib_mu_utils=static_library('mu-utils', [
'mu-command-handler.cc',
'mu-html-to-text.cc',
'mu-lang-detector.cc',
'mu-logger.cc',
'mu-option.cc',
@ -43,6 +44,15 @@ lib_mu_utils_dep = declare_dependency(
include_directories(['.', '..', '../thirdparty'])
)
#
# tools
#
html2text = executable('mu-html2text',
'mu-html-to-text.cc',
dependencies: [ lib_mu_utils_dep, glib_dep ],
cpp_args: ['-DBUILD_HTML_TO_TEXT'],
install: false)
#
# tests
#
@ -82,4 +92,11 @@ test('test-lang-detector',
cpp_args: ['-DBUILD_TESTS'],
dependencies: [ config_h_dep, glib_dep, lib_mu_utils_dep ]))
test('test-html-to-text',
executable('test-html-to-text', 'mu-html-to-text.cc',
install: false,
cpp_args: ['-DBUILD_TESTS'],
dependencies: [glib_dep, lib_mu_utils_dep]))
subdir('tests')

View File

@ -0,0 +1,597 @@
/*
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include "mu-utils.hh"
#include "mu-option.hh"
#include "mu-regex.hh"
#include <algorithm>
#include <array>
#include <cctype>
#include <stdexcept>
#include <string>
#include <string_view>
using namespace Mu;
/**
 * Does @p haystack start with @p needle, ignoring case?
 *
 * Case is folded per byte with tolower(); an empty needle always matches.
 *
 * @param haystack string to look in
 * @param needle prefix to look for
 *
 * @return true if haystack begins with needle (case-insensitively)
 */
static bool
starts_with(std::string_view haystack, std::string_view needle)
{
	if (needle.size() > haystack.size())
		return false;

	// the <cctype> functions have undefined behavior for negative values
	// (other than EOF); HTML routinely contains bytes >= 0x80 (UTF-8), so
	// go through unsigned char before folding.
	const auto same_folded = [](char a, char b) {
		return std::tolower(static_cast<unsigned char>(a)) ==
			std::tolower(static_cast<unsigned char>(b));
	};

	return std::equal(needle.begin(), needle.end(), haystack.begin(), same_folded);
}
/**
 * Are @p haystack and @p needle equal, ignoring case?
 *
 * Two strings of the same length are equal exactly when one is a
 * (case-insensitive) prefix of the other, so this defers to starts_with().
 *
 * @param haystack first string
 * @param needle second string
 *
 * @return true if both have the same length and the same case-folded content
 */
static bool
matches(std::string_view haystack, std::string_view needle)
{
	return needle.size() == haystack.size() && starts_with(haystack, needle);
}
/**
 * HTML parsing context: a read-only reference to the HTML being scraped, a
 * cursor (position) into it, and the buffer in which scraped text is
 * accumulated.
 *
 * The context stores a _reference_ to the HTML (no copy), so the string
 * passed to the constructor must outlive the context.
 */
class Context {
public:
	/**
	 * Construct a parsing context
	 *
	 * @param html some html to parse; it is referenced, not copied
	 */
	Context(const std::string& html): html_{html}, pos_{} {}

	/**
	 * Are we done with the html blob, i.e, has it been fully scraped?
	 *
	 * @return true or false
	 */
	bool done() const {
		return pos_ >= html_.size();
	}

	/**
	 * Get the current position
	 *
	 * @return position
	 */
	size_t position() const {
		return pos_;
	}

	/**
	 * Get the size of the HTML
	 *
	 * @return size
	 */
	size_t size() const {
		return html_.size();
	}

	/**
	 * Advance the position by _n_ characters.
	 *
	 * @param n number by which to advance.
	 *
	 * @throws std::range_error if that would move past the end of the html
	 */
	void advance(size_t n=1) {
		// pos_ never exceeds html_.size(), so the subtraction cannot
		// underflow; comparing this way also cannot wrap for a huge n.
		if (n > html_.size() - pos_)
			throw std::range_error("out of range");
		pos_ += n;
	}

	/**
	 * Are we looking at the given string?
	 *
	 * @param str string to match (case-insensitive)
	 *
	 * @return true or false; always false when done()
	 */
	bool looking_at(std::string_view str) const {
		// str must fit in what is left, _including_ the case where it
		// ends exactly at the end of the html (e.g. a final ">" or ";")
		if (pos_ >= html_.size() || str.size() > html_.size() - pos_)
			return false;

		return matches({html_.data()+pos_, str.size()}, str);
	}

	/**
	 * Grab a substring-view from the html
	 *
	 * @param fpos starting position
	 * @param len length
	 *
	 * @return string view into the html; valid as long as the html is
	 *
	 * @throws std::range_error if the requested range is not within the html
	 */
	std::string_view substr(size_t fpos, size_t len) const {
		// overflow-safe form of "fpos + len > size"
		if (fpos > html_.size() || len > html_.size() - fpos)
			throw std::range_error(mu_format("{} + {} > {}",
							 fpos, len, html_.size()));

		return { html_.data() + fpos, len };
	}

	/**
	 * Grab the string of alphabetic characters at the
	 * head (pos) of the context, and advance over it.
	 *
	 * @return the head-word or empty
	 */
	std::string_view eat_head_word() {
		const size_t start_pos{pos_};
		// cast: isalpha() is undefined for negative (non-ASCII) char values
		while (!done() &&
		       std::isalpha(static_cast<unsigned char>(html_[pos_])))
			++pos_;

		return {html_.data() + start_pos, pos_ - start_pos};
	}

	/**
	 * Get the scraped data; only available when done()
	 * @return scraped data, with whitespace cleaned up
	 */
	std::string scraped() const {
		return cleanup(raw_scraped_);
	}

	/**
	 * Get the raw scrape buffer, where we can append
	 * scraped data.
	 *
	 * @return the buffer
	 */
	std::string& raw_scraped() {
		return raw_scraped_;
	}

	/**
	 * Get a reference to the HTML
	 *
	 * @return html
	 */
	const std::string& html() const { return html_; }

private:
	/**
	 * Cleanup some raw scraped html: remove superfluous
	 * whitespace, avoid too long lines.
	 *
	 * Every run of whitespace is replaced by a single space, or by a
	 * newline once more than 80 non-whitespace characters were written
	 * since the last newline. Leading and trailing whitespace is dropped.
	 *
	 * @param unclean raw scraped text
	 *
	 * @return cleaned up string.
	 */
	std::string cleanup(const std::string& unclean) const {
		// reduce whitespace and avoid too long lines;
		// makes it easier to debug.
		bool was_wspace{};
		size_t col{}; // counts non-whitespace characters only
		std::string clean;
		clean.reserve(unclean.size()/2);

		for (const char c: unclean) {
			// '\r' is included so CRLF line endings do not
			// leak carriage returns into the output
			if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
				was_wspace = true;
				continue;
			}
			++col;
			if (was_wspace) {
				if (col > 80) {
					clean += '\n';
					col = 0;
				} else if (!clean.empty())
					clean += ' ';
				was_wspace = false;
			}
			clean += c;
		}
		return clean;
	}

	const std::string&	html_; // no copy!
	size_t			pos_{};
	std::string		raw_scraped_;
};
/**
 * Format a Context for debug output as "<position:size: 'preview'>", where
 * the preview is at most the next 8 characters at the current position.
 *
 * @param ctx a parsing context
 *
 * @return the formatted description
 */
G_GNUC_UNUSED static auto
format_as(const Context& ctx)
{
	const auto pos = ctx.position();
	const auto total = ctx.size();
	const auto preview_len = std::min(static_cast<size_t>(8), total - pos);

	return mu_format("<{}:{}: '{}'>", pos, total, ctx.substr(pos, preview_len));
}
/**
 * Advance the context until it is looking at @p quote, or until the end of
 * the html. The quote itself is _not_ consumed.
 *
 * @param ctx parsing context
 * @param quote the (closing) quote to look for
 */
static void
skip_quoted(Context& ctx, std::string_view quote)
{
	for (; !ctx.done(); ctx.advance())
		if (ctx.looking_at(quote)) // closing quote
			return;
}
/**
 * Attempt to skip over a <script> / <style> block; none of its contents
 * are scraped.
 *
 * The context must be positioned just after the tag name; scanning stops
 * just after the matching end tag, or at the end of the html if there is
 * none. String literals and comments are tracked, crudely, so that an end
 * tag inside a string or a quote character inside a comment does not
 * confuse us.
 *
 * @param ctx parsing context
 * @param tag the tag name ("script" or "style", in any case)
 */
static void
skip_script_style(Context& ctx, std::string_view tag)
{
	bool quoted{}, squoted{};	// inside "..." or '...'
	bool inl_comment{};		// inside /* ... */
	bool endl_comment{};		// inside // ... <end-of-line>

	const auto end_tag_str = mu_format("</{}>", tag);
	const std::string_view end_tag{end_tag_str};

	while (!ctx.done()) {

		if (inl_comment) {
			if (ctx.looking_at("*/")) {
				inl_comment = false;
				ctx.advance(2);
			} else
				ctx.advance();
			continue;
		}

		if (endl_comment) {
			// a one-line block such as "<script>foo(); // bar</script>"
			// has no newline to terminate the comment, so the
			// end tag must still be honored here.
			if (ctx.looking_at(end_tag)) {
				ctx.advance(end_tag.size());
				return;
			}
			// the comment lasts up to and including the newline
			endl_comment = !ctx.looking_at("\n");
			ctx.advance();
			continue;
		}

		if (ctx.looking_at("\\")) {
			// a backslash escapes exactly the next character;
			// skip both so an escaped quote does not toggle state
			ctx.advance();
			if (!ctx.done())
				ctx.advance();
			continue;
		}

		if (!squoted && ctx.looking_at("\"")) {
			quoted = !quoted;
			ctx.advance();
			continue;
		}

		if (!quoted && ctx.looking_at("'")) {
			squoted = !squoted;
			ctx.advance();
			continue;
		}

		// comment openers and the end tag only count outside string
		// literals (think "http://example.com" or "*/*")
		if (!quoted && !squoted) {
			if (ctx.looking_at("/*")) {
				inl_comment = true;
				ctx.advance(2);
				continue;
			}
			if (ctx.looking_at("//")) {
				endl_comment = true;
				ctx.advance(2);
				continue;
			}
			if (ctx.looking_at(end_tag)) {
				ctx.advance(end_tag.size());
				return; /* we're done, finally! */
			}
		}

		ctx.advance();
	}
}
/**
 * Skip a comment block; its contents are ignored completely.
 *
 * The context must be positioned immediately after the opening "<!--".
 * When the closing "-->" is found it is consumed and a single space is
 * appended to the scrape buffer; otherwise the rest of the html is consumed.
 *
 * @param ctx parsing context
 */
static void
comment(Context& ctx)
{
	constexpr std::string_view terminator{"-->"};

	for (; !ctx.done(); ctx.advance()) {
		if (!ctx.looking_at(terminator))
			continue;

		ctx.advance(terminator.size());
		ctx.raw_scraped() += ' ';
		return;
	}
}
/**
 * Do we need a SPC separator for this tag?
 *
 * @param tagname name of the tag (matched case-insensitively)
 *
 * @return false for the listed text-styling tags (b, em, i, ...), true for
 * any other tag name
 */
static bool
needs_separator(std::string_view tagname)
{
	constexpr std::array styling_tags = {
		"b", "em", "i", "s", "strike", "tt", "u"
	};

	auto is_this_tag = [&](auto&& candidate) {
		return matches(tagname, candidate);
	};
	const bool is_styling_tag = seq_some(styling_tags, is_this_tag);

	return !is_styling_tag;
}
/**
 * Do we need to skip the element completely?
 *
 * @param tagname name of the tag (matched case-insensitively)
 *
 * @return true if tagname is one of script, style, head or meta
 */
static bool
is_skip_element(std::string_view tagname)
{
	constexpr std::array skipped = {
		"script", "style", "head", "meta"
	};

	auto is_this_tag = [&](auto&& candidate) {
		return matches(tagname, candidate);
	};

	return seq_some(skipped, is_this_tag);
}
/**
 * Skip the remainder of an end-tag: consume everything up to and
 * including the next ">", or up to the end of the html if there is none.
 *
 * @param ctx parsing context
 */
static void
end_tag(Context& ctx)
{
	while (!ctx.done()) {
		const bool closing = ctx.looking_at(">");
		ctx.advance();
		if (closing)
			return;
	}
}
// Skip the whole element.
//
// NOTE: this is currently a placeholder: the body is empty, so neither
// ctx nor tagname is used and the context is left untouched.
static void
skip_element(Context& ctx, std::string_view tagname)
{
	// do something special?
}
/**
 * Handle the start of a tag, i.e., the context is positioned just after
 * the '<'.
 *
 * Comments, end-tags and <script>/<style> blocks are handed off to their
 * own handlers. For any other tag, everything up to and including the
 * closing '>' is skipped (quoted attribute values may contain '>'), and a
 * space is appended to the scrape buffer if the tag needs a separator.
 *
 * @param ctx parsing context
 */
static void
tag(Context& ctx)
{
	// some elements we want to skip completely,
	// for others just the tags.
	constexpr std::string_view comment_start {"!--"};

	if (ctx.looking_at(comment_start)) {
		ctx.advance(comment_start.size());
		comment(ctx);
		return;
	}

	if (ctx.looking_at("/")) {
		ctx.advance();
		end_tag(ctx);
		return;
	}

	const auto tagname = ctx.eat_head_word();
	// tag names are case-insensitive: <SCRIPT> must be skipped too
	if (matches(tagname, "script") || matches(tagname, "style")) {
		skip_script_style(ctx, tagname);
		return;
	} else if (is_skip_element(tagname))
		skip_element(ctx, tagname);

	const auto needs_sepa = needs_separator(tagname);
	while (!ctx.done()) {

		const bool dquote = ctx.looking_at("\"");
		if (dquote || ctx.looking_at("'")) {
			// step over the opening quote first; otherwise
			// skip_quoted() would take it for the closing one
			ctx.advance();
			skip_quoted(ctx, dquote ? "\"" : "'");
			if (!ctx.done())
				ctx.advance(); // the closing quote
			continue;
		}

		if (ctx.looking_at(">")) {
			ctx.advance();
			if (needs_sepa)
				ctx.raw_scraped() += ' ';
			return;
		}

		ctx.advance();
	}
}
/**
 * Handle a character reference ("&...;"); the context is positioned just
 * after the '&'.
 *
 * We only care about a few accented chars, and add them unaccented and
 * lowercase, since that's what we do for indexing anyway; e.g. "&Ocirc;"
 * becomes 'o'. Any other reference becomes a single space.
 *
 * If what follows the '&' is not a terminated reference (e.g. the bare
 * ampersand in "AT&T rocks"), the '&' and the characters read so far are
 * scraped as plain text, so no content is lost.
 *
 * @param ctx parsing context
 */
static void
html_escape_char(Context& ctx)
{
	// entity names minus their first letter, e.g. "acute" for "&eacute;"
	static constexpr std::array escs = {
		"acute",
		"breve",
		"caron",
		"circ",
		"cute", /* lenient: also accept the misspelt "&Xcute;" */
		"grave",
		"horn"/*thorn*/,
		"macr",
		"slash",
		"strok",
		"tilde",
		"uml",
	};

	const auto unescape = [](std::string_view esc)->char {
		if (esc.empty())
			return ' ';
		// cast: tolower() is undefined for negative char values
		const auto first{static_cast<char>(
				std::tolower(static_cast<unsigned char>(esc.front())))};
		const auto rest = esc.substr(1);
		if (seq_some(escs, [&](auto&& e){return starts_with(rest, e);}))
			return first;
		return ' ';
	};

	const size_t start_pos{ctx.position()};
	while (!ctx.done()) {
		const auto c = static_cast<unsigned char>(ctx.html()[ctx.position()]);
		if (c == ';') {
			const auto esc = ctx.substr(start_pos, ctx.position() - start_pos);
			ctx.raw_scraped() += unescape(esc);
			ctx.advance();
			return;
		}
		// references consist of letters, digits and '#'; anything
		// else means this was not a reference after all
		if (!std::isalnum(c) && c != '#')
			break;
		ctx.advance();
	}

	// no terminating ';': keep the ampersand and what followed it as text
	ctx.raw_scraped() += '&';
	ctx.raw_scraped() += ctx.substr(start_pos, ctx.position() - start_pos);
}
/**
 * Scrape a block of text, starting at the current position and running
 * to the end of the html.
 *
 * Plain text is copied to the scrape buffer; a '&' hands over to
 * html_escape_char() and a '<' hands over to tag(), after which copying
 * resumes from wherever those left the context.
 *
 * @param ctx parsing context
 */
static void
text(Context& ctx)
{
	size_t chunk_start{ctx.position()};

	while (!ctx.done()) {
		const bool at_entity = ctx.looking_at("&");
		const bool at_tag = !at_entity && ctx.looking_at("<");

		if (!at_entity && !at_tag) {
			ctx.advance();
			continue;
		}

		// flush the plain text gathered so far, then step over
		// the '&' or '<' and let the specific handler take over
		ctx.raw_scraped() += ctx.substr(chunk_start,
						ctx.position() - chunk_start);
		ctx.advance();
		if (at_entity)
			html_escape_char(ctx);
		else
			tag(ctx);

		chunk_start = ctx.position();
	}

	ctx.raw_scraped() += ctx.substr(chunk_start, ctx.size() - chunk_start);
}
/**
 * Crudely convert HTML to plain text, by scraping the human-readable text
 * from it.
 *
 * All state lives in a local parsing context; nothing is shared between
 * calls (the former file-global debugging pointer to the context was never
 * read, and was written without synchronization on every call).
 *
 * @param html some html; only referenced for the duration of the call
 *
 * @return the scraped text, with whitespace cleaned up
 */
std::string
Mu::html_to_text(const std::string& html)
{
	Context ctx{html};

	text(ctx);

	return ctx.scraped();
}
#ifdef BUILD_TESTS
#include "mu-test-utils.hh"
/* comments, styling tags and character references: {html, expected text} */
static void
test_1()
{
	const std::vector<std::pair<std::string, std::string>> cases = {
		{ "<!-- Hello -->A", "A" },
		{ "A<!-- Test -->B", "A B" },
		{ "A<i>a</i><b>p</b>", "Aap"},
		{ "N&ocute;&Ocirc;t", "Noot"},
		{
			"foo<!-- bar --><i>c</i>uu<bla>x</bla>"
			"<!--hello -->world<!--",
			"foo cuu x world"
		}
	};

	for (const auto& [html, expected]: cases)
		assert_equal(html_to_text(html), expected);
}
/* a tag with a quoted attribute value: {html, expected text} */
static void
test_2()
{
	const std::vector<std::pair<std::string, std::string>> cases = {
		{ R"(<i>hello, <b bar="/b">world!</b>)",
		  "hello, world!"},
	};

	for (const auto& [html, expected]: cases)
		assert_equal(html_to_text(html), expected);
}
/* a <script> block must not end up in the text: {html, expected text} */
static void
test_3()
{
	const std::vector<std::pair<std::string, std::string>> cases = {
		{R"(<i>hello, </i><script language="javascript">
function foo() {
alert("Stroopwafel!"); // test
}
</script>world!)",
		 "hello, world!"},
	};

	for (const auto& [html, expected]: cases)
		assert_equal(html_to_text(html), expected);
}
/* test runner: register the html-to-text tests, in order, and run them */
int
main(int argc, char* argv[])
{
	mu_test_init(&argc, &argv);

	const struct {
		const char*	path;
		void		(*func)();
	} suite[] = {
		{"/html-to-text/test-1", test_1},
		{"/html-to-text/test-2", test_2},
		{"/html-to-text/test-3", test_3},
	};

	for (const auto& entry: suite)
		g_test_add_func(entry.path, entry.func);

	return g_test_run();
}
#endif /*BUILD_TESTS*/
#ifdef BUILD_HTML_TO_TEXT
#include "mu-utils-file.hh"
// simple tool that reads html on stdin and outputs text on stdout
// e.g. curl --silent https://www.example.com | build/lib/utils/mu-html2text
int
main (int argc, char *argv[])
{
auto res = read_from_stdin();
if (!res) {
mu_printerrln("error reading from stdin: {}", res.error().what());
return 1;
}
mu_println("{}", html_to_text(*res));
return 0;
}
#endif /*BUILD_HTML_TO_TEXT*/

View File

@ -265,6 +265,16 @@ std::string date_to_time_t_string(int64_t t);
*/
std::string time_to_string(const char *frm, time_t t, bool utc = false) G_GNUC_CONST;
/**
* Crudely convert HTML to plain text. This attempts to scrape the
* human-readable text from html-email so we can use it for indexing.
*
* @param html html
*
* @return plain text
*/
std::string html_to_text(const std::string& html);
/**
* Hack to avoid locale crashes
*

View File

@ -153,22 +153,10 @@ static void
test_mu_find_02(void)
{
/* when matching html as if it were text,
* 'bull' is also matched in arto.eml, &bull;
*/
// search("bull", 1);
// search("bull m:foo", 0);
// search("bull m:/foo", 1);
// search("bull m:/Foo", 1);
// search("bull flag:attach", 1);
// search("bull flag:a", 1);
search("bull", 2);
search("bull m:foo", 0);
search("bull m:/foo", 2);
search("bull m:/Foo", 2);
search("bull flag:attach", 1);
search("bull flag:a", 1);
* 'bull' is also matched in arto.eml, &bull; however,
* we don't do that anymore! */
search("bull", 1);
search("g:x", 0);
search("flag:encrypted", 0);