Merge branch 'wip/djcb/html-to-text'

2023-07-26 19:11:41 +03:00 · 2023-07-26 19:11:41 +03:00 · 455119f695
parent 21a760d2c7 b795242d5a
commit 455119f695
10 changed files with 655 additions and 69 deletions
--- a/NEWS.org
+++ b/NEWS.org
@ -19,9 +19,14 @@
    - what used to be the ~mu fields~ command has been merged into ~mu info~; i.e.,
 ~mu fields~ is now ~mu info fields~.

-    - ~mu view~ gained ~--format=html~ for it to output the HTML body of the message
-      (if any) rather than the plain-text body. See its updated manpage for
-      details.
+    - ~mu view~ gained ~--format=html~ which compels it to output the HTML body of
+      the message rather than the (default) plain-text body. See its updated
+      manpage for details.
+
+    - when encountering an HTML message part during indexing, previously (i.e.,
+ ~mu 1.10~) we would attempt to process that as-is, with HTML-tags etc.; this
+      is now improved by employing an html->text scraper which extracts the
+      human-readable text from the html.

    - /experimental/: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux
      distros), ~mu~ will try to detect the language of the body of e-mail
--- a/lib/message/mu-document.cc
+++ b/lib/message/mu-document.cc
@ -79,17 +79,6 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va
 		throw std::logic_error("not a search term");
 }

-/* hack... import html text as if it were plain text. */
-static void
-add_body_html(Xapian::Document& doc, const Field& field, const std::string& val)
-{
-	static Field body_field = field_from_id(Field::Id::BodyText);
-
-	Xapian::TermGenerator termgen;
-	termgen.set_document(doc);
-	termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term());
-}
-
 void
 Document::add(Field::Id id, const std::string& val)
 {
@ -100,11 +89,9 @@ Document::add(Field::Id id, const std::string& val)

 	if (field.is_searchable())
 		add_search_term(xdoc_, field, val);
-	else if (id == Field::Id::XBodyHtml)
-		add_body_html(xdoc_, field, val);
-	if (field.include_in_sexp()) {
+
+	if (field.include_in_sexp())
 		put_prop(field, val);
-	}
 }

 void
--- a/lib/message/mu-fields.cc
+++ b/lib/message/mu-fields.cc
@ -139,7 +139,6 @@ static void
 test_prefix()
 {
 	static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S');
-	static_assert(field_from_id(Field::Id::XBodyHtml).xapian_prefix() == 0);
 }

 [[maybe_unused]]
--- a/lib/message/mu-fields.hh
+++ b/lib/message/mu-fields.hh
@ -65,12 +65,8 @@ struct Field {
 		Tags,		/**< Message Tags */
 		ThreadId,	/**< Thread Id */
 		To,		/**< To: recipient */
-		/*
-		 * <private>
-		 */
-		XBodyHtml,	/**< HTML Body */
-
-		_count_ /**< Number of FieldIds */
+		//
+		_count_         /**< Number of Ids */
 	};

 	/**
@ -458,17 +454,6 @@ static constexpr std::array<Field, Field::id_size()>
 		Field::Flag::IncludeInSexp |
 		Field::Flag::IndexableTerm,
 	    },
-
-	    /* internal */
-	    {
-		Field::Id::XBodyHtml,
-		Field::Type::String,
-		"htmlbody", {},
-		"Message html body",
-		{},
-		{},
-		Field::Flag::Internal
-	    },
 	}};

 /*
--- a/lib/message/mu-message.cc
+++ b/lib/message/mu-message.cc
@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg)
 }

 static void
-append_text(Option<std::string>& str, Option<std::string> app)
+append_text(Option<std::string>& str, Option<std::string>&& app)
 {
-	if (!str)
-		str = app;
-	else if (app)
+	if (!str && app)
+		str = std::move(*app);
+	else if (str && app)
 		str.value() += app.value();
 }

@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part,
 		return;

 	submsg->for_each([&](auto&& parent, auto&& child_obj) {
-
-		/* XXX: we only handle one level */
-
+		/* NOTE: we only handle one level; ideally, we'd apply the whole
+		   parsing machinery recursively; so this is a little crude. */
 		if (!child_obj.is_part())
 			return;
-
-		const auto ctype{child_obj.content_type()};
-		if (!ctype || !ctype->is_type("text", "*"))
+		if (const auto ctype{child_obj.content_type()}; !ctype)
 			return;
-
-		append_text(info.embedded, MimePart{child_obj}.to_string());
+		else if (ctype->is_type("text", "plain"))
+			append_text(info.embedded, MimePart{child_obj}.to_string());
+		else if (ctype->is_type("text", "html")) {
+			if (auto&& str{MimePart{child_obj}.to_string()}; str)
+				append_text(info.embedded, html_to_text(*str));
+		}
 	});
 }

@ -662,6 +663,8 @@ fill_document(Message::Private& priv)
 			break;
 		case Field::Id::BodyText:
 			doc.add(field.id, priv.body_txt);
+			if (priv.body_html)
+				doc.add(field.id, html_to_text(*priv.body_html));
 			break;
 		case Field::Id::Cc:
 			doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
@ -725,10 +728,6 @@ fill_document(Message::Private& priv)
 		case Field::Id::To:
 			doc.add(field.id, mime_msg.contacts(Contact::Type::To));
 			break;
-			/* internal fields */
-		case Field::Id::XBodyHtml:
-			doc.add(field.id, priv.body_html);
-			break;
 		/* LCOV_EXCL_START */
 		case Field::Id::_count_:
 		default:
--- a/lib/message/mu-mime-object.cc
+++ b/lib/message/mu-mime-object.cc
@ -535,8 +535,7 @@ MimePart::to_string() const noexcept
 	if (bytes < 0)
 		return Nothing;

-	buffer.data()[bytes]='\0';
-	buffer.resize(buflen);
+	buffer.resize(bytes + 1);

 	return buffer;
 }
--- a/lib/utils/meson.build
+++ b/lib/utils/meson.build
@ -17,6 +17,7 @@

 lib_mu_utils=static_library('mu-utils', [
  'mu-command-handler.cc',
+  'mu-html-to-text.cc',
  'mu-lang-detector.cc',
  'mu-logger.cc',
  'mu-option.cc',
@ -43,6 +44,15 @@ lib_mu_utils_dep = declare_dependency(
    include_directories(['.', '..', '../thirdparty'])
 )

+#
+# tools
+#
+html2text = executable('mu-html2text',
+                       'mu-html-to-text.cc',
+  dependencies: [ lib_mu_utils_dep, glib_dep ],
+  cpp_args: ['-DBUILD_HTML_TO_TEXT'],
+  install: false)
+
 #
 # tests
 #
@ -82,4 +92,11 @@ test('test-lang-detector',
                cpp_args: ['-DBUILD_TESTS'],
                dependencies: [ config_h_dep, glib_dep, lib_mu_utils_dep ]))

+test('test-html-to-text',
+     executable('test-html-to-text', 'mu-html-to-text.cc',
+                install: false,
+                cpp_args: ['-DBUILD_TESTS'],
+                dependencies: [glib_dep, lib_mu_utils_dep]))
+
+
 subdir('tests')
--- a/lib/utils/mu-html-to-text.cc
+++ b/lib/utils/mu-html-to-text.cc
@ -0,0 +1,597 @@
+/*
+** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+**
+** This program is free software; you can redistribute it and/or modify it
+** under the terms of the GNU General Public License as published by the
+** Free Software Foundation; either version 3, or (at your option) any
+** later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software Foundation,
+** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+**
+*/
+
+#include "mu-utils.hh"
+#include "mu-option.hh"
+#include "mu-regex.hh"
+
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+
+using namespace Mu;
+
+
+/**
+ * Does a string start with some prefix, ignoring (ASCII) case?
+ *
+ * @param haystack the string to inspect
+ * @param needle the prefix to look for
+ *
+ * @return true if haystack begins with needle (an empty needle always
+ * matches), false otherwise.
+ */
+static bool
+starts_with(std::string_view haystack, std::string_view needle)
+{
+	if (needle.size() > haystack.size())
+		return false;
+
+	// the <cctype> functions have undefined behavior for negative
+	// values, and HTML routinely contains non-ASCII (e.g. UTF-8) bytes;
+	// so always go through unsigned char.
+	const auto lower = [](char c) {
+		return ::tolower(static_cast<unsigned char>(c));
+	};
+
+	for (size_t i = 0; i != needle.size(); ++i)
+		if (lower(haystack[i]) != lower(needle[i]))
+			return false;
+
+	return true;
+}
+
+/**
+ * Are two strings equal, ignoring case?
+ *
+ * @param haystack some string
+ * @param needle the string to compare it with
+ *
+ * @return true if both have the same length and starts_with() holds.
+ */
+static bool
+matches(std::string_view haystack, std::string_view needle)
+{
+	return haystack.size() == needle.size() &&
+		starts_with(haystack, needle);
+}
+
+
+
+/**
+ * HTML parsing context: a cursor over some HTML (which is referenced,
+ * not copied) plus the buffer in which the scraped text is collected.
+ */
+class Context {
+public:
+	/**
+	 * Construct a parsing context
+	 *
+	 * @param html some html to parse; only a reference is stored, so
+	 * the string must outlive the context.
+	 */
+	explicit Context(const std::string& html): html_{html}, pos_{} {}
+
+	/**
+	 * Are we done with the html blob, i.e, has it been fully scraped?
+	 *
+	 * @return true or false
+	 */
+	bool done() const {
+		return pos_ >= html_.size();
+	}
+
+	/**
+	 * Get the current position
+	 *
+	 * @return position
+	 */
+	size_t position() const {
+		return pos_;
+	}
+
+	/**
+	 * Get the size of the HTML
+	 *
+	 * @return size
+	 */
+	size_t size() const {
+		return html_.size();
+	}
+
+	/**
+	 * Advance the position by _n_ characters.
+	 *
+	 * @param n number by which to advance.
+	 *
+	 * @throws std::range_error when this would move beyond the end
+	 * of the html.
+	 */
+	void advance(size_t n=1) {
+		// pos_ never exceeds the size, so the subtraction cannot
+		// wrap around (unlike pos_ + n for huge n)
+		if (n > html_.size() - pos_)
+			throw std::range_error("out of range");
+		pos_ += n;
+	}
+
+	/**
+	 * Are we looking at the given string?
+	 *
+	 * @param str string to match (case-insensitive)
+	 *
+	 * @return true or false; always false when done().
+	 */
+	bool looking_at(std::string_view str) const {
+		// str may end exactly at the end of the html, e.g. the
+		// final '>' or ';' of a document must still be recognized.
+		if (pos_ >= html_.size() || str.size() > html_.size() - pos_)
+			return false;
+		else
+			return matches({html_.data()+pos_, str.size()}, str);
+	}
+
+	/**
+	 * Grab a substring-view from the html
+	 *
+	 * @param fpos starting position
+	 * @param len length
+	 *
+	 * @return string view
+	 *
+	 * @throws std::range_error if the requested range does not lie
+	 * within the html.
+	 */
+	std::string_view substr(size_t fpos, size_t len) const {
+		if (fpos > html_.size() || len > html_.size() - fpos)
+			throw std::range_error(mu_format("{} + {} > {}",
+							 fpos, len, html_.size()));
+		else
+			return { html_.data() + fpos, len };
+	}
+
+	/**
+	 * Grab the string of alphabetic characters at the
+	 * head (pos) of the context, and advance over it.
+	 *
+	 * @return the head-word or empty
+	 */
+	std::string_view eat_head_word() {
+		const size_t start_pos{pos_};
+		// cast: isalpha is undefined for negative (non-ASCII) chars
+		while (!done() &&
+		       ::isalpha(static_cast<unsigned char>(html_[pos_])))
+			++pos_;
+
+		return {html_.data() + start_pos, pos_ - start_pos};
+	}
+
+
+	/**
+	 * Get the scraped data; only available when done()
+	 *
+	 * @return scraped data, with whitespace cleaned up
+	 */
+	std::string scraped() {
+		return cleanup(raw_scraped_);
+	}
+
+	/**
+	 * Get the raw scrape buffer, where we can append
+	 * scraped data.
+	 *
+	 * @return the buffer
+	 */
+	std::string& raw_scraped() {
+		return raw_scraped_;
+	}
+
+
+	/**
+	 * Get a reference to the HTML
+	 *
+	 * @return  html
+	 */
+	const std::string& html() const { return html_; }
+
+private:
+
+	/**
+	 * Cleanup some raw scraped html: remove superfluous
+	 * whitespace, avoid too long lines.
+	 *
+	 * Each run of spaces, tabs and newlines is replaced by a single
+	 * space, or by a newline once more than 80 non-whitespace
+	 * characters were seen since the last line break; leading and
+	 * trailing whitespace is dropped.
+	 *
+	 * @param unclean the raw scraped text
+	 *
+	 * @return cleaned up string.
+	 */
+	std::string cleanup(const std::string& unclean) const {
+		// reduce whitespace and avoid too long lines;
+		// makes it easier to debug.
+		bool was_wspace{};
+		size_t col{};
+		std::string clean;
+		clean.reserve(unclean.size()/2);
+		for (const char c: unclean) {
+			const auto wspace = c == ' ' || c == '\t' || c == '\n';
+			if (wspace) {
+				was_wspace = true;
+				continue;
+			}
+			++col;
+			if (was_wspace) {
+				if (col > 80) {
+					clean += '\n';
+					col = 0;
+				} else if (!clean.empty())
+					clean += ' ';
+				was_wspace = false;
+			}
+			clean += c;
+		}
+		return clean;
+	}
+
+
+	const std::string&	html_; // no copy!
+	size_t			pos_{};
+	std::string		raw_scraped_;
+};
+
+
+// Describe a context as "<position:size: 'preview'>", where the preview
+// consists of (at most) the next 8 characters of the html.
+G_GNUC_UNUSED static auto
+format_as(const Context& ctx)
+{
+	const auto pos{ctx.position()};
+	const auto preview_len{std::min(static_cast<size_t>(8),
+					ctx.size() - pos)};
+
+	return mu_format("<{}:{}: '{}'>",
+			 pos, ctx.size(), ctx.substr(pos, preview_len));
+}
+
+
+// Move forward until the context is looking at the given quote (or the
+// html is exhausted). The position is left _at_ the quote; if the context
+// is already looking at the quote, nothing happens.
+static void
+skip_quoted(Context& ctx, std::string_view quote)
+{
+	for (; !ctx.done(); ctx.advance())
+		if (ctx.looking_at(quote)) // closing quote
+			return;
+}
+
+
+// Attempt to skip over <script> / <style> blocks, whose contents must not
+// end up in the scraped text.
+//
+// The context is expected to be just after the tag name (so possibly still
+// inside the opening tag); on return it is just after the matching end-tag,
+// or at the end of the html if there is none.
+//
+// To avoid stopping at an end-tag that merely appears in a string literal or
+// in a comment, we (crudely) keep track of quotes, backslash escapes and
+// comments.
+//
+// @param ctx the parsing context
+// @param tag the tag name ("script" or "style", in any case)
+static void
+skip_script_style(Context& ctx, std::string_view tag)
+{
+	bool escaped{};		  // previous char was an escaping backslash
+	bool quoted{}, squoted{}; // inside "..." or '...'
+	bool inl_comment{};	  // inside /* ... */
+	bool endl_comment{};	  // inside // ... <newline>
+
+	// '//' line comments exist in scripts, but not in style sheets,
+	// where '//' is typically part of an (unquoted) URL.
+	const auto is_script = matches(tag, "script");
+
+	auto end_tag_str = mu_format("</{}>", tag);
+	auto end_tag = std::string_view(end_tag_str.data());
+
+	while (!ctx.done()) {
+
+		if (inl_comment) {
+			if (ctx.looking_at("*/")) {
+				inl_comment = false;
+				ctx.advance(2);
+			} else
+				ctx.advance();
+			continue;
+		}
+
+		if (endl_comment) {
+			// the comment lasts up to and including the newline
+			endl_comment = !ctx.looking_at("\n");
+			ctx.advance();
+			continue;
+		}
+
+		if (escaped) {
+			// whatever follows the backslash has no special
+			// meaning
+			escaped = false;
+			ctx.advance();
+			continue;
+		}
+
+		if (ctx.looking_at("\\")) {
+			escaped = true;
+			ctx.advance();
+			continue;
+		}
+
+		// a double quote toggles, unless it's inside '...'
+		if (ctx.looking_at("\"") && !squoted)  {
+			quoted = !quoted;
+			ctx.advance();
+			continue;
+		}
+
+		// a single quote toggles, unless it's inside "..."
+		if (ctx.looking_at("'") && !quoted) {
+			squoted = !squoted;
+			ctx.advance();
+			continue;
+		}
+
+		// comment starters and the end-tag only count outside strings
+		if (!quoted && !squoted) {
+
+			if (ctx.looking_at("/*")) {
+				inl_comment = true;
+				ctx.advance(2);
+				continue;
+			}
+
+			if (is_script && ctx.looking_at("//")) {
+				endl_comment = true;
+				ctx.advance(2);
+				continue;
+			}
+
+			if (ctx.looking_at(end_tag)) {
+				ctx.advance(end_tag.size());
+				break; /* we're done, finally! */
+			}
+		}
+
+		ctx.advance();
+	}
+}
+
+// A comment block, which is ignored completely; the position is expected
+// to be immediately after the '<!--'. When the terminating '-->' is found,
+// it is consumed and a single space is added to the scraped text;
+// otherwise, the rest of the html is consumed.
+static void
+comment(Context& ctx)
+{
+	constexpr std::string_view terminator{"-->"};
+
+	for (; !ctx.done(); ctx.advance()) {
+		if (!ctx.looking_at(terminator))
+			continue;
+
+		ctx.advance(terminator.size());
+		ctx.raw_scraped() += ' ';
+		return;
+	}
+}
+
+// Do we need a SPC separator in the scraped text for this tag? We do,
+// unless the tag (compared case-insensitively) is one of a few
+// text-markup tags.
+static bool
+needs_separator(std::string_view tagname)
+{
+	constexpr std::array nosep_tags = {
+		"b", "em", "i", "s", "strike", "tt", "u"
+	};
+
+	const auto is_same_tag = [&](auto&& candidate) {
+		return matches(tagname, candidate);
+	};
+	const bool is_nosep{seq_some(nosep_tags, is_same_tag)};
+
+	return !is_nosep;
+}
+
+// Is this (case-insensitively compared) tag one of the elements that
+// should be skipped completely?
+static bool
+is_skip_element(std::string_view tagname)
+{
+	constexpr std::array skip_tags = {
+		"script", "style", "head", "meta"
+	};
+
+	const auto is_same_tag = [&](auto&& candidate) {
+		return matches(tagname, candidate);
+	};
+
+	return seq_some(skip_tags, is_same_tag);
+}
+
+// Skip the remainder of an end-tag: consume everything up to and
+// including the next '>' (or up to the end of the html).
+static void
+end_tag(Context& ctx)
+{
+	while (!ctx.done()) {
+		const bool at_close{ctx.looking_at(">")};
+		ctx.advance();
+		if (at_close)
+			return;
+	}
+}
+
+// skip the whole element
+//
+// NOTE(review): this is currently a placeholder; it has an empty body, so
+// neither the context nor the tag name is used and nothing is skipped.
+static void
+skip_element(Context& ctx, std::string_view tagname)
+{
+	// do something special?
+}
+
+
+// The start of a tag, i.e., pos will be just after the '<'.
+//
+// Comments and <script>/<style> elements are skipped including their
+// contents; for all other elements only the tag itself is skipped, i.e.
+// everything up to and including the closing '>'. Unless the tag is a
+// text-markup tag (see needs_separator()), a space is added to the scraped
+// text once its closing '>' is found.
+static void
+tag(Context& ctx)
+{
+	// some elements we want to skip completely,
+	// for others just the tags.
+	constexpr std::string_view comment_start {"!--"};
+	if (ctx.looking_at(comment_start)) {
+		ctx.advance(comment_start.size());
+		comment(ctx);
+		return;
+	}
+
+	if (ctx.looking_at("/")) {
+		ctx.advance();
+		end_tag(ctx);
+		return;
+	}
+
+	const auto tagname = ctx.eat_head_word();
+	// tag names are case-insensitive, so e.g. <SCRIPT> counts too
+	if (matches(tagname, "script") || matches(tagname, "style")) {
+		skip_script_style(ctx, tagname);
+		return;
+	}
+
+	if (is_skip_element(tagname))
+		skip_element(ctx, tagname);
+
+	const auto needs_sepa = needs_separator(tagname);
+	while (!ctx.done()) {
+
+		// skip quoted attribute values, so that a '>' in there is
+		// not mistaken for the end of the tag.
+		const std::string_view quote{
+			ctx.looking_at("\"") ? "\"" :
+			ctx.looking_at("'")  ? "'"  : ""};
+		if (!quote.empty()) {
+			// step over the opening quote first; skip_quoted
+			// returns immediately when already at a quote
+			ctx.advance();
+			skip_quoted(ctx, quote);
+			if (!ctx.done())
+				ctx.advance(); // the closing quote
+			continue;
+		}
+
+		if (ctx.looking_at(">")) {
+			ctx.advance();
+			if (needs_sepa)
+				ctx.raw_scraped() += ' ';
+			return;
+		}
+		ctx.advance();
+	}
+}
+
+
+// Handle a character reference ("&...;"); the position is expected to be
+// just after the '&'.
+//
+// For a well-formed reference, a single character is added to the scraped
+// text: the lowercased base letter for a few accented characters (e.g.
+// "&Eacute;" yields 'e'), and a space for everything else. If this doesn't
+// look like a reference at all (e.g., the bare ampersand in "AT&T"), the
+// '&' and the text after it are kept as-is rather than dropped.
+static void
+html_escape_char(Context& ctx)
+{
+	// we only care about a few accented chars, and add them unaccented,
+	// lowercase, since that's what we do for indexing anyway.
+	constexpr std::array escs = {
+		"acute",
+		"breve",
+		"caron",
+		"cedil",
+		"circ",
+		"cute", // kept for compatibility with earlier behavior
+		"grave",
+		"horn"/*thorn*/,
+		"macr",
+		"ring",
+		"slash",
+		"strok",
+		"tilde",
+		"uml",
+	};
+
+	// map the name of a reference (without '&' and ';') to a character
+	auto unescape=[escs](std::string_view esc)->char {
+		if (esc.empty())
+			return ' ';
+		auto first{static_cast<char>(
+				::tolower(static_cast<unsigned char>(esc.at(0))))};
+		auto rest=esc.substr(1);
+		if (seq_some(escs, [&](auto&& e){return starts_with(rest, e);}))
+			return first;
+		else
+			return ' ';
+	};
+
+	// upper limit for the length of a reference name; longer names
+	// are assumed not to be references.
+	constexpr size_t max_len{32};
+
+	const size_t start_pos{ctx.position()};
+	while (!ctx.done() && ctx.position() - start_pos <= max_len) {
+
+		const auto c{static_cast<unsigned char>(
+				ctx.html()[ctx.position()])};
+		if (c == ';') {
+			auto esc = ctx.substr(start_pos, ctx.position() - start_pos);
+			ctx.raw_scraped() += unescape(esc);
+			ctx.advance();
+			return;
+		}
+
+		// names consist of alphanumerics (or '#' for numeric ones)
+		if (!::isalnum(c) && c != '#')
+			break;
+
+		ctx.advance();
+	}
+
+	// not a reference after all; keep what we saw as plain text.
+	ctx.raw_scraped() += '&';
+	ctx.raw_scraped() += ctx.substr(start_pos, ctx.position() - start_pos);
+}
+
+
+// Scrape a block of text: plain text is copied to the scrape buffer, while
+// character references ('&') and tags ('<') are delegated to
+// html_escape_char() and tag(), respectively.
+static void
+text(Context& ctx)
+{
+	// start of the run of plain text that was not copied yet
+	auto run_start{ctx.position()};
+
+	while (!ctx.done()) {
+
+		const bool at_escape{ctx.looking_at("&")};
+		if (!at_escape && !ctx.looking_at("<")) {
+			ctx.advance();
+			continue;
+		}
+
+		// copy the text seen so far, then handle the special item
+		ctx.raw_scraped() += ctx.substr(run_start,
+						ctx.position() - run_start);
+		ctx.advance();
+		if (at_escape)
+			html_escape_char(ctx);
+		else
+			tag(ctx);
+
+		run_start = ctx.position();
+	}
+
+	ctx.raw_scraped() += ctx.substr(run_start, ctx.size() - run_start);
+}
+
+/**
+ * Crudely convert HTML to plain text: scrape the text from the html and
+ * clean up its whitespace.
+ *
+ * All parsing state lives in a local context, i.e., no global state is
+ * touched.
+ *
+ * @param html html
+ *
+ * @return plain text
+ */
+std::string
+Mu::html_to_text(const std::string& html)
+{
+	Context ctx{html};
+
+	text(ctx);
+
+	return ctx.scraped();
+}
+
+#ifdef BUILD_TESTS
+#include "mu-test-utils.hh"
+
+// comments, text-markup tags, character references and unknown tags
+static void
+test_1()
+{
+	static std::vector<std::pair<std::string, std::string>>
+		cases = {
+			{ "<!-- Hello -->A",  "A"   },
+			{ "A<!-- Test -->B", "A B"  },
+			{ "A<i>a</i><b>p</b>", "Aap"},
+			{ "N&ocute;&Ocirc;t", "Noot"},
+			{
+				"foo<!-- bar --><i>c</i>uu<bla>x</bla>"
+				"<!--hello -->world<!--",
+				"foo cuu x world"
+			}
+		};
+
+	for (const auto& [html, expected]: cases)
+		assert_equal(html_to_text(html), expected);
+}
+
+// a tag with a quoted attribute value
+static void
+test_2()
+{
+	static std::vector<std::pair<std::string, std::string>>
+		cases = {
+			{ R"(<i>hello, <b bar="/b">world!</b>)",
+			  "hello, world!"},
+		};
+
+	for (const auto& [html, expected]: cases)
+		assert_equal(html_to_text(html), expected);
+}
+
+
+// the contents of a <script> element must not end up in the text
+static void
+test_3()
+{
+	static std::vector<std::pair<std::string, std::string>>
+		cases = {
+			{R"(<i>hello, </i><script language="javascript">
+				function foo() {
+					alert("Stroopwafel!"); // test
+				}
+			    </script>world!)",
+			  "hello, world!"},
+		};
+
+	for (const auto& [html, expected]: cases)
+		assert_equal(html_to_text(html), expected);
+}
+
+// test-runner: register the html-to-text tests and run them
+int
+main(int argc, char* argv[])
+{
+	mu_test_init(&argc, &argv);
+
+	const std::pair<const char*, void (*)()> all_tests[] = {
+		{"/html-to-text/test-1", test_1},
+		{"/html-to-text/test-2", test_2},
+		{"/html-to-text/test-3", test_3},
+	};
+
+	for (const auto& [path, func]: all_tests)
+		g_test_add_func(path, func);
+
+	return g_test_run();
+}
+
+
+#endif /*BUILD_TESTS*/
+
+
+#ifdef BUILD_HTML_TO_TEXT
+
+#include "mu-utils-file.hh"
+
+// simple tool that reads html on stdin and outputs text on stdout
+// e.g. curl --silent https://www.example.com | build/lib/utils/mu-html2text
+
+// Read html from stdin and print the text scraped from it on stdout;
+// returns 0 on success, or 1 if reading from stdin failed.
+int
+main (int argc, char *argv[])
+{
+	auto input = read_from_stdin();
+	if (input) {
+		mu_println("{}", html_to_text(*input));
+		return 0;
+	}
+
+	mu_printerrln("error reading from stdin: {}", input.error().what());
+
+	return 1;
+}
+
+#endif /*BUILD_HTML_TO_TEXT*/
--- a/lib/utils/mu-utils.hh
+++ b/lib/utils/mu-utils.hh
@ -265,6 +265,16 @@ std::string date_to_time_t_string(int64_t t);
 */
 std::string time_to_string(const char *frm, time_t t, bool utc = false) G_GNUC_CONST;

+/**
+ * Crudely convert HTML to plain text. This attempts to scrape the
+ * human-readable text from html-email so we can use it for indexing.
+ *
+ * @param html html
+ *
+ * @return plain text
+ */
+std::string html_to_text(const std::string& html);
+
 /**
 * Hack to avoid locale crashes
 *
--- a/mu/tests/test-mu-cmd.cc
+++ b/mu/tests/test-mu-cmd.cc
@ -153,22 +153,10 @@ static void
 test_mu_find_02(void)
 {
 	/* when matching html as if it were text,
-	 * 'bull' is also matched in arto.eml, &bull;
-	 */
-	// search("bull", 1);
-	// search("bull m:foo", 0);
-	// search("bull m:/foo", 1);
-	// search("bull m:/Foo", 1);
-	// search("bull flag:attach", 1);
-	// search("bull flag:a", 1);
-
-	search("bull", 2);
-	search("bull m:foo", 0);
-	search("bull m:/foo", 2);
-	search("bull m:/Foo", 2);
-	search("bull flag:attach", 1);
-	search("bull flag:a", 1);
+	 * 'bull' is also matched in arto.eml, &bull; however,
+	 * we don't do that anymore! */

+	search("bull", 1);

 	search("g:x", 0);
 	search("flag:encrypted", 0);