Merge branch 'wip/djcb/html-to-text'

2023-07-26 19:11:41 +03:00 · 2023-07-26 19:11:41 +03:00 · 455119f695
parent 21a760d2c7 b795242d5a
commit 455119f695
10 changed files with 655 additions and 69 deletions
--- a/NEWS.org
+++ b/NEWS.org
@ -19,9 +19,14 @@
    - what used to be the ~mu fields~ command has been merged into ~mu info~; i.e.,
 ~mu fields~ is now ~mu info fields~.
-    - ~mu view~ gained ~--format=html~ for it to output the HTML body of the message
+    - ~mu view~ gained ~--format=html~ which compels it to output the HTML body of
-      (if any) rather than the plain-text body. See its updated manpage for
+      the message rather than the (default) plain-text body. See its updated
-      details.
+      manpage for details.
    - when encountering an HTML message part during indexing, previously (i.e.,
 ~mu 1.10~) we would attempt to process that as-is, with HTML-tags etc.; this
      is now improved by employing a html->text scraper which extracts the
      human-readable text from the html.
    - /experimental/: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux
      distros), ~mu~ will try to detect the language of the body of e-mail
--- a/lib/message/mu-document.cc
+++ b/lib/message/mu-document.cc
@ -79,17 +79,6 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va
 		throw std::logic_error("not a search term");
 }
 /* hack... import html text as if it were plain text. */
 static void
 add_body_html(Xapian::Document& doc, const Field& field, const std::string& val)
 {
 	static Field body_field = field_from_id(Field::Id::BodyText);
 	Xapian::TermGenerator termgen;
 	termgen.set_document(doc);
 	termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term());
 }
 void
 Document::add(Field::Id id, const std::string& val)
 {
@ -100,11 +89,9 @@ Document::add(Field::Id id, const std::string& val)
 	if (field.is_searchable())
 		add_search_term(xdoc_, field, val);
-	else if (id == Field::Id::XBodyHtml)
+
-		add_body_html(xdoc_, field, val);
+	if (field.include_in_sexp())
 	if (field.include_in_sexp()) {
 		put_prop(field, val);
 	}
 }
 void
--- a/lib/message/mu-fields.cc
+++ b/lib/message/mu-fields.cc
@ -139,7 +139,6 @@ static void
 test_prefix()
 {
 	static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S');
 	static_assert(field_from_id(Field::Id::XBodyHtml).xapian_prefix() == 0);
 }
 [[maybe_unused]]
--- a/lib/message/mu-fields.hh
+++ b/lib/message/mu-fields.hh
@ -65,12 +65,8 @@ struct Field {
 		Tags,		/**< Message Tags */
 		ThreadId,	/**< Thread Id */
 		To,		/**< To: recipient */
-		/*
+		//
-		 * <private>
+		_count_         /**< Number of Ids */
 		 */
 		XBodyHtml,	/**< HTML Body */
 		_count_ /**< Number of FieldIds */
 	};
 	/**
@ -458,17 +454,6 @@ static constexpr std::array<Field, Field::id_size()>
 		Field::Flag::IncludeInSexp |
 		Field::Flag::IndexableTerm,
 	    },
 	    /* internal */
 	    {
 		Field::Id::XBodyHtml,
 		Field::Type::String,
 		"htmlbody", {},
 		"Message html body",
 		{},
 		{},
 		Field::Flag::Internal
 	    },
 	}};
 /*
--- a/lib/message/mu-message.cc
+++ b/lib/message/mu-message.cc
@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg)
 }
 static void
-append_text(Option<std::string>& str, Option<std::string> app)
+append_text(Option<std::string>& str, Option<std::string>&& app)
 {
-	if (!str)
+	if (!str && app)
-		str = app;
+		str = std::move(*app);
-	else if (app)
+	else if (str && app)
 		str.value() += app.value();
 }
@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part,
 		return;
 	submsg->for_each([&](auto&& parent, auto&& child_obj) {
-
+		/* NOTE: we only handle one level; ideally, we'd apply the whole
-		/* XXX: we only handle one level */
+		   parsing machinery recursively; so this a little crude. */
 		if (!child_obj.is_part())
 			return;
-
+		if (const auto ctype{child_obj.content_type()}; !ctype)
 		const auto ctype{child_obj.content_type()};
 		if (!ctype || !ctype->is_type("text", "*"))
 			return;
-
+		else if (ctype->is_type("text", "plain"))
-		append_text(info.embedded, MimePart{child_obj}.to_string());
+			append_text(info.embedded, MimePart{child_obj}.to_string());
 		else if (ctype->is_type("text", "html")) {
 			if (auto&& str{MimePart{child_obj}.to_string()}; str)
 				append_text(info.embedded, html_to_text(*str));
 		}
 	});
 }
@ -662,6 +663,8 @@ fill_document(Message::Private& priv)
 			break;
 		case Field::Id::BodyText:
 			doc.add(field.id, priv.body_txt);
 			if (priv.body_html)
 				doc.add(field.id, html_to_text(*priv.body_html));
 			break;
 		case Field::Id::Cc:
 			doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
@ -725,10 +728,6 @@ fill_document(Message::Private& priv)
 		case Field::Id::To:
 			doc.add(field.id, mime_msg.contacts(Contact::Type::To));
 			break;
 			/* internal fields */
 		case Field::Id::XBodyHtml:
 			doc.add(field.id, priv.body_html);
 			break;
 		/* LCOV_EXCL_START */
 		case Field::Id::_count_:
 		default:
--- a/lib/message/mu-mime-object.cc
+++ b/lib/message/mu-mime-object.cc
@ -535,8 +535,7 @@ MimePart::to_string() const noexcept
 	if (bytes < 0)
 		return Nothing;
-	buffer.data()[bytes]='\0';
+	buffer.resize(bytes + 1);
 	buffer.resize(buflen);
 	return buffer;
 }
--- a/lib/utils/meson.build
+++ b/lib/utils/meson.build
@ -17,6 +17,7 @@
 lib_mu_utils=static_library('mu-utils', [
  'mu-command-handler.cc',
  'mu-html-to-text.cc',
  'mu-lang-detector.cc',
  'mu-logger.cc',
  'mu-option.cc',
@ -43,6 +44,15 @@ lib_mu_utils_dep = declare_dependency(
    include_directories(['.', '..', '../thirdparty'])
 )
 #
 # tools
 #
 html2text = executable('mu-html2text',
                       'mu-html-to-text.cc',
  dependencies: [ lib_mu_utils_dep, glib_dep ],
  cpp_args: ['-DBUILD_HTML_TO_TEXT'],
  install: false)
 #
 # tests
 #
@ -82,4 +92,11 @@ test('test-lang-detector',
                cpp_args: ['-DBUILD_TESTS'],
                dependencies: [ config_h_dep, glib_dep, lib_mu_utils_dep ]))
 test('test-html-to-text',
     executable('test-html-to-text', 'mu-html-to-text.cc',
                install: false,
                cpp_args: ['-DBUILD_TESTS'],
                dependencies: [glib_dep, lib_mu_utils_dep]))
 subdir('tests')
--- a/lib/utils/mu-html-to-text.cc
+++ b/lib/utils/mu-html-to-text.cc
@ -0,0 +1,597 @@
 /*
 ** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
 **
 ** This program is free software; you can redistribute it and/or modify it
 ** under the terms of the GNU General Public License as published by the
 ** Free Software Foundation; either version 3, or (at your option) any
 ** later version.
 **
 ** This program is distributed in the hope that it will be useful,
 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ** GNU General Public License for more details.
 **
 ** You should have received a copy of the GNU General Public License
 ** along with this program; if not, write to the Free Software Foundation,
 ** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 **
 */
 #include "mu-utils.hh"
 #include "mu-option.hh"
 #include "mu-regex.hh"
 #include <algorithm>
 #include <array>
 #include <cctype>
 #include <stdexcept>
 #include <string>
 #include <string_view>
 using namespace Mu;
 /**
  * Does haystack start with needle? The comparison ignores case, as
  * determined per byte by tolower().
  *
  * @param haystack string to inspect
  * @param needle prefix to look for; an empty needle always matches
  *
  * @return true if haystack begins with needle, false otherwise
  */
 static bool
 starts_with(std::string_view haystack, std::string_view needle)
 {
 	if (needle.size() > haystack.size())
 		return false;

 	// The <cctype> functions have undefined behavior for negative
 	// arguments (other than EOF), and plain char may be signed; HTML
 	// often contains bytes >= 0x80, so convert through unsigned char.
 	const auto lower = [](char c) {
 		return std::tolower(static_cast<unsigned char>(c));
 	};

 	for (size_t i = 0; i != needle.size(); ++i)
 		if (lower(haystack[i]) != lower(needle[i]))
 			return false;

 	return true;
 }
 /**
  * Are haystack and needle equal, using the same case-insensitive
  * comparison as starts_with()?
  *
  * @param haystack first string
  * @param needle second string
  *
  * @return true if both have the same length and compare equal
  */
 static bool
 matches(std::string_view haystack, std::string_view needle)
 {
 	// equal length + prefix match == full match
 	return haystack.size() == needle.size() && starts_with(haystack, needle);
 }
 /**
 * HTML parsing context
 *
 * Keeps a reference to the HTML being scraped (it is not copied, so the
 * string must outlive the Context), the current scan position, and the
 * buffer in which the scraped text is accumulated.
 */
 class Context {
 public:
 	/**
 	 * Construct a parsing context
 	 *
 	 * @param html some html to parse; only a reference is stored
 	 */
 	explicit Context(const std::string& html): html_{html}, pos_{} {}

 	/**
 	 * Are we done with the html blob, i.e, has it been fully scraped?
 	 *
 	 * @return true or false
 	 */
 	bool done() const {
 		return pos_ >= html_.size();
 	}

 	/**
 	 * Get the current position
 	 *
 	 * @return position
 	 */
 	size_t position() const {
 		return pos_;
 	}

 	/**
 	 * Get the size of the HTML
 	 *
 	 * @return size
 	 */
 	size_t size() const {
 		return html_.size();
 	}

 	/**
 	 * Advance the position by _n_ characters.
 	 *
 	 * @param n number by which to advance.
 	 *
 	 * @throws std::range_error when this would move beyond the end
 	 * of the html (moving exactly to the end is allowed).
 	 */
 	void advance(size_t n=1) {
 		// pos_ never exceeds html_.size(), so the subtraction
 		// cannot wrap around (unlike pos_ + n).
 		if (n > html_.size() - pos_)
 			throw std::range_error("out of range");
 		pos_ += n;
 	}

 	/**
 	 * Are we looking at the given string?
 	 *
 	 * @param str string to match (case-insensitive)
 	 *
 	 * @return true or false; always false when done()
 	 */
 	bool looking_at(std::string_view str) const {
 		// str may end exactly at the end of the html, hence '>'
 		// rather than '>=' for the remaining-length check.
 		if (pos_ >= html_.size() || str.size() > html_.size() - pos_)
 			return false;
 		else
 			return matches({html_.data()+pos_, str.size()}, str);
 	}

 	/**
 	 * Grab a substring-view from the html
 	 *
 	 * @param fpos starting position
 	 * @param len length
 	 *
 	 * @return string view into the html
 	 *
 	 * @throws std::range_error if the range is not within the html
 	 */
 	std::string_view substr(size_t fpos, size_t len) const {
 		// written such that the check itself cannot overflow
 		if (fpos > html_.size() || len > html_.size() - fpos)
 			throw std::range_error(mu_format("{} + {} > {}",
 							 fpos, len, html_.size()));
 		else
 			return { html_.data() + fpos, len };
 	}

 	/**
 	 * Grab the string of alphabetic characters at the
 	 * head (pos) of the context, and advance over it.
 	 *
 	 * @return the head-word or empty
 	 */
 	std::string_view eat_head_word() {
 		const size_t start_pos{pos_};
 		while (!done()) {
 			// <cctype> functions need a value representable
 			// as unsigned char; plain char may be negative.
 			if (!std::isalpha(static_cast<unsigned char>(html_[pos_])))
 				break;
 			++pos_;
 		}
 		return {html_.data() + start_pos, pos_ - start_pos};
 	}

 	/**
 	 * Get the scraped data; only available when done()
 	 * @return scraped data, with whitespace cleaned up
 	 */
 	std::string scraped() {
 		return cleanup(raw_scraped_);
 	}

 	/**
 	 * Get the raw scrape buffer, where we can append
 	 * scraped data.
 	 *
 	 * @return the buffer
 	 */
 	std::string& raw_scraped() {
 		return raw_scraped_;
 	}

 	/**
 	 * Get a reference to the HTML
 	 *
 	 * @return  html
 	 */
 	const std::string& html() const { return html_; }

 private:
 	/**
 	 * Cleanup some raw scraped html: remove superfluous
 	 * whitespace, avoid too long lines.
 	 *
 	 * Each run of spaces, tabs and newlines is collapsed into a single
 	 * separator: a newline once more than 80 non-whitespace characters
 	 * have been seen since the last line break, a space otherwise.
 	 * Leading and trailing whitespace is dropped.
 	 *
 	 * @param unclean the raw text
 	 *
 	 * @return cleaned up string.
 	 */
 	std::string cleanup(const std::string& unclean) const {
 		// reduce whitespace and avoid too long lines;
 		// makes it easier to debug.
 		bool was_wspace{};
 		size_t col{};
 		std::string clean;
 		clean.reserve(unclean.size()/2);

 		for(auto&& c: unclean) {
 			auto wspace = c == ' ' || c == '\t' || c == '\n';
 			if (wspace) {
 				// remember, but emit lazily so that runs
 				// collapse and trailing whitespace vanishes
 				was_wspace = true;
 				continue;
 			}
 			++col;
 			if (was_wspace) {
 				if (col > 80) {
 					clean += '\n';
 					col = 0;
 				} else if (!clean.empty())
 					clean += ' ';
 				was_wspace = false;
 			}
 			clean += c;
 		}
 		return clean;
 	}

 	const std::string&	html_; // no copy!
 	size_t			pos_{};
 	std::string		raw_scraped_;
 };
 // Debug representation of a Context: "<position:size: 'preview'>", where
 // preview is (at most) the next 8 characters at the current position.
 G_GNUC_UNUSED static auto
 format_as(const Context& ctx)
 {
 	const auto pos{ctx.position()};
 	const auto total{ctx.size()};
 	const auto preview_len{std::min(static_cast<size_t>(8), total - pos)};

 	return mu_format("<{}:{}: '{}'>", pos, total, ctx.substr(pos, preview_len));
 }
 // Skip over a quoted string. To be called while positioned on the opening
 // quote; on return, the position is on the closing quote (which the
 // caller is expected to step over), or at the end of the html if the
 // quote is never closed.
 static void
 skip_quoted(Context& ctx, std::string_view quote)
 {
 	// step over the opening quote first; otherwise it would immediately
 	// be mistaken for the closing one and nothing would be skipped.
 	if (ctx.looking_at(quote))
 		ctx.advance(quote.size());

 	while(!ctx.done()) {
 		if (ctx.looking_at(quote)) // closing quote
 			return;
 		ctx.advance();
 	}
 }
 // Attempt to skip over <script> / <style> blocks.
 //
 // To be called with the position just after the tag name of the start
 // tag; on return the position is just after the matching end tag, or at
 // the end of the html if there is none. Nothing is scraped.
 //
 // The end tag is ignored while inside a quoted string or a /* */ comment,
 // so such text mentioning e.g. "</script>" does not end the block early.
 static void
 skip_script_style(Context& ctx, std::string_view tag)
 {
 	bool escaped{};           // previous char was an active backslash
 	bool quoted{}, squoted{}; // inside "..." / inside '...'
 	bool inl_comment{};       // inside /* ... */
 	bool endl_comment{};      // inside // ... <newline>

 	auto end_tag_str = mu_format("</{}>", tag);
 	auto end_tag = std::string_view(end_tag_str.data());

 	while (!ctx.done()) {

 		if (inl_comment) {
 			if (ctx.looking_at("*/")) {
 				inl_comment = false;
 				ctx.advance(2);
 			} else
 				ctx.advance();
 			continue;
 		}

 		if (endl_comment) {
 			// "//" also occurs in things that are not comments
 			// at all, such as unquoted URLs; do not let those
 			// hide the end tag.
 			if (ctx.looking_at(end_tag)) {
 				ctx.advance(end_tag.size());
 				break;
 			}
 			// the comment lasts up to and including the newline
 			endl_comment = !ctx.looking_at("\n");
 			ctx.advance();
 			continue;
 		}

 		if (escaped) {
 			// the character after a backslash is taken
 			// literally, and ends the escape.
 			escaped = false;
 			ctx.advance();
 			continue;
 		}

 		if (ctx.looking_at("\\")) {
 			escaped = true;
 			ctx.advance();
 			continue;
 		}

 		// a double quote only counts outside single quotes...
 		if (ctx.looking_at("\"") && !squoted)  {
 			quoted = !quoted;
 			ctx.advance();
 			continue;
 		}

 		// ... and vice versa
 		if (ctx.looking_at("'") && !quoted) {
 			squoted = !squoted;
 			ctx.advance();
 			continue;
 		}

 		// comments and the end tag are only recognized outside
 		// string literals
 		if (!quoted && !squoted) {
 			if (ctx.looking_at("/*")) {
 				inl_comment = true;
 				ctx.advance(2);
 				continue;
 			}

 			if (ctx.looking_at("//")) {
 				endl_comment = true;
 				ctx.advance(2);
 				continue;
 			}

 			if (ctx.looking_at(end_tag)) {
 				ctx.advance(end_tag.size());
 				break; /* we're done, finally! */
 			}
 		}

 		ctx.advance();
 	}
 }
 // An HTML comment; its contents are ignored completely.
 // To be called with the position immediately after the '<!--'. When the
 // terminating '-->' is found, the position is moved past it and a single
 // space is scraped; otherwise everything up to the end is consumed.
 static void
 comment(Context& ctx)
 {
 	constexpr std::string_view terminator{"-->"};

 	for (; !ctx.done(); ctx.advance()) {
 		if (!ctx.looking_at(terminator))
 			continue;

 		ctx.advance(terminator.size());
 		ctx.raw_scraped() += ' ';
 		return;
 	}
 }
 static bool // do we need a SPC separator for this tag?
 needs_separator(std::string_view tagname)
 {
 	// tags after which no separating space is added
 	constexpr std::array nosep_tags = {
 		"b", "em", "i", "s", "strike", "tt", "u"
 	};

 	const auto same_tag = [&tagname](auto&& candidate) {
 		return matches(tagname, candidate);
 	};

 	if (seq_some(nosep_tags, same_tag))
 		return false;

 	return true;
 }
 static bool // do we need to skip the element completely?
 is_skip_element(std::string_view tagname)
 {
 	// elements without human-readable content
 	constexpr std::array skip_tags = {
 		"script", "style", "head", "meta"
 	};

 	const auto same_tag = [&tagname](auto&& candidate) {
 		return matches(tagname, candidate);
 	};

 	return seq_some(skip_tags, same_tag);
 }
 // Skip the end-tag: consume everything up to and including the next '>'
 // (or up to the end of the html when there is none).
 static void
 end_tag(Context& ctx)
 {
 	while (!ctx.done()) {
 		const bool closing{ctx.looking_at(">")};
 		ctx.advance();
 		if (closing)
 			return;
 	}
 }
 // Skip the whole element.
 //
 // NOTE: currently a placeholder: the body is empty, so calling this
 // consumes no input and scrapes nothing; both parameters are unused.
 static void
 skip_element(Context& ctx, std::string_view tagname)
 {
 	// do something special? (nothing is skipped yet)
 }
 // Handle a tag; to be called at the start of the tag, i.e., pos will be
 // just after the '<'.
 //
 // Comments and end tags are skipped; <script> and <style> elements are
 // skipped together with their contents. For any other tag, only the tag
 // itself is skipped, after which a space is scraped if the tag needs a
 // separator.
 static void
 tag(Context& ctx)
 {
 	// some elements we want to skip completely,
 	// for others just the tags.
 	constexpr std::string_view comment_start {"!--"};

 	if (ctx.looking_at(comment_start)) {
 		ctx.advance(comment_start.size());
 		comment(ctx);
 		return;
 	}

 	if (ctx.looking_at("/")) {
 		ctx.advance();
 		end_tag(ctx);
 		return;
 	}

 	auto tagname = ctx.eat_head_word();
 	// HTML tag names are case-insensitive, so <SCRIPT> and <Style>
 	// need to be recognized as well.
 	if (matches(tagname, "script") || matches(tagname, "style")) {
 		skip_script_style(ctx, tagname);
 		return;
 	}
 	else if (is_skip_element(tagname))
 		skip_element(ctx, tagname);

 	const auto needs_sepa = needs_separator(tagname);
 	// skip the remainder of the tag (attributes), up to the closing
 	// '>'; quoted attribute values are handed to skip_quoted.
 	while (!ctx.done()) {

 		if (ctx.looking_at("\""))
 			skip_quoted(ctx, "\"");
 		if (ctx.looking_at("'"))
 			skip_quoted(ctx, "'");

 		if (ctx.looking_at(">")) {
 			ctx.advance();
 			if (needs_sepa)
 				ctx.raw_scraped() += ' ';
 			return;
 		}
 		ctx.advance();
 	}
 }
 // Handle a character reference ("&...;"); to be called with the position
 // just after the '&'.
 //
 // If a well-formed reference follows, it is consumed and a single
 // character is scraped: the lowercased base letter for the accented
 // characters we know about, a space for anything else. If not, the '&'
 // is scraped as a literal ampersand and nothing is consumed.
 static void
 html_escape_char(Context& ctx)
 {
 	// we only care about a few accented chars, and add them unaccented,
 	// lowercase, since that's what we do for indexing anyway.
 	//
 	// These are the parts of the entity names that follow the base
 	// letter, e.g. "acute" for "&eacute;".
 	constexpr std::array escs = {
 		"acute",
 		"breve",
 		"caron",
 		"circ",
 		"cute", // not a real suffix; kept for backward compatibility
 		"grave",
 		"horn"/*thorn*/,
 		"macr",
 		"slash",
 		"strok",
 		"tilde",
 		"uml",
 	};

 	// the longest named character references are 31 characters
 	constexpr size_t max_entity_len{32};

 	auto unescape=[escs](std::string_view esc)->char {
 		if (esc.empty())
 			return ' ';
 		auto first{static_cast<char>(
 				std::tolower(static_cast<unsigned char>(esc.at(0))))};
 		auto rest=esc.substr(1);

 		if (seq_some(escs, [&](auto&& e){return starts_with(rest, e);}))
 			return first;
 		else
 			return ' ';
 	};

 	// Look ahead for the terminating ';' without consuming anything, so
 	// that a bare '&' (as in "R&D") does not swallow the text up to
 	// some unrelated ';' further on, or even the rest of the message.
 	const auto& html{ctx.html()};
 	const size_t start_pos{ctx.position()};

 	for (size_t len{}; start_pos + len < html.size() &&
 		     len <= max_entity_len; ++len) {

 		const auto c{static_cast<unsigned char>(html[start_pos + len])};
 		if (c == ';') {
 			ctx.raw_scraped() += unescape(ctx.substr(start_pos, len));
 			ctx.advance(len + 1); // the name and the ';'
 			return;
 		}
 		// references consist of alphanumerics, or '#' + digits
 		if (c != '#' && !std::isalnum(c))
 			break;
 	}

 	// not a character reference after all
 	ctx.raw_scraped() += '&';
 }
 // Scrape a block of text: plain text is copied to the scrape buffer as-is,
 // while '&' and '<' hand over control to html_escape_char and tag,
 // respectively. Runs until the end of the html.
 static void
 text(Context& ctx)
 {
 	// start of the plain text that has not been copied yet
 	size_t chunk_start{ctx.position()};

 	while (!ctx.done()) {
 		const bool entity{ctx.looking_at("&")};
 		const bool markup{!entity && ctx.looking_at("<")};

 		if (!entity && !markup) {
 			ctx.advance();
 			continue;
 		}

 		// copy the pending text, then step over the '&' or '<'
 		ctx.raw_scraped() += ctx.substr(chunk_start,
 						ctx.position() - chunk_start);
 		ctx.advance();

 		if (entity)
 			html_escape_char(ctx);
 		else
 			tag(ctx);

 		chunk_start = ctx.position();
 	}

 	// whatever is left after the last '&' or '<'
 	ctx.raw_scraped() += ctx.substr(chunk_start, ctx.size() - chunk_start);
 }
 // Convert html to plain text by scraping the human-readable parts from
 // it; returns the scraped text with whitespace cleaned up.
 //
 // All state lives in the local Context. The file-level pointer to it that
 // used to be set here was never read, and writing it without
 // synchronization would be a data race if this function is ever called
 // from more than one thread.
 std::string
 Mu::html_to_text(const std::string& html)
 {
 	Context ctx{html};

 	text(ctx);

 	return ctx.scraped();
 }
 #ifdef BUILD_TESTS
 #include "mu-test-utils.hh"
 // comments, inline tags and character references
 static void
 test_1()
 {
 	// pairs of { html input, expected text }
 	static std::vector<std::pair<std::string, std::string>>
 		cases = {
 			{ "<!-- Hello -->A",  "A"   },
 			{ "A<!-- Test -->B", "A B"  },
 			{ "A<i>a</i><b>p</b>", "Aap"},
 			{ "N&ocute;&Ocirc;t", "Noot"},
 			{
 				"foo<!-- bar --><i>c</i>uu<bla>x</bla>"
 				"<!--hello -->world<!--",
 				"foo cuu x world"
 			}
 		};

 	for (auto& [html, expected]: cases)
 		assert_equal(html_to_text(html), expected);
 }
 // a tag with a quoted attribute value
 static void
 test_2()
 {
 	// pairs of { html input, expected text }
 	static std::vector<std::pair<std::string, std::string>>
 		cases = {
 			{ R"(<i>hello, <b bar="/b">world!</b>)",
 			  "hello, world!"},
 		};

 	for (auto& [html, expected]: cases)
 		assert_equal(html_to_text(html), expected);
 }
 // a <script> element, which must not end up in the text
 static void
 test_3()
 {
 	// pairs of { html input, expected text }
 	static std::vector<std::pair<std::string, std::string>>
 		cases = {
 			{R"(<i>hello, </i><script language="javascript">
 				function foo() {
 					alert("Stroopwafel!"); // test
 				}
 			    </script>world!)",
 			  "hello, world!"},
 		};

 	for (auto& [html, expected]: cases)
 		assert_equal(html_to_text(html), expected);
 }
 // entry point for the unit tests: register them, then run them all
 int
 main(int argc, char* argv[])
 {
 	mu_test_init(&argc, &argv);

 	struct TestCase {
 		const char*	path;
 		void		(*func)();
 	};
 	const TestCase test_cases[] = {
 		{ "/html-to-text/test-1", test_1 },
 		{ "/html-to-text/test-2", test_2 },
 		{ "/html-to-text/test-3", test_3 },
 	};

 	for (const auto& test_case: test_cases)
 		g_test_add_func(test_case.path, test_case.func);

 	return g_test_run();
 }
 #endif /*BUILD_TESTS*/
 #ifdef BUILD_HTML_TO_TEXT
 #include "mu-utils-file.hh"
 // simple tool that reads html on stdin and outputs text on stdout
 // e.g. curl --silent https://www.example.com | build/lib/utils/mu-html2text
 int
 main (int argc, char *argv[])
 {
 	auto res = read_from_stdin();
 	if (!res) {
 		mu_printerrln("error reading from stdin: {}", res.error().what());
 		return 1;
 	}
 	mu_println("{}", html_to_text(*res));
 	return 0;
 }
 #endif /*BUILD_HTML_TO_TEXT*/
--- a/lib/utils/mu-utils.hh
+++ b/lib/utils/mu-utils.hh
@ -265,6 +265,16 @@ std::string date_to_time_t_string(int64_t t);
 */
 std::string time_to_string(const char *frm, time_t t, bool utc = false) G_GNUC_CONST;
 /**
 * Crudely convert HTML to plain text. This attempts to scrape the
 * human-readable text from html-email so we can use it for indexing.
 *
 * @param html html
 *
 * @return plain text
 */
 std::string html_to_text(const std::string& html);
 /**
 * Hack to avoid locale crashes
 *
--- a/mu/tests/test-mu-cmd.cc
+++ b/mu/tests/test-mu-cmd.cc
@ -153,22 +153,10 @@ static void
 test_mu_find_02(void)
 {
 	/* when matching html as if it were text,
-	 * 'bull' is also matched in arto.eml, &bull;
+	 * 'bull' is also matched in arto.eml, &bull; however,
-	 */
+	 * we don't do that anymore! */
 	// search("bull", 1);
 	// search("bull m:foo", 0);
 	// search("bull m:/foo", 1);
 	// search("bull m:/Foo", 1);
 	// search("bull flag:attach", 1);
 	// search("bull flag:a", 1);
 	search("bull", 2);
 	search("bull m:foo", 0);
 	search("bull m:/foo", 2);
 	search("bull m:/Foo", 2);
 	search("bull flag:attach", 1);
 	search("bull flag:a", 1);
 	search("bull", 1);
 	search("g:x", 0);
 	search("flag:encrypted", 0);