message: use html-to-text scraper for html parts

We used to pass HTML parts to the Xapian indexer as-is, markup included; however, it's better to strip the HTML decoration first and index only the text. We use the new built-in html->text scraper for that.
2023-07-23 14:46:11 +03:00 · 2023-07-23 14:46:11 +03:00 · b795242d5a
parent 56b8fad89e
commit b795242d5a
7 changed files with 31 additions and 69 deletions
--- a/NEWS.org
+++ b/NEWS.org
@ -19,9 +19,14 @@
    - what used to be the ~mu fields~ command has been merged into ~mu info~; i.e.,
 ~mu fields~ is now ~mu info fields~.

-    - ~mu view~ gained ~--format=html~ for it to output the HTML body of the message
-      rather than the (default) plain-text body. See its updated manpage for
-      details.
+    - ~mu view~ gained ~--format=html~ which compels it to output the HTML body of
+      the message rather than the (default) plain-text body. See its updated
+      manpage for details.
+
+    - when encountering an HTML message part during indexing, previously (i.e.,
+ ~mu 1.10~) we would attempt to process that as-is, with HTML-tags etc.; this
+      is now improved by employing an html->text scraper, which extracts the
+      human-readable text from the HTML.

    - experimental: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux
      distros), ~mu~ will try to detect the language of the body of e-mail
--- a/lib/message/mu-document.cc
+++ b/lib/message/mu-document.cc
@ -79,17 +79,6 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va
 		throw std::logic_error("not a search term");
 }

-/* hack... import html text as if it were plain text. */
-static void
-add_body_html(Xapian::Document& doc, const Field& field, const std::string& val)
-{
-	static Field body_field = field_from_id(Field::Id::BodyText);
-
-	Xapian::TermGenerator termgen;
-	termgen.set_document(doc);
-	termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term());
-}
-
 void
 Document::add(Field::Id id, const std::string& val)
 {
@ -100,11 +89,9 @@ Document::add(Field::Id id, const std::string& val)

 	if (field.is_searchable())
 		add_search_term(xdoc_, field, val);
-	else if (id == Field::Id::XBodyHtml)
-		add_body_html(xdoc_, field, val);
-	if (field.include_in_sexp()) {
+
+	if (field.include_in_sexp())
 		put_prop(field, val);
-	}
 }

 void
--- a/lib/message/mu-fields.cc
+++ b/lib/message/mu-fields.cc
@ -139,7 +139,6 @@ static void
 test_prefix()
 {
 	static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S');
-	static_assert(field_from_id(Field::Id::XBodyHtml).xapian_prefix() == 0);
 }

 [[maybe_unused]]
--- a/lib/message/mu-fields.hh
+++ b/lib/message/mu-fields.hh
@ -65,12 +65,8 @@ struct Field {
 		Tags,		/**< Message Tags */
 		ThreadId,	/**< Thread Id */
 		To,		/**< To: recipient */
-		/*
-		 * <private>
-		 */
-		XBodyHtml,	/**< HTML Body */
-
-		_count_ /**< Number of FieldIds */
+		//
+		_count_         /**< Number of Ids */
 	};

 	/**
@ -458,17 +454,6 @@ static constexpr std::array<Field, Field::id_size()>
 		Field::Flag::IncludeInSexp |
 		Field::Flag::IndexableTerm,
 	    },
-
-	    /* internal */
-	    {
-		Field::Id::XBodyHtml,
-		Field::Type::String,
-		"htmlbody", {},
-		"Message html body",
-		{},
-		{},
-		Field::Flag::Internal
-	    },
 	}};

 /*
--- a/lib/message/mu-message.cc
+++ b/lib/message/mu-message.cc
@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg)
 }

 static void
-append_text(Option<std::string>& str, Option<std::string> app)
+append_text(Option<std::string>& str, Option<std::string>&& app)
 {
-	if (!str)
-		str = app;
-	else if (app)
+	if (!str && app)
+		str = std::move(*app);
+	else if (str && app)
 		str.value() += app.value();
 }

@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part,
 		return;

 	submsg->for_each([&](auto&& parent, auto&& child_obj) {
-
-		/* XXX: we only handle one level */
-
+		/* NOTE: we only handle one level; ideally, we'd apply the whole
+		   parsing machinery recursively; so this is a little crude. */
 		if (!child_obj.is_part())
 			return;
-
-		const auto ctype{child_obj.content_type()};
-		if (!ctype || !ctype->is_type("text", "*"))
+		if (const auto ctype{child_obj.content_type()}; !ctype)
 			return;
-
-		append_text(info.embedded, MimePart{child_obj}.to_string());
+		else if (ctype->is_type("text", "plain"))
+			append_text(info.embedded, MimePart{child_obj}.to_string());
+		else if (ctype->is_type("text", "html")) {
+			if (auto&& str{MimePart{child_obj}.to_string()}; str)
+				append_text(info.embedded, html_to_text(*str));
+		}
 	});
 }

@ -662,6 +663,8 @@ fill_document(Message::Private& priv)
 			break;
 		case Field::Id::BodyText:
 			doc.add(field.id, priv.body_txt);
+			if (priv.body_html)
+				doc.add(field.id, html_to_text(*priv.body_html));
 			break;
 		case Field::Id::Cc:
 			doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
@ -725,10 +728,6 @@ fill_document(Message::Private& priv)
 		case Field::Id::To:
 			doc.add(field.id, mime_msg.contacts(Contact::Type::To));
 			break;
-			/* internal fields */
-		case Field::Id::XBodyHtml:
-			doc.add(field.id, priv.body_html);
-			break;
 		/* LCOV_EXCL_START */
 		case Field::Id::_count_:
 		default:
--- a/lib/message/mu-mime-object.cc
+++ b/lib/message/mu-mime-object.cc
@ -535,8 +535,7 @@ MimePart::to_string() const noexcept
 	if (bytes < 0)
 		return Nothing;

-	buffer.data()[bytes]='\0';
-	buffer.resize(buflen);
+	buffer.resize(bytes + 1);

 	return buffer;
 }
--- a/mu/tests/test-mu-cmd.cc
+++ b/mu/tests/test-mu-cmd.cc
@ -153,22 +153,10 @@ static void
 test_mu_find_02(void)
 {
 	/* when matching html as if it were text,
-	 * 'bull' is also matched in arto.eml, &bull;
-	 */
-	// search("bull", 1);
-	// search("bull m:foo", 0);
-	// search("bull m:/foo", 1);
-	// search("bull m:/Foo", 1);
-	// search("bull flag:attach", 1);
-	// search("bull flag:a", 1);
-
-	search("bull", 2);
-	search("bull m:foo", 0);
-	search("bull m:/foo", 2);
-	search("bull m:/Foo", 2);
-	search("bull flag:attach", 1);
-	search("bull flag:a", 1);
+	 * 'bull' would also match arto.eml (through its &bull; entity);
+	 * however, html is now converted to text first, so it no longer does. */

+	search("bull", 1);

 	search("g:x", 0);
 	search("flag:encrypted", 0);