diff --git a/NEWS.org b/NEWS.org index c2651f66..5be15940 100644 --- a/NEWS.org +++ b/NEWS.org @@ -19,9 +19,14 @@ - what used to be the ~mu fields~ command has been merged into ~mu info~; i.e., ~mu fields~ is now ~mu info fields~. - - ~mu view~ gained ~--format=html~ for it to output the HTML body of the message - (if any) rather than the plain-text body. See its updated manpage for - details. + - ~mu view~ gained ~--format=html~ which compels it to output the HTML body of + the message rather than the (default) plain-text body. See its updated + manpage for details. + + - when encountering an HTML message part during indexing, previously (i.e., + ~mu 1.10~) we would attempt to process that as-is, with HTML-tags etc.; this + is now improved by employing a html->text scraper which extracts the + human-readable text from the html. - /experimental/: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux distros), ~mu~ will try to detect the language of the body of e-mail diff --git a/lib/message/mu-document.cc b/lib/message/mu-document.cc index 3108ddf6..4e4d9d24 100644 --- a/lib/message/mu-document.cc +++ b/lib/message/mu-document.cc @@ -79,17 +79,6 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va throw std::logic_error("not a search term"); } -/* hack... import html text as if it were plain text. 
*/ -static void -add_body_html(Xapian::Document& doc, const Field& field, const std::string& val) -{ - static Field body_field = field_from_id(Field::Id::BodyText); - - Xapian::TermGenerator termgen; - termgen.set_document(doc); - termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term()); -} - void Document::add(Field::Id id, const std::string& val) { @@ -100,11 +89,9 @@ Document::add(Field::Id id, const std::string& val) if (field.is_searchable()) add_search_term(xdoc_, field, val); - else if (id == Field::Id::XBodyHtml) - add_body_html(xdoc_, field, val); - if (field.include_in_sexp()) { + + if (field.include_in_sexp()) put_prop(field, val); - } } void diff --git a/lib/message/mu-fields.cc b/lib/message/mu-fields.cc index 8515636e..a8872101 100644 --- a/lib/message/mu-fields.cc +++ b/lib/message/mu-fields.cc @@ -139,7 +139,6 @@ static void test_prefix() { static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S'); - static_assert(field_from_id(Field::Id::XBodyHtml).xapian_prefix() == 0); } [[maybe_unused]] diff --git a/lib/message/mu-fields.hh b/lib/message/mu-fields.hh index 234b4000..5aebb9e2 100644 --- a/lib/message/mu-fields.hh +++ b/lib/message/mu-fields.hh @@ -65,12 +65,8 @@ struct Field { Tags, /**< Message Tags */ ThreadId, /**< Thread Id */ To, /**< To: recipient */ - /* - * - */ - XBodyHtml, /**< HTML Body */ - - _count_ /**< Number of FieldIds */ + // + _count_ /**< Number of Ids */ }; /** @@ -458,17 +454,6 @@ static constexpr std::array Field::Flag::IncludeInSexp | Field::Flag::IndexableTerm, }, - - /* internal */ - { - Field::Id::XBodyHtml, - Field::Type::String, - "htmlbody", {}, - "Message html body", - {}, - {}, - Field::Flag::Internal - }, }}; /* diff --git a/lib/message/mu-message.cc b/lib/message/mu-message.cc index 114ad9c7..c026dfcb 100644 --- a/lib/message/mu-message.cc +++ b/lib/message/mu-message.cc @@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg) } static void -append_text(Option& str, Option app) 
+append_text(Option& str, Option&& app) { - if (!str) - str = app; - else if (app) + if (!str && app) + str = std::move(*app); + else if (str && app) str.value() += app.value(); } @@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part, return; submsg->for_each([&](auto&& parent, auto&& child_obj) { - - /* XXX: we only handle one level */ - + /* NOTE: we only handle one level; ideally, we'd apply the whole + parsing machinery recursively; so this a little crude. */ if (!child_obj.is_part()) return; - - const auto ctype{child_obj.content_type()}; - if (!ctype || !ctype->is_type("text", "*")) + if (const auto ctype{child_obj.content_type()}; !ctype) return; - - append_text(info.embedded, MimePart{child_obj}.to_string()); + else if (ctype->is_type("text", "plain")) + append_text(info.embedded, MimePart{child_obj}.to_string()); + else if (ctype->is_type("text", "html")) { + if (auto&& str{MimePart{child_obj}.to_string()}; str) + append_text(info.embedded, html_to_text(*str)); + } }); } @@ -662,6 +663,8 @@ fill_document(Message::Private& priv) break; case Field::Id::BodyText: doc.add(field.id, priv.body_txt); + if (priv.body_html) + doc.add(field.id, html_to_text(*priv.body_html)); break; case Field::Id::Cc: doc.add(field.id, mime_msg.contacts(Contact::Type::Cc)); @@ -725,10 +728,6 @@ fill_document(Message::Private& priv) case Field::Id::To: doc.add(field.id, mime_msg.contacts(Contact::Type::To)); break; - /* internal fields */ - case Field::Id::XBodyHtml: - doc.add(field.id, priv.body_html); - break; /* LCOV_EXCL_START */ case Field::Id::_count_: default: diff --git a/lib/message/mu-mime-object.cc b/lib/message/mu-mime-object.cc index f6f4bcef..a75da5be 100644 --- a/lib/message/mu-mime-object.cc +++ b/lib/message/mu-mime-object.cc @@ -535,8 +535,7 @@ MimePart::to_string() const noexcept if (bytes < 0) return Nothing; - buffer.data()[bytes]='\0'; - buffer.resize(buflen); + buffer.resize(bytes + 1); return buffer; } diff --git a/lib/utils/meson.build 
b/lib/utils/meson.build index fd5b72ff..c29b6d68 100644 --- a/lib/utils/meson.build +++ b/lib/utils/meson.build @@ -17,6 +17,7 @@ lib_mu_utils=static_library('mu-utils', [ 'mu-command-handler.cc', + 'mu-html-to-text.cc', 'mu-lang-detector.cc', 'mu-logger.cc', 'mu-option.cc', @@ -43,6 +44,15 @@ lib_mu_utils_dep = declare_dependency( include_directories(['.', '..', '../thirdparty']) ) +# +# tools +# +html2text = executable('mu-html2text', + 'mu-html-to-text.cc', + dependencies: [ lib_mu_utils_dep, glib_dep ], + cpp_args: ['-DBUILD_HTML_TO_TEXT'], + install: false) + # # tests # @@ -82,4 +92,11 @@ test('test-lang-detector', cpp_args: ['-DBUILD_TESTS'], dependencies: [ config_h_dep, glib_dep, lib_mu_utils_dep ])) +test('test-html-to-text', + executable('test-html-to-text', 'mu-html-to-text.cc', + install: false, + cpp_args: ['-DBUILD_TESTS'], + dependencies: [glib_dep, lib_mu_utils_dep])) + + subdir('tests') diff --git a/lib/utils/mu-html-to-text.cc b/lib/utils/mu-html-to-text.cc new file mode 100644 index 00000000..de91ea8d --- /dev/null +++ b/lib/utils/mu-html-to-text.cc @@ -0,0 +1,597 @@ +/* +** Copyright (C) 2023 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+**
+*/
+
+#include "mu-utils.hh"
+#include "mu-option.hh"
+#include "mu-regex.hh"
+
+// NOTE(review): the targets of the system includes were lost in this copy of
+// the patch; the list below covers what the code in view uses -- confirm
+// against the upstream file.
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+
+using namespace Mu;
+
+
+/**
+ * Does haystack start with needle, ignoring case?
+ *
+ * @param haystack string to look in
+ * @param needle prefix to look for
+ *
+ * @return true if the first needle.size() characters of haystack equal
+ * needle when compared case-insensitively; false otherwise.
+ */
+static bool
+starts_with(std::string_view haystack, std::string_view needle)
+{
+	if (needle.size() > haystack.size())
+		return false;
+
+	// The <cctype> functions are undefined for negative values (other
+	// than EOF), so convert through unsigned char first; HTML bodies
+	// frequently contain non-ASCII bytes.
+	const auto lower = [](char c) {
+		return ::tolower(static_cast<unsigned char>(c));
+	};
+
+	for (size_t i = 0; i != needle.size(); ++i)
+		if (lower(haystack[i]) != lower(needle[i]))
+			return false;
+
+	return true;
+}
+
+/**
+ * Are haystack and needle equal, ignoring case?
+ *
+ * @param haystack first string
+ * @param needle second string
+ *
+ * @return true if both have the same length and the same characters
+ * when compared case-insensitively; false otherwise.
+ */
+static bool
+matches(std::string_view haystack, std::string_view needle)
+{
+	if (needle.size() != haystack.size())
+		return false;
+	else
+		return starts_with(haystack, needle);
+}
+
+
+
+/**
+ * HTML parsing context: a read position in some HTML, together with the
+ * buffer in which the scraped text is collected.
+ *
+ * The context only stores a reference to the HTML; the string passed to the
+ * constructor must outlive the context and must not change while the
+ * context is in use.
+ */
+class Context {
+public:
+	/**
+	 * Construct a parsing context
+	 *
+	 * @param html some html to parse; referenced, not copied.
+	 */
+	Context(const std::string& html): html_{html}, pos_{} {}
+
+	/**
+	 * Are we done with the html blob, i.e., has it been fully scraped?
+	 *
+	 * @return true or false
+	 */
+	bool done() const {
+		return pos_ >= html_.size();
+	}
+
+	/**
+	 * Get the current position
+	 *
+	 * @return position
+	 */
+	size_t position() const {
+		return pos_;
+	}
+
+	/**
+	 * Get the size of the HTML
+	 *
+	 * @return size
+	 */
+	size_t size() const {
+		return html_.size();
+	}
+
+	/**
+	 * Advance the position by _n_ characters. Moving to the position just
+	 * past the last character is allowed; moving beyond that is not.
+	 *
+	 * @param n number by which to advance.
+	 *
+	 * @throws std::range_error if that would move beyond the end.
+	 */
+	void advance(size_t n=1) {
+		if (pos_ + n > html_.size())
+			throw std::range_error("out of range");
+		pos_ += n;
+	}
+
+	/**
+	 * Are we looking at the given string?
+	 *
+	 * @param str string to match (case-insensitive)
+	 *
+	 * @return true if the html at the current position starts with str;
+	 * false otherwise, including when we are done() or when fewer than
+	 * str.size() characters remain.
+	 */
+	bool looking_at(std::string_view str) const {
+		// '>' rather than '>=': a match that ends exactly at the end
+		// of the html is still a match.
+		if (pos_ >= html_.size() || pos_ + str.size() > html_.size())
+			return false;
+		else
+			return matches({html_.data()+pos_, str.size()}, str);
+	}
+
+	/**
+	 * Grab a substring-view from the html
+	 *
+	 * @param fpos starting position
+	 * @param len length
+	 *
+	 * @return string view into the html
+	 *
+	 * @throws std::range_error if the requested range ends beyond the
+	 * end of the html.
+	 */
+	std::string_view substr(size_t fpos, size_t len) const {
+		if (fpos + len > html_.size())
+			throw std::range_error(mu_format("{} + {} > {}",
+							 fpos, len, html_.size()));
+		else
+			return { html_.data() + fpos, len };
+	}
+
+	/**
+	 * Grab the string of alphabetic characters at the
+	 * head (pos) of the context, and advance over it.
+	 *
+	 * @return the head-word (a view into the html) or empty
+	 */
+	std::string_view eat_head_word() {
+		const size_t start_pos{pos_};
+		while (!done()) {
+			// convert through unsigned char; isalpha is undefined
+			// for negative values (non-ASCII bytes).
+			if (!::isalpha(static_cast<unsigned char>(html_.at(pos_))))
+				break;
+			++pos_;
+		}
+		return {html_.data() + start_pos, pos_ - start_pos};
+	}
+
+
+	/**
+	 * Get the scraped data, cleaned up; this is meant to be used once the
+	 * context is done().
+	 *
+	 * @return scraped data
+	 */
+	std::string scraped() {
+		return cleanup(raw_scraped_);
+	}
+
+	/**
+	 * Get the raw scrape buffer, where we can append
+	 * scraped data.
+	 *
+	 * @return the buffer
+	 */
+	std::string& raw_scraped() {
+		return raw_scraped_;
+	}
+
+
+	/**
+	 * Get a reference to the HTML
+	 *
+	 * @return html
+	 */
+	const std::string& html() const { return html_; }
+
+private:
+
+	/**
+	 * Cleanup some raw scraped html: remove superfluous
+	 * whitespace, avoid too long lines.
+	 *
+	 * Every run of spaces, tabs and newlines between two other characters
+	 * is replaced by a single separator: a newline once more than 80
+	 * non-whitespace characters were seen since the last inserted
+	 * newline, a space otherwise. Leading and trailing whitespace is
+	 * dropped.
+	 *
+	 * @param unclean the raw text
+	 *
+	 * @return cleaned up string.
+	 */
+	std::string cleanup(const std::string& unclean) const {
+		// reduce whitespace and avoid too long lines;
+		// makes it easier to debug.
+		bool was_wspace{};
+		size_t col{};
+		std::string clean;
+		clean.reserve(unclean.size()/2);
+		for (const char c: unclean) {
+			const auto wspace = c == ' ' || c == '\t' || c == '\n';
+			if (wspace) {
+				// remember it; the separator is only emitted
+				// when another character follows.
+				was_wspace = true;
+				continue;
+			}
+			++col;
+			if (was_wspace) {
+				if (col > 80) {
+					clean += '\n';
+					col = 0;
+				} else if (!clean.empty())
+					clean += ' ';
+				was_wspace = false;
+			}
+			clean += c;
+		}
+		return clean;
+	}
+
+
+	const std::string&	html_; // no copy!
+	size_t			pos_{};
+	std::string		raw_scraped_;
+};
+
+
+/**
+ * Render a short description of a context: its position, the size of its
+ * html and up to 8 characters of html starting at the current position.
+ *
+ * @param ctx a context
+ *
+ * @return the description
+ */
+G_GNUC_UNUSED static auto
+format_as(const Context& ctx)
+{
+	return mu_format("<{}:{}: '{}'>",
+			 ctx.position(), ctx.size(),
+			 ctx.substr(ctx.position(),
+				    std::min(static_cast<size_t>(8),
+					     ctx.size() - ctx.position())));
+}
+
+
+/**
+ * Skip over quoted text: advance the context until it is looking at the
+ * closing quote, or until it is done. The closing quote itself is not
+ * consumed.
+ *
+ * @param ctx a context, positioned after the opening quote
+ * @param quote the closing quote to look for
+ */
+static void
+skip_quoted(Context& ctx, std::string_view quote)
+{
+	while(!ctx.done()) {
+		if (ctx.looking_at(quote)) // closing quote
+			return;
+		ctx.advance();
+	}
+}
+
+
+// attempt to skip over