From 56b8fad89e2f8c07a1e0e25380f36ff42094a9ad Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Mon, 3 Jul 2023 20:29:51 +0300 Subject: [PATCH 1/2] utils: implement html-to-text Implement a crude html-to-text scraper function, to extract plain text from html messages, so we can use it for indexing. --- lib/utils/meson.build | 17 + lib/utils/mu-html-to-text.cc | 597 +++++++++++++++++++++++++++++++++++ lib/utils/mu-utils.hh | 10 + 3 files changed, 624 insertions(+) create mode 100644 lib/utils/mu-html-to-text.cc diff --git a/lib/utils/meson.build b/lib/utils/meson.build index fd5b72ff..c29b6d68 100644 --- a/lib/utils/meson.build +++ b/lib/utils/meson.build @@ -17,6 +17,7 @@ lib_mu_utils=static_library('mu-utils', [ 'mu-command-handler.cc', + 'mu-html-to-text.cc', 'mu-lang-detector.cc', 'mu-logger.cc', 'mu-option.cc', @@ -43,6 +44,15 @@ lib_mu_utils_dep = declare_dependency( include_directories(['.', '..', '../thirdparty']) ) +# +# tools +# +html2text = executable('mu-html2text', + 'mu-html-to-text.cc', + dependencies: [ lib_mu_utils_dep, glib_dep ], + cpp_args: ['-DBUILD_HTML_TO_TEXT'], + install: false) + # # tests # @@ -82,4 +92,11 @@ test('test-lang-detector', cpp_args: ['-DBUILD_TESTS'], dependencies: [ config_h_dep, glib_dep, lib_mu_utils_dep ])) +test('test-html-to-text', + executable('test-html-to-text', 'mu-html-to-text.cc', + install: false, + cpp_args: ['-DBUILD_TESTS'], + dependencies: [glib_dep, lib_mu_utils_dep])) + + subdir('tests') diff --git a/lib/utils/mu-html-to-text.cc b/lib/utils/mu-html-to-text.cc new file mode 100644 index 00000000..de91ea8d --- /dev/null +++ b/lib/utils/mu-html-to-text.cc @@ -0,0 +1,597 @@ +/* +** Copyright (C) 2023 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. 
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software Foundation,
+** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+**
+*/
+
+#include "mu-utils.hh"
+#include "mu-option.hh"
+#include "mu-regex.hh"
+
+// NOTE(review): the names of the standard headers included here were not
+// legible; the list below covers the std facilities used by the code that
+// follows (std::min, ::tolower/::isalpha, std::range_error, std::string,
+// std::string_view) -- confirm against the remainder of the file.
+#include <algorithm>
+#include <cctype>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+
+using namespace Mu;
+
+
+/**
+ * Does @p haystack start with @p needle, ignoring (ASCII) case?
+ *
+ * @param haystack string to inspect
+ * @param needle prefix to look for
+ *
+ * @return true if the first needle.size() characters of haystack equal
+ * needle case-insensitively; false otherwise (including when needle is
+ * longer than haystack)
+ */
+static bool
+starts_with(std::string_view haystack, std::string_view needle)
+{
+	if (needle.size() > haystack.size())
+		return false;
+
+	// the <cctype> functions have undefined behavior for negative values
+	// (other than EOF), so go through unsigned char; html may well
+	// contain bytes >= 0x80.
+	const auto lower = [](char c) {
+		return ::tolower(static_cast<unsigned char>(c));
+	};
+
+	for (size_t i{}; i != needle.size(); ++i)
+		if (lower(haystack[i]) != lower(needle[i]))
+			return false;
+
+	return true;
+}
+
+/**
+ * Are @p haystack and @p needle equal, ignoring (ASCII) case?
+ *
+ * @param haystack some string
+ * @param needle some other string
+ *
+ * @return true if both have the same length and the same characters
+ * (case-insensitively); false otherwise
+ */
+static bool
+matches(std::string_view haystack, std::string_view needle)
+{
+	if (needle.size() != haystack.size())
+		return false;
+	else
+		return starts_with(haystack, needle);
+}
+
+
+
+/**
+ * HTML parsing context
+ *
+ * Combines the html being scraped (held by reference, not copied), the
+ * current read position in it, and the buffer in which the scraped text
+ * is collected.
+ */
+class Context {
+public:
+	/**
+	 * Construct a parsing context
+	 *
+	 * Only a reference to @p html is stored, so the string must stay
+	 * alive (and unmodified) for as long as the Context is used.
+	 *
+	 * @param html some html to parse
+	 */
+	Context(const std::string& html): html_{html} {}
+
+	/**
+	 * Are we done with the html blob, i.e, has it been fully scraped?
+	 *
+	 * @return true or false
+	 */
+	bool done() const {
+		return pos_ >= html_.size();
+	}
+
+	/**
+	 * Get the current position
+	 *
+	 * @return position
+	 */
+	size_t position() const {
+		return pos_;
+	}
+
+	/**
+	 * Get the size of the HTML
+	 *
+	 * @return size
+	 */
+	size_t size() const {
+		return html_.size();
+	}
+
+	/**
+	 * Advance the position by _n_ characters.
+	 *
+	 * Advancing to exactly the end of the html is allowed; after that,
+	 * done() is true.
+	 *
+	 * @param n number by which to advance.
+	 *
+	 * @throws std::range_error when advancing would move past the end
+	 */
+	void advance(size_t n=1) {
+		// pos_ never exceeds html_.size(), so the subtraction cannot
+		// wrap around (unlike pos_ + n for huge n).
+		if (n > html_.size() - pos_)
+			throw std::range_error("out of range");
+		pos_ += n;
+	}
+
+	/**
+	 * Are we looking at the given string?
+	 *
+	 * @param str string to match (case-insensitive)
+	 *
+	 * @return true if the html at the current position starts with str;
+	 * false otherwise, or when we are already done()
+	 */
+	bool looking_at(std::string_view str) const {
+		// compare with the number of remaining characters, so a
+		// match that ends exactly at the end of the html is found too.
+		if (pos_ >= html_.size() || str.size() > html_.size() - pos_)
+			return false;
+		else
+			return matches({html_.data()+pos_, str.size()}, str);
+	}
+
+	/**
+	 * Grab a substring-view from the html
+	 *
+	 * The view refers to the html passed to the constructor; it is
+	 * not a copy.
+	 *
+	 * @param fpos starting position
+	 * @param len length
+	 *
+	 * @return string view
+	 *
+	 * @throws std::range_error when [fpos, fpos + len) is not fully
+	 * inside the html
+	 */
+	std::string_view substr(size_t fpos, size_t len) const {
+		// written such that fpos + len cannot overflow
+		if (fpos > html_.size() || len > html_.size() - fpos)
+			throw std::range_error(mu_format("{} + {} > {}",
+							 fpos, len, html_.size()));
+		else
+			return { html_.data() + fpos, len };
+	}
+
+	/**
+	 * Grab the string of alphabetic characters at the
+	 * head (pos) of the context, and advance over it.
+	 *
+	 * @return the head-word or empty
+	 */
+	std::string_view eat_head_word() {
+		const size_t start_pos{pos_};
+		// cast to unsigned char: ::isalpha has undefined behavior
+		// for negative values (i.e., bytes >= 0x80 with signed char)
+		while (!done() &&
+		       ::isalpha(static_cast<unsigned char>(html_[pos_])))
+			++pos_;
+
+		return {html_.data() + start_pos, pos_ - start_pos};
+	}
+
+
+	/**
+	 * Get the scraped data, after cleanup(). Intended to be used
+	 * once done(); this is not enforced here.
+	 *
+	 * @return scraped data
+	 */
+	std::string scraped() const {
+		return cleanup(raw_scraped_);
+	}
+
+	/**
+	 * Get the raw scrape buffer, where we can append
+	 * scraped data.
+	 *
+	 * @return the buffer
+	 */
+	std::string& raw_scraped() {
+		return raw_scraped_;
+	}
+
+
+	/**
+	 * Get a reference to the HTML
+	 *
+	 * @return html
+	 */
+	const std::string& html() const { return html_; }
+
+private:
+
+	/**
+	 * Cleanup some raw scraped html: remove superfluous
+	 * whitespace, avoid too long lines.
+	 *
+	 * Every run of spaces, tabs and newlines is replaced by a single
+	 * separator; whitespace at the very start and at the end is
+	 * dropped. The separator is a newline once more than 80
+	 * non-whitespace characters were seen since the last inserted
+	 * newline, and a space otherwise.
+	 *
+	 * @param unclean the raw scraped text
+	 *
+	 * @return cleaned up string.
+	 */
+	std::string cleanup(const std::string& unclean) const {
+		// reduce whitespace and avoid too long lines;
+		// makes it easier to debug.
+		bool was_wspace{};
+		size_t col{}; // counts non-whitespace characters only
+		std::string clean;
+		clean.reserve(unclean.size()/2);
+		for (const char c: unclean) {
+			const auto wspace = c == ' ' || c == '\t' || c == '\n';
+			if (wspace) {
+				// remember it, but only emit a separator
+				// once the next word starts.
+				was_wspace = true;
+				continue;
+			}
+			++col;
+			if (was_wspace) {
+				if (col > 80) {
+					clean += '\n';
+					col = 0;
+				} else if (!clean.empty())
+					clean += ' ';
+				was_wspace = false;
+			}
+			clean += c;
+		}
+		return clean;
+	}
+
+
+	const std::string&	html_; // no copy!
+	size_t			pos_{};
+	std::string		raw_scraped_;
+};
+
+
+/**
+ * Get a short, human-readable representation of a Context, of the form
+ * <position:size: 'the next (at most) 8 characters'>
+ *
+ * Marked G_GNUC_UNUSED since it may not be referenced in every build.
+ *
+ * @param ctx a parsing context
+ *
+ * @return the formatted representation
+ */
+G_GNUC_UNUSED static auto
+format_as(const Context& ctx)
+{
+	const auto remaining = ctx.size() - ctx.position();
+
+	return mu_format("<{}:{}: '{}'>",
+			 ctx.position(), ctx.size(),
+			 ctx.substr(ctx.position(),
+				    std::min(static_cast<size_t>(8), remaining)));
+}
+
+
+/**
+ * Advance the context until it is looking at @p quote or until the
+ * end of the html, whichever comes first. The context is left _at_ the
+ * quote, not after it.
+ *
+ * @param ctx a parsing context
+ * @param quote the closing quote to look for
+ */
+static void
+skip_quoted(Context& ctx, std::string_view quote)
+{
+	while(!ctx.done()) {
+		if (ctx.looking_at(quote)) // closing quote
+			return;
+		ctx.advance();
+	}
+}
+
+
+// attempt to skip over