mirror of https://github.com/djcb/mu.git
Merge branch 'wip/djcb/html-to-text'
This commit is contained in:
commit
455119f695
11
NEWS.org
11
NEWS.org
|
@ -19,9 +19,14 @@
|
|||
- what used to be the ~mu fields~ command has been merged into ~mu info~; i.e.,
|
||||
~mu fields~ is now ~mu info fields~.
|
||||
|
||||
- ~mu view~ gained ~--format=html~ for it to output the HTML body of the message
|
||||
(if any) rather than the plain-text body. See its updated manpage for
|
||||
details.
|
||||
- ~mu view~ gained ~--format=html~ which compels it to output the HTML body of
|
||||
the message rather than the (default) plain-text body. See its updated
|
||||
manpage for details.
|
||||
|
||||
- when encountering an HTML message part during indexing, previously (i.e.,
|
||||
~mu 1.10~) we would attempt to process that as-is, with HTML-tags etc.; this
|
||||
is now improved by employing an html->text scraper which extracts the
|
||||
human-readable text from the html.
|
||||
|
||||
- /experimental/: if you build ~mu~ with [[https://github.com/CLD2Owners/cld2][CLD2]] support (available in many Linux
|
||||
distros), ~mu~ will try to detect the language of the body of e-mail
|
||||
|
|
|
@ -79,17 +79,6 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va
|
|||
throw std::logic_error("not a search term");
|
||||
}
|
||||
|
||||
/* hack... import html text as if it were plain text. */
|
||||
static void
|
||||
add_body_html(Xapian::Document& doc, const Field& field, const std::string& val)
|
||||
{
|
||||
static Field body_field = field_from_id(Field::Id::BodyText);
|
||||
|
||||
Xapian::TermGenerator termgen;
|
||||
termgen.set_document(doc);
|
||||
termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term());
|
||||
}
|
||||
|
||||
void
|
||||
Document::add(Field::Id id, const std::string& val)
|
||||
{
|
||||
|
@ -100,11 +89,9 @@ Document::add(Field::Id id, const std::string& val)
|
|||
|
||||
if (field.is_searchable())
|
||||
add_search_term(xdoc_, field, val);
|
||||
else if (id == Field::Id::XBodyHtml)
|
||||
add_body_html(xdoc_, field, val);
|
||||
if (field.include_in_sexp()) {
|
||||
|
||||
if (field.include_in_sexp())
|
||||
put_prop(field, val);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -139,7 +139,6 @@ static void
|
|||
test_prefix()
|
||||
{
|
||||
static_assert(field_from_id(Field::Id::Subject).xapian_prefix() == 'S');
|
||||
static_assert(field_from_id(Field::Id::XBodyHtml).xapian_prefix() == 0);
|
||||
}
|
||||
|
||||
[[maybe_unused]]
|
||||
|
|
|
@ -65,12 +65,8 @@ struct Field {
|
|||
Tags, /**< Message Tags */
|
||||
ThreadId, /**< Thread Id */
|
||||
To, /**< To: recipient */
|
||||
/*
|
||||
* <private>
|
||||
*/
|
||||
XBodyHtml, /**< HTML Body */
|
||||
|
||||
_count_ /**< Number of FieldIds */
|
||||
//
|
||||
_count_ /**< Number of Ids */
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -458,17 +454,6 @@ static constexpr std::array<Field, Field::id_size()>
|
|||
Field::Flag::IncludeInSexp |
|
||||
Field::Flag::IndexableTerm,
|
||||
},
|
||||
|
||||
/* internal */
|
||||
{
|
||||
Field::Id::XBodyHtml,
|
||||
Field::Type::String,
|
||||
"htmlbody", {},
|
||||
"Message html body",
|
||||
{},
|
||||
{},
|
||||
Field::Flag::Internal
|
||||
},
|
||||
}};
|
||||
|
||||
/*
|
||||
|
|
|
@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg)
|
|||
}
|
||||
|
||||
static void
|
||||
append_text(Option<std::string>& str, Option<std::string> app)
|
||||
append_text(Option<std::string>& str, Option<std::string>&& app)
|
||||
{
|
||||
if (!str)
|
||||
str = app;
|
||||
else if (app)
|
||||
if (!str && app)
|
||||
str = std::move(*app);
|
||||
else if (str && app)
|
||||
str.value() += app.value();
|
||||
}
|
||||
|
||||
|
@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part,
|
|||
return;
|
||||
|
||||
submsg->for_each([&](auto&& parent, auto&& child_obj) {
|
||||
|
||||
/* XXX: we only handle one level */
|
||||
|
||||
/* NOTE: we only handle one level; ideally, we'd apply the whole
|
||||
parsing machinery recursively; so this is a little crude. */
|
||||
if (!child_obj.is_part())
|
||||
return;
|
||||
|
||||
const auto ctype{child_obj.content_type()};
|
||||
if (!ctype || !ctype->is_type("text", "*"))
|
||||
if (const auto ctype{child_obj.content_type()}; !ctype)
|
||||
return;
|
||||
|
||||
append_text(info.embedded, MimePart{child_obj}.to_string());
|
||||
else if (ctype->is_type("text", "plain"))
|
||||
append_text(info.embedded, MimePart{child_obj}.to_string());
|
||||
else if (ctype->is_type("text", "html")) {
|
||||
if (auto&& str{MimePart{child_obj}.to_string()}; str)
|
||||
append_text(info.embedded, html_to_text(*str));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -662,6 +663,8 @@ fill_document(Message::Private& priv)
|
|||
break;
|
||||
case Field::Id::BodyText:
|
||||
doc.add(field.id, priv.body_txt);
|
||||
if (priv.body_html)
|
||||
doc.add(field.id, html_to_text(*priv.body_html));
|
||||
break;
|
||||
case Field::Id::Cc:
|
||||
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
|
||||
|
@ -725,10 +728,6 @@ fill_document(Message::Private& priv)
|
|||
case Field::Id::To:
|
||||
doc.add(field.id, mime_msg.contacts(Contact::Type::To));
|
||||
break;
|
||||
/* internal fields */
|
||||
case Field::Id::XBodyHtml:
|
||||
doc.add(field.id, priv.body_html);
|
||||
break;
|
||||
/* LCOV_EXCL_START */
|
||||
case Field::Id::_count_:
|
||||
default:
|
||||
|
|
|
@ -535,8 +535,7 @@ MimePart::to_string() const noexcept
|
|||
if (bytes < 0)
|
||||
return Nothing;
|
||||
|
||||
buffer.data()[bytes]='\0';
|
||||
buffer.resize(buflen);
|
||||
buffer.resize(bytes + 1);
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
|
||||
lib_mu_utils=static_library('mu-utils', [
|
||||
'mu-command-handler.cc',
|
||||
'mu-html-to-text.cc',
|
||||
'mu-lang-detector.cc',
|
||||
'mu-logger.cc',
|
||||
'mu-option.cc',
|
||||
|
@ -43,6 +44,15 @@ lib_mu_utils_dep = declare_dependency(
|
|||
include_directories(['.', '..', '../thirdparty'])
|
||||
)
|
||||
|
||||
#
|
||||
# tools
|
||||
#
|
||||
html2text = executable('mu-html2text',
|
||||
'mu-html-to-text.cc',
|
||||
dependencies: [ lib_mu_utils_dep, glib_dep ],
|
||||
cpp_args: ['-DBUILD_HTML_TO_TEXT'],
|
||||
install: false)
|
||||
|
||||
#
|
||||
# tests
|
||||
#
|
||||
|
@ -82,4 +92,11 @@ test('test-lang-detector',
|
|||
cpp_args: ['-DBUILD_TESTS'],
|
||||
dependencies: [ config_h_dep, glib_dep, lib_mu_utils_dep ]))
|
||||
|
||||
test('test-html-to-text',
|
||||
executable('test-html-to-text', 'mu-html-to-text.cc',
|
||||
install: false,
|
||||
cpp_args: ['-DBUILD_TESTS'],
|
||||
dependencies: [glib_dep, lib_mu_utils_dep]))
|
||||
|
||||
|
||||
subdir('tests')
|
||||
|
|
|
@ -0,0 +1,597 @@
|
|||
/*
|
||||
** Copyright (C) 2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify it
|
||||
** under the terms of the GNU General Public License as published by the
|
||||
** Free Software Foundation; either version 3, or (at your option) any
|
||||
** later version.
|
||||
**
|
||||
** This program is distributed in the hope that it will be useful,
|
||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
** GNU General Public License for more details.
|
||||
**
|
||||
** You should have received a copy of the GNU General Public License
|
||||
** along with this program; if not, write to the Free Software Foundation,
|
||||
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
**
|
||||
*/
|
||||
|
||||
#include "mu-utils.hh"
#include "mu-option.hh"
#include "mu-regex.hh"

#include <algorithm>
#include <array>
#include <cctype>
#include <stdexcept>
#include <string>
#include <string_view>
|
||||
|
||||
using namespace Mu;
|
||||
|
||||
|
||||
/**
 * Does @p haystack begin with @p needle, ignoring ASCII case?
 *
 * @param haystack string to inspect
 * @param needle prefix to look for; an empty needle always matches
 *
 * @return true if the first needle.size() characters of haystack equal
 * needle when both are lower-cased with tolower(); false otherwise
 */
static bool
starts_with(std::string_view haystack, std::string_view needle)
{
	if (needle.size() > haystack.size())
		return false;

	// the <ctype.h> functions require a value representable as unsigned
	// char; plain char may be negative for non-ASCII (e.g. UTF-8) bytes.
	const auto lower = [](char c) {
		return ::tolower(static_cast<unsigned char>(c));
	};

	for (size_t i = 0; i != needle.size(); ++i)
		if (lower(haystack[i]) != lower(needle[i]))
			return false;

	return true;
}
|
||||
|
||||
static bool
|
||||
matches(std::string_view haystack, std::string_view needle)
|
||||
{
|
||||
if (needle.size() != haystack.size())
|
||||
return false;
|
||||
else
|
||||
return starts_with(haystack, needle);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* HTML parsing context
|
||||
*
|
||||
*/
|
||||
class Context {
|
||||
public:
|
||||
/**
|
||||
* Construct a parsing context
|
||||
*
|
||||
* @param html some html to parse
|
||||
*/
|
||||
Context(const std::string& html): html_{html}, pos_{} {}
|
||||
|
||||
/**
|
||||
* Are we done with the html blob, i.e, has it been fully scraped?
|
||||
*
|
||||
* @return true or false
|
||||
*/
|
||||
bool done() const {
|
||||
return pos_ >= html_.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the current position
|
||||
*
|
||||
* @return position
|
||||
*/
|
||||
size_t position() const {
|
||||
return pos_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the size of the HTML
|
||||
*
|
||||
* @return size
|
||||
*/
|
||||
size_t size() const {
|
||||
return html_.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance the position by _n_ characters.
|
||||
*
|
||||
* @param n number by which to advance.
|
||||
*/
|
||||
void advance(size_t n=1) {
|
||||
if (pos_ + n > html_.size())
|
||||
throw std::range_error("out of range");
|
||||
pos_ += n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are we looking at the given string?
|
||||
*
|
||||
* @param str string to match (case-insensitive)
|
||||
*
|
||||
* @return true or false
|
||||
*/
|
||||
bool looking_at(std::string_view str) const {
|
||||
if (pos_ >= html_.size() || pos_ + str.size() >= html_.size())
|
||||
return false;
|
||||
else
|
||||
return matches({html_.data()+pos_, str.size()}, str);
|
||||
}
|
||||
|
||||
/**
|
||||
* Grab a substring-view from the html
|
||||
*
|
||||
* @param fpos starting position
|
||||
* @param len length
|
||||
*
|
||||
* @return string view
|
||||
*/
|
||||
std::string_view substr(size_t fpos, size_t len) const {
|
||||
if (fpos + len > html_.size())
|
||||
throw std::range_error(mu_format("{} + {} > {}",
|
||||
fpos, len, html_.size()));
|
||||
else
|
||||
return { html_.data() + fpos, len };
|
||||
}
|
||||
|
||||
/**
|
||||
* Grab the string of alphabetic characters at the
|
||||
* head (pos) of the context, and advance over it.
|
||||
*
|
||||
* @return the head-word or empty
|
||||
*/
|
||||
std::string_view eat_head_word() {
|
||||
size_t start_pos{pos_};
|
||||
while (!done()) {
|
||||
if (!::isalpha(html_.at(pos_)))
|
||||
break;
|
||||
++pos_;
|
||||
}
|
||||
return {html_.data() + start_pos, pos_ - start_pos};
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the scraped data; only available when done()
|
||||
|
||||
* @return scraped data
|
||||
*/
|
||||
std::string scraped() {
|
||||
return cleanup(raw_scraped_);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the raw scrape buffer, where we can append
|
||||
* scraped data.
|
||||
*
|
||||
* @return the buffer
|
||||
*/
|
||||
std::string& raw_scraped() {
|
||||
return raw_scraped_;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get a reference to the HTML
|
||||
*
|
||||
* @return html
|
||||
*/
|
||||
const std::string& html() const { return html_; }
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* Cleanup some raw scraped html: remove superfluous
|
||||
* whitespace, avoid too long lines.
|
||||
*
|
||||
* @param unclean
|
||||
*
|
||||
* @return cleaned up string.
|
||||
*/
|
||||
std::string cleanup(const std::string unclean) const {
|
||||
// reduce whitespace and avoid too long lines;
|
||||
// makes it easier to debug.
|
||||
bool was_wspace{};
|
||||
size_t col{};
|
||||
std::string clean;
|
||||
clean.reserve(unclean.size()/2);
|
||||
for(auto&& c: unclean) {
|
||||
auto wspace = c == ' ' || c == '\t' || c == '\n';
|
||||
if (wspace) {
|
||||
was_wspace = true;
|
||||
continue;
|
||||
}
|
||||
++col;
|
||||
if (was_wspace) {
|
||||
if (col > 80) {
|
||||
clean += '\n';
|
||||
col = 0;
|
||||
} else if (!clean.empty())
|
||||
clean += ' ';
|
||||
was_wspace = false;
|
||||
}
|
||||
clean += c;
|
||||
}
|
||||
return clean;
|
||||
}
|
||||
|
||||
|
||||
const std::string& html_; // no copy!
|
||||
size_t pos_{};
|
||||
std::string raw_scraped_;
|
||||
};
|
||||
|
||||
|
||||
/**
 * Debug representation of a parsing context: its position, its total
 * size, and up to 8 characters of the html at the current position.
 *
 * @param ctx a context
 *
 * @return formatted string
 */
G_GNUC_UNUSED static auto
format_as(const Context& ctx)
{
	constexpr size_t preview_len{8};

	const auto pos{ctx.position()};
	const auto total{ctx.size()};
	// never ask for more than what is left
	const auto preview{ctx.substr(pos, std::min(preview_len, total - pos))};

	return mu_format("<{}:{}: '{}'>", pos, total, preview);
}
|
||||
|
||||
|
||||
static void
|
||||
skip_quoted(Context& ctx, std::string_view quote)
|
||||
{
|
||||
while(!ctx.done()) {
|
||||
if (ctx.looking_at(quote)) // closing quote
|
||||
return;
|
||||
ctx.advance();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// attempt to skip over <script> / <style> blocks
|
||||
static void
|
||||
skip_script_style(Context& ctx, std::string_view tag)
|
||||
{
|
||||
// <script> or <style> must be ignored
|
||||
|
||||
bool escaped{};
|
||||
bool quoted{}, squoted{};
|
||||
bool inl_comment{};
|
||||
bool endl_comment{};
|
||||
|
||||
auto end_tag_str = mu_format("</{}>", tag);
|
||||
auto end_tag = std::string_view(end_tag_str.data());
|
||||
|
||||
while (!ctx.done()) {
|
||||
|
||||
if (inl_comment) {
|
||||
if (ctx.looking_at("*/")) {
|
||||
inl_comment = false;
|
||||
ctx.advance(2);
|
||||
} else
|
||||
ctx.advance();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (endl_comment) {
|
||||
endl_comment = ctx.looking_at("\n");
|
||||
ctx.advance();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ctx.looking_at("\\")) {
|
||||
escaped = !escaped;
|
||||
ctx.advance();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ctx.looking_at("\"") && !escaped && squoted) {
|
||||
quoted = !quoted;
|
||||
ctx.advance();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ctx.looking_at("'") && !escaped && !quoted) {
|
||||
squoted = !squoted;
|
||||
ctx.advance();
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
if (ctx.looking_at("/*")) {
|
||||
inl_comment = true;
|
||||
ctx.advance(2);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ctx.looking_at("//")) {
|
||||
endl_comment = true;
|
||||
ctx.advance(2);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!quoted && !squoted && ctx.looking_at(end_tag)) {
|
||||
ctx.advance(end_tag.size());
|
||||
break; /* we're done, finally! */
|
||||
}
|
||||
|
||||
ctx.advance();
|
||||
}
|
||||
}
|
||||
|
||||
// comment block; ignore completely
|
||||
// pos will be immediately after the '<!--
|
||||
static void
|
||||
comment(Context& ctx)
|
||||
{
|
||||
constexpr std::string_view comment_endtag{"-->"};
|
||||
while (!ctx.done()) {
|
||||
|
||||
if (ctx.looking_at(comment_endtag)) {
|
||||
ctx.advance(comment_endtag.size());
|
||||
ctx.raw_scraped() += ' ';
|
||||
return;
|
||||
}
|
||||
ctx.advance();
|
||||
}
|
||||
}
|
||||
|
||||
static bool // do we need a SPC separator for this tag?
|
||||
needs_separator(std::string_view tagname)
|
||||
{
|
||||
constexpr std::array nosep_tags = {
|
||||
"b", "em", "i", "s", "strike", "tt", "u"
|
||||
};
|
||||
return !seq_some(nosep_tags, [&](auto&& t){return matches(tagname, t);});
|
||||
}
|
||||
|
||||
static bool // do we need to skip the element completely?
|
||||
is_skip_element(std::string_view tagname)
|
||||
{
|
||||
constexpr std::array skip_tags = {
|
||||
"script", "style", "head", "meta"
|
||||
};
|
||||
return seq_some(skip_tags, [&](auto&& t){return matches(tagname, t);});
|
||||
}
|
||||
|
||||
// skip the end-tag
|
||||
static void
|
||||
end_tag(Context& ctx)
|
||||
{
|
||||
while (!ctx.done()) {
|
||||
if (ctx.looking_at(">")) {
|
||||
ctx.advance();
|
||||
return;
|
||||
}
|
||||
ctx.advance();
|
||||
}
|
||||
}
|
||||
|
||||
// skip the whole element
|
||||
static void
|
||||
skip_element(Context& ctx, std::string_view tagname)
|
||||
{
|
||||
// do something special?
|
||||
}
|
||||
|
||||
|
||||
// the start of a tag, i.e., pos will be just after the '<'
|
||||
static void
|
||||
tag(Context& ctx)
|
||||
{
|
||||
// some elements we want to skip completely,
|
||||
// for others just the tags.
|
||||
constexpr std::string_view comment_start {"!--"};
|
||||
if (ctx.looking_at(comment_start)) {
|
||||
ctx.advance(comment_start.size());
|
||||
comment(ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ctx.looking_at("/")) {
|
||||
ctx.advance();
|
||||
end_tag(ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
auto tagname = ctx.eat_head_word();
|
||||
if (tagname == "script" ||tagname == "style") {
|
||||
skip_script_style(ctx, tagname);
|
||||
return;
|
||||
}
|
||||
else if (is_skip_element(tagname))
|
||||
skip_element(ctx, tagname);
|
||||
|
||||
const auto needs_sepa = needs_separator(tagname);
|
||||
while (!ctx.done()) {
|
||||
|
||||
if (ctx.looking_at("\""))
|
||||
skip_quoted(ctx, "\"");
|
||||
|
||||
if (ctx.looking_at("'"))
|
||||
skip_quoted(ctx, "'");
|
||||
|
||||
if (ctx.looking_at(">")) {
|
||||
ctx.advance();
|
||||
if (needs_sepa)
|
||||
ctx.raw_scraped() += ' ';
|
||||
return;
|
||||
}
|
||||
ctx.advance();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
html_escape_char(Context& ctx)
|
||||
{
|
||||
// we only care about a few accented chars, and add them unaccented, lowercase, since that's
|
||||
// we do for indexing anyway.
|
||||
constexpr std::array escs = {
|
||||
"breve",
|
||||
"caron",
|
||||
"circ",
|
||||
"cute",
|
||||
"grave",
|
||||
"horn"/*thorn*/,
|
||||
"macr",
|
||||
"slash",
|
||||
"strok",
|
||||
"tilde",
|
||||
"uml",
|
||||
};
|
||||
|
||||
auto unescape=[escs](std::string_view esc)->char {
|
||||
if (esc.empty())
|
||||
return ' ';
|
||||
auto first{static_cast<char>(::tolower(esc.at(0)))};
|
||||
auto rest=esc.substr(1);
|
||||
if (seq_some(escs, [&](auto&& e){return starts_with(rest, e);}))
|
||||
return first;
|
||||
else
|
||||
return ' ';
|
||||
};
|
||||
|
||||
size_t start_pos{ctx.position()};
|
||||
while (!ctx.done()) {
|
||||
if (ctx.looking_at(";")) {
|
||||
auto esc = ctx.substr(start_pos, ctx.position() - start_pos);
|
||||
ctx.raw_scraped() += unescape(esc);
|
||||
ctx.advance();
|
||||
return;
|
||||
}
|
||||
ctx.advance();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// a block of text to be scraped
|
||||
static void
|
||||
text(Context& ctx)
|
||||
{
|
||||
size_t start_pos{ctx.position()};
|
||||
while (!ctx.done()) {
|
||||
|
||||
if (ctx.looking_at("&")) {
|
||||
|
||||
ctx.raw_scraped() += ctx.substr(start_pos,
|
||||
ctx.position() - start_pos);
|
||||
ctx.advance();
|
||||
html_escape_char(ctx);
|
||||
start_pos = ctx.position();
|
||||
|
||||
} else if (ctx.looking_at("<")) {
|
||||
ctx.raw_scraped() += ctx.substr(start_pos,
|
||||
ctx.position() - start_pos);
|
||||
ctx.advance();
|
||||
tag(ctx);
|
||||
start_pos = ctx.position();
|
||||
|
||||
} else
|
||||
ctx.advance();
|
||||
}
|
||||
|
||||
ctx.raw_scraped() += ctx.substr(start_pos, ctx.size() - start_pos);
|
||||
}
|
||||
|
||||
static Context *CTX{};
|
||||
|
||||
std::string
|
||||
Mu::html_to_text(const std::string& html)
|
||||
{
|
||||
Context ctx{html};
|
||||
CTX = &ctx;
|
||||
|
||||
text(ctx);
|
||||
|
||||
CTX = {};
|
||||
return ctx.scraped();
|
||||
}
|
||||
|
||||
#ifdef BUILD_TESTS
|
||||
#include "mu-test-utils.hh"
|
||||
|
||||
static void
|
||||
test_1()
|
||||
{
|
||||
static std::vector<std::pair<std::string, std::string>>
|
||||
tests = {
|
||||
{ "<!-- Hello -->A", "A" },
|
||||
{ "A<!-- Test -->B", "A B" },
|
||||
{ "A<i>a</i><b>p</b>", "Aap"},
|
||||
{ "N&ocute;Ôt", "Noot"},
|
||||
{
|
||||
"foo<!-- bar --><i>c</i>uu<bla>x</bla>"
|
||||
"<!--hello -->world<!--",
|
||||
"foo cuu x world"
|
||||
}
|
||||
};
|
||||
|
||||
for (auto&& test: tests)
|
||||
assert_equal(html_to_text(test.first), test.second);
|
||||
}
|
||||
|
||||
static void
|
||||
test_2()
|
||||
{
|
||||
static std::vector<std::pair<std::string, std::string>>
|
||||
tests = {
|
||||
{ R"(<i>hello, <b bar="/b">world!</b>)",
|
||||
"hello, world!"},
|
||||
};
|
||||
|
||||
for (auto&& test: tests)
|
||||
assert_equal(html_to_text(test.first), test.second);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
test_3()
|
||||
{
|
||||
static std::vector<std::pair<std::string, std::string>>
|
||||
tests = {
|
||||
{R"(<i>hello, </i><script language="javascript">
|
||||
function foo() {
|
||||
alert("Stroopwafel!"); // test
|
||||
}
|
||||
</script>world!)",
|
||||
"hello, world!"},
|
||||
};
|
||||
|
||||
for (auto&& test: tests)
|
||||
assert_equal(html_to_text(test.first), test.second);
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char* argv[])
|
||||
{
|
||||
mu_test_init(&argc, &argv);
|
||||
|
||||
g_test_add_func("/html-to-text/test-1", test_1);
|
||||
g_test_add_func("/html-to-text/test-2", test_2);
|
||||
g_test_add_func("/html-to-text/test-3", test_3);
|
||||
|
||||
return g_test_run();
|
||||
}
|
||||
|
||||
|
||||
#endif /*BUILD_TESTS*/
|
||||
|
||||
|
||||
#ifdef BUILD_HTML_TO_TEXT
|
||||
|
||||
#include "mu-utils-file.hh"
|
||||
|
||||
// simple tool that reads html on stdin and outputs text on stdout
|
||||
// e.g. curl --silent https://www.example.com | build/lib/utils/mu-html2text
|
||||
|
||||
int
|
||||
main (int argc, char *argv[])
|
||||
{
|
||||
auto res = read_from_stdin();
|
||||
if (!res) {
|
||||
mu_printerrln("error reading from stdin: {}", res.error().what());
|
||||
return 1;
|
||||
}
|
||||
|
||||
mu_println("{}", html_to_text(*res));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /*BUILD_HTML_TO_TEXT*/
|
|
@ -265,6 +265,16 @@ std::string date_to_time_t_string(int64_t t);
|
|||
*/
|
||||
std::string time_to_string(const char *frm, time_t t, bool utc = false) G_GNUC_CONST;
|
||||
|
||||
/**
|
||||
* Crudely convert HTML to plain text. This attempts to scrape the
|
||||
* human-readable text from html-email so we can use it for indexing.
|
||||
*
|
||||
* @param html html
|
||||
*
|
||||
* @return plain text
|
||||
*/
|
||||
std::string html_to_text(const std::string& html);
|
||||
|
||||
/**
|
||||
* Hack to avoid locale crashes
|
||||
*
|
||||
|
|
|
@ -153,22 +153,10 @@ static void
|
|||
test_mu_find_02(void)
|
||||
{
|
||||
/* when matching html as if it were text,
|
||||
* 'bull' is also matched in arto.eml, &bull;
|
||||
*/
|
||||
// search("bull", 1);
|
||||
// search("bull m:foo", 0);
|
||||
// search("bull m:/foo", 1);
|
||||
// search("bull m:/Foo", 1);
|
||||
// search("bull flag:attach", 1);
|
||||
// search("bull flag:a", 1);
|
||||
|
||||
search("bull", 2);
|
||||
search("bull m:foo", 0);
|
||||
search("bull m:/foo", 2);
|
||||
search("bull m:/Foo", 2);
|
||||
search("bull flag:attach", 1);
|
||||
search("bull flag:a", 1);
|
||||
* 'bull' is also matched in arto.eml, &bull; however,
|
||||
* we don't do that anymore! */
|
||||
|
||||
search("bull", 1);
|
||||
|
||||
search("g:x", 0);
|
||||
search("flag:encrypted", 0);
|
||||
|
|
Loading…
Reference in New Issue