From abfa6f277c70392dee51206b7a8b0a3e93af028d Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Tue, 31 Jan 2023 23:12:05 +0200 Subject: [PATCH] mu: index html text as if it were plain text This is a bit of a hack to include html text in search results. Of course, html text is not really plain text, so this is only a stopgap until we introduce a proper html parsing step. --- lib/message/mu-document.cc | 13 +++++++- lib/tests/test-mu-store-query.cc | 52 ++++++++++++++++++++++++++++++++ mu/tests/test-mu-cmd.cc | 33 +++++++++++--------- 3 files changed, 83 insertions(+), 15 deletions(-) diff --git a/lib/message/mu-document.cc b/lib/message/mu-document.cc index 3c43aa0f..ee366c3e 100644 --- a/lib/message/mu-document.cc +++ b/lib/message/mu-document.cc @@ -83,6 +83,16 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va throw std::logic_error("not a search term"); } +/* hack... import html text as if it were plain text. */ +static void +add_body_html(Xapian::Document& doc, const Field& field, const std::string& val) +{ + static Field body_field = field_from_id(Field::Id::BodyText); + + Xapian::TermGenerator termgen; + termgen.set_document(doc); + termgen.index_text(utf8_flatten(val), 1, body_field.xapian_term()); +} void Document::add(Field::Id id, const std::string& val) @@ -94,7 +104,8 @@ Document::add(Field::Id id, const std::string& val) if (field.is_searchable()) add_search_term(xdoc_, field, val); - + else if (id == Field::Id::XBodyHtml) + add_body_html(xdoc_, field, val); if (field.include_in_sexp()) { put_prop(field, val); } diff --git a/lib/tests/test-mu-store-query.cc b/lib/tests/test-mu-store-query.cc index e2b8f13c..ea43108a 100644 --- a/lib/tests/test-mu-store-query.cc +++ b/lib/tests/test-mu-store-query.cc @@ -718,9 +718,60 @@ Boo! assert_valid_result(qr); g_assert_cmpuint(qr->size(), ==, 3); } +} + + +static void +test_html() +{ + // test message sent to self, and copy of received msg. 
+ + const auto test_msg = R"(From: Test +To: abc@example.com +Date: Mon, 23 May 2011 10:53:45 +0200 +Subject: vla +MIME-Version: 1.0 +Content-Type: multipart/alternative; + boundary="_=aspNetEmail=_5ed4592191214c7a99bd7f6a3a0f077d" +Message-ID: <10374608.109906.11909.20115aabbccdd.MSGID@mailinglijst.nl> + +--_=aspNetEmail=_5ed4592191214c7a99bd7f6a3a0f077d +Content-Type: text/plain; charset="iso-8859-15" +Content-Transfer-Encoding: quoted-printable + +text + +--_=aspNetEmail=_5ed4592191214c7a99bd7f6a3a0f077d +Content-Type: text/html; charset="iso-8859-15" +Content-Transfer-Encoding: quoted-printable + +html + +--_=aspNetEmail=_5ed4592191214c7a99bd7f6a3a0f077d-- +)"; + const TestMap test_msgs = {{"inbox/cur/msg1", test_msg }}; + + TempDir tdir; + auto store{make_test_store(tdir.path(), test_msgs, {})}; + g_assert_cmpuint(store.size(), ==, 1); + + { + auto qr = store.run_query("body:text", Field::Id::Date, + QueryFlags::None); + assert_valid_result(qr); + g_assert_cmpuint(qr->size(), ==, 1); + } + + { + auto qr = store.run_query("body:html", Field::Id::Date, + QueryFlags::None); + assert_valid_result(qr); + g_assert_cmpuint(qr->size(), ==, 1); + } } + int main(int argc, char* argv[]) { @@ -745,6 +796,7 @@ main(int argc, char* argv[]) test_term_split); g_test_add_func("/store/query/related-dup-threaded", test_related_dup_threaded); + g_test_add_func("/store/query/html", test_html); return g_test_run(); } diff --git a/mu/tests/test-mu-cmd.cc b/mu/tests/test-mu-cmd.cc index 9693538c..d70cc80d 100644 --- a/mu/tests/test-mu-cmd.cc +++ b/mu/tests/test-mu-cmd.cc @@ -1,5 +1,5 @@ /* -** Copyright (C) 2008-2022 Dirk-Jan C. Binnema +** Copyright (C) 2008-2023 Dirk-Jan C. 
Binnema ** ** This program is free software; you can redistribute it and/or modify it ** under the terms of the GNU General Public License as published by the @@ -150,15 +150,29 @@ test_mu_find_01(void) static void test_mu_find_02(void) { - search("bull", 1); + /* when matching html as if it were text, + * 'bull' is also matched in arto.eml, &bull; + */ + // search("bull", 1); + // search("bull m:foo", 0); + // search("bull m:/foo", 1); + // search("bull m:/Foo", 1); + // search("bull flag:attach", 1); + // search("bull flag:a", 1); + + search("bull", 2); search("bull m:foo", 0); - search("bull m:/foo", 1); - search("bull m:/Foo", 1); + search("bull m:/foo", 2); + search("bull m:/Foo", 2); search("bull flag:attach", 1); search("bull flag:a", 1); + + search("g:x", 0); search("flag:encrypted", 0); search("flag:attach", 1); + + search("i:3BE9E6535E0D852173@emss35m06.us.lmco.com", 1); } static void @@ -191,17 +205,9 @@ test_mu_find_text_in_rfc822(void) } /* some more tests */ -static void -test_mu_find_03(void) -{ - search("bull", 1); - search("bull m:foo", 0); - search("bull m:/foo", 1); - search("i:3BE9E6535E0D852173@emss35m06.us.lmco.com", 1); -} static void /* error cases */ -test_mu_find_04(void) +test_mu_find_03() { gchar *cmdline, *erroutput; @@ -840,7 +846,6 @@ main(int argc, char* argv[]) g_test_add_func("/mu-cmd/test-mu-find-text-in-rfc822", test_mu_find_text_in_rfc822); g_test_add_func("/mu-cmd/test-mu-find-03", test_mu_find_03); - g_test_add_func("/mu-cmd/test-mu-find-04", test_mu_find_04); g_test_add_func("/mu-cmd/test-mu-find-maildir-special", test_mu_find_maildir_special); g_test_add_func("/mu-cmd/test-mu-extract-01", test_mu_extract_01); g_test_add_func("/mu-cmd/test-mu-extract-02", test_mu_extract_02);