From 7f2eeb1010b7cadc604afc9128df01af62a109dd Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Thu, 11 May 2023 23:22:29 +0300 Subject: [PATCH] message: try to detect body text language Try to detect the language of the e-mail body and make it searchable. --- lib/message/mu-fields.cc | 1 - lib/message/mu-fields.hh | 13 ++++++++++++- lib/message/mu-message.cc | 19 +++++++++++++++---- lib/message/mu-message.hh | 5 ++--- 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/lib/message/mu-fields.cc b/lib/message/mu-fields.cc index 6727e573..d5ce4d23 100644 --- a/lib/message/mu-fields.cc +++ b/lib/message/mu-fields.cc @@ -165,7 +165,6 @@ test_field_from_name() Field::Id::Bcc); } - static void test_xapian_term() { diff --git a/lib/message/mu-fields.hh b/lib/message/mu-fields.hh index 6957c2b9..c9b56285 100644 --- a/lib/message/mu-fields.hh +++ b/lib/message/mu-fields.hh @@ -52,6 +52,7 @@ struct Field { File, /**< Filename */ Flags, /**< Message flags */ From, /**< Message sender */ + Language, /**< Body language */ Maildir, /**< Maildir path */ MailingList, /**< Mailing list */ MessageId, /**< Message Id */ @@ -252,7 +253,6 @@ static constexpr std::array Field::Flag::IncludeInSexp | Field::Flag::IndexableTerm, }, - { Field::Id::Changed, Field::Type::TimeT, @@ -316,6 +316,17 @@ static constexpr std::array Field::Flag::IncludeInSexp | Field::Flag::IndexableTerm, }, + { + Field::Id::Language, + Field::Type::String, + "language", "lang", + "ISO 639-1 language code for body", + "lang:nl", + 'a', + Field::Flag::BooleanTerm | + Field::Flag::Value | + Field::Flag::IncludeInSexp + }, { Field::Id::Maildir, Field::Type::String, diff --git a/lib/message/mu-message.cc b/lib/message/mu-message.cc index 4e2c2e13..06335f9d 100644 --- a/lib/message/mu-message.cc +++ b/lib/message/mu-message.cc @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -67,6 +68,8 @@ struct Message::Private { Option body_txt; Option body_html; Option embedded; + + 
Option language; /* body ISO language code */ }; @@ -531,6 +534,14 @@ process_message(const MimeMessage& mime_msg, const std::string& path, info.mailing_list = get_mailing_list(mime_msg); if (info.mailing_list) info.flags |= Flags::MailingList; + + if (info.body_txt) { /* attempt to get the body-language */ + if (const auto lang{detect_language(info.body_txt.value())}; lang) { + info.language = lang->code; + g_debug("detected language: %s", lang->code); + } else + g_debug("could not detect language"); + } } static Mu::Result @@ -586,8 +597,6 @@ fake_message_id(const std::string& path) * based on a field. So we add them here. */ - - static void doc_add_list_post(Document& doc, const MimeMessage& mime_msg) { @@ -643,7 +652,7 @@ fill_document(Message::Private& priv) doc_add_reply_to(doc, mime_msg); /* only in sexp */ field_for_each([&](auto&& field) { - /* insist on expliclity handling each */ + /* insist on explicitly handling each */ #pragma GCC diagnostic push #pragma GCC diagnostic error "-Wswitch" switch(field.id) { @@ -652,7 +661,6 @@ fill_document(Message::Private& priv) break; case Field::Id::BodyText: doc.add(field.id, priv.body_txt); - break; case Field::Id::Cc: doc.add(field.id, mime_msg.contacts(Contact::Type::Cc)); @@ -676,6 +684,9 @@ fill_document(Message::Private& priv) case Field::Id::From: doc.add(field.id, mime_msg.contacts(Contact::Type::From)); break; + case Field::Id::Language: + doc.add(field.id, priv.language); + break; case Field::Id::Maildir: /* already */ break; case Field::Id::MailingList: diff --git a/lib/message/mu-message.hh b/lib/message/mu-message.hh index 893b0a87..aff25716 100644 --- a/lib/message/mu-message.hh +++ b/lib/message/mu-message.hh @@ -110,7 +110,6 @@ public: } /* LCOV_EXCL_STOP */ - /** * Construct a message from a string. This is mostly useful for testing. 
* @@ -406,8 +405,8 @@ public: const std::vector& parts() const; /** - * Get the path to a cche directory for this message, which - * is useful for temporarily saving attachments + * Get the path to a cache directory for this message, which is useful + * for temporarily saving attachments * * @param index optionally, create <cache-path>/<index> instead; * this is useful for having part-specific subdirectories.