message: try to detect body text language

Try to detect the language of the e-mail body and make it searchable.
This commit is contained in:
Dirk-Jan C. Binnema 2023-05-11 23:22:29 +03:00
parent ad64093183
commit 7f2eeb1010
4 changed files with 29 additions and 9 deletions

View File

@ -165,7 +165,6 @@ test_field_from_name()
Field::Id::Bcc);
}
static void
test_xapian_term()
{

View File

@ -52,6 +52,7 @@ struct Field {
File, /**< Filename */
Flags, /**< Message flags */
From, /**< Message sender */
Language, /**< Body language */
Maildir, /**< Maildir path */
MailingList, /**< Mailing list */
MessageId, /**< Message Id */
@ -252,7 +253,6 @@ static constexpr std::array<Field, Field::id_size()>
Field::Flag::IncludeInSexp |
Field::Flag::IndexableTerm,
},
{
Field::Id::Changed,
Field::Type::TimeT,
@ -316,6 +316,17 @@ static constexpr std::array<Field, Field::id_size()>
Field::Flag::IncludeInSexp |
Field::Flag::IndexableTerm,
},
{
Field::Id::Language,
Field::Type::String,
"language", "lang",
"ISO 639-1 language code for body",
"lang:nl",
'a',
Field::Flag::BooleanTerm |
Field::Flag::Value |
Field::Flag::IncludeInSexp
},
{
Field::Id::Maildir,
Field::Type::String,

View File

@ -29,6 +29,7 @@
#include <utils/mu-utils.hh>
#include <utils/mu-error.hh>
#include <utils/mu-option.hh>
#include <utils/mu-lang-detector.hh>
#include <atomic>
#include <mutex>
@ -67,6 +68,8 @@ struct Message::Private {
Option<std::string> body_txt;
Option<std::string> body_html;
Option<std::string> embedded;
Option<std::string> language; /* body ISO language code */
};
@ -531,6 +534,14 @@ process_message(const MimeMessage& mime_msg, const std::string& path,
info.mailing_list = get_mailing_list(mime_msg);
if (info.mailing_list)
info.flags |= Flags::MailingList;
if (info.body_txt) { /* attempt to get the body-language */
if (const auto lang{detect_language(info.body_txt.value())}; lang) {
info.language = lang->code;
g_debug("detected language: %s", lang->code);
} else
g_debug("could not detect language");
}
}
static Mu::Result<std::string>
@ -586,8 +597,6 @@ fake_message_id(const std::string& path)
* based on a field. So we add them here.
*/
static void
doc_add_list_post(Document& doc, const MimeMessage& mime_msg)
{
@ -643,7 +652,7 @@ fill_document(Message::Private& priv)
doc_add_reply_to(doc, mime_msg); /* only in sexp */
field_for_each([&](auto&& field) {
/* insist on expliclity handling each */
/* insist on explicitly handling each */
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
switch(field.id) {
@ -652,7 +661,6 @@ fill_document(Message::Private& priv)
break;
case Field::Id::BodyText:
doc.add(field.id, priv.body_txt);
break;
case Field::Id::Cc:
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
@ -676,6 +684,9 @@ fill_document(Message::Private& priv)
case Field::Id::From:
doc.add(field.id, mime_msg.contacts(Contact::Type::From));
break;
case Field::Id::Language:
doc.add(field.id, priv.language);
break;
case Field::Id::Maildir: /* already */
break;
case Field::Id::MailingList:

View File

@ -110,7 +110,6 @@ public:
}
/* LCOV_EXCL_STOP */
/**
* Construct a message from a string. This is mostly useful for testing.
*
@ -406,8 +405,8 @@ public:
const std::vector<Part>& parts() const;
/**
* Get the path to a cche directory for this message, which
* is useful for temporarily saving attachments
* Get the path to a cache directory for this message, which is useful
* for temporarily saving attachments
*
* @param index optionally, create <cache-path>/<index> instead;
* this is useful for having part-specific subdirectories.