lib: support 'personal' regexp, move to mu-contacts

Move the determination of "personal" to MuContacts; add support for
regexps (POSIX basic syntax, written between slashes, i.e. /.../)
This commit is contained in:
Dirk-Jan C. Binnema 2020-10-13 23:38:26 +03:00
parent 5cd6226ebd
commit dbff5671dd
7 changed files with 186 additions and 99 deletions

View File

@ -25,6 +25,7 @@
#include <sstream>
#include <functional>
#include <algorithm>
#include <regex>
#include <utils/mu-utils.hh>
#include <glib.h>
@ -34,7 +35,21 @@ using namespace Mu;
ContactInfo::ContactInfo (const std::string& _full_address,
const std::string& _email,
const std::string& _name,
bool _personal, time_t _last_seen, size_t _freq):
time_t _last_seen):
full_address{_full_address},
email{_email},
name{_name},
last_seen{_last_seen},
freq{1},
tstamp{g_get_monotonic_time()} {}
ContactInfo::ContactInfo (const std::string& _full_address,
const std::string& _email,
const std::string& _name,
bool _personal,
time_t _last_seen,
size_t _freq):
full_address{_full_address},
email{_email},
name{_name},
@ -43,7 +58,6 @@ ContactInfo::ContactInfo (const std::string& _full_address,
freq{_freq},
tstamp{g_get_monotonic_time()} {}
struct EmailHash {
std::size_t operator()(const std::string& email) const {
std::size_t djb = 5381; // djb hash
@ -95,19 +109,55 @@ using ContactUMap = std::unordered_map<const std::string, ContactInfo, EmailHash
using ContactSet = std::set<std::reference_wrapper<const ContactInfo>, ContactInfoLessThan>;
// Private implementation data for Contacts: the contact map, the mutex and
// the matchers for 'personal' addresses.
struct Contacts::Private {
// Construct from serialized contacts only; no personal entries are set up.
Private(const std::string& serialized):
contacts_{deserialize(serialized)}
{}
// Construct from serialized contacts, then sort the 'personal' entries
// into plain addresses and regexps through make_personal().
Private(const std::string& serialized,
const StringVec& personal):
contacts_{deserialize(serialized)} {
make_personal(personal);
}
// Fill personal_plain_ and personal_rx_ from the given entries.
void make_personal(const StringVec& personal);
// Build the contact map from its serialized string form.
ContactUMap deserialize(const std::string&) const;
// Produce the serialized string form.
std::string serialize() const;
ContactUMap contacts_; // the contacts map
std::mutex mtx_; // locked by Contacts::add() and Contacts::_find()
StringVec personal_plain_; // personal addresses that are compared literally
std::vector<std::regex> personal_rx_; // personal addresses given as regexps
};
constexpr auto Separator = "\xff"; // Invalid in UTF-8
// Sort the user's 'personal' entries into two buckets: entries wrapped in
// slashes (/.../) are compiled as case-insensitive POSIX-basic regexps and
// stored in personal_rx_; everything else is stored verbatim in
// personal_plain_. Empty entries are skipped; a regexp that fails to compile
// is reported with g_warning and dropped.
void
Contacts::Private::make_personal (const StringVec& personal)
{
	const auto rx_flags = std::regex::basic |
		std::regex::optimize |
		std::regex::icase;

	for (const auto& entry: personal) {

		if (entry.empty())
			continue; // invalid

		const bool slash_wrapped = entry.size() >= 2 &&
			entry.front() == '/' && entry.back() == '/';
		if (!slash_wrapped) {
			personal_plain_.emplace_back(entry); // normal address
			continue;
		}

		// a regex pattern; strip the surrounding slashes.
		try {
			const auto pattern{entry.substr(1, entry.size() - 2)};
			personal_rx_.emplace_back(std::regex(pattern, rx_flags));
		} catch (const std::regex_error& err) {
			g_warning ("invalid personal address regexp '%s': %s",
				   entry.c_str(), err.what());
		}
	}
}
ContactUMap
Contacts::Private::deserialize(const std::string& serialized) const
{
@ -131,15 +181,14 @@ Contacts::Private::deserialize(const std::string& serialized) const
(std::size_t)g_ascii_strtoll(parts[5].c_str(), NULL, 10)); // freq
contacts.emplace(std::move(parts[1]), std::move(ci));
}
return contacts;
}
Contacts::Contacts (const std::string& serialized) :
priv_{std::make_unique<Private>(serialized)}
Contacts::Contacts (const std::string& serialized, const StringVec& personal) :
priv_{std::make_unique<Private>(serialized, personal)}
{}
Contacts::~Contacts() = default;
@ -170,44 +219,42 @@ Contacts::serialize() const
}
// Remove every newline character from @p str, in place. For now, newlines
// are the only thing we need to get rid of; all other characters are kept.
static void
wash (std::string& str)
{
	const auto is_newline = [](char c) { return c == '\n'; };
	const auto new_end = std::remove_if(str.begin(), str.end(), is_newline);
	str.erase(new_end, str.end());
}
void
Contacts::add (ContactInfo&& ci)
{
std::lock_guard<std::mutex> l_{priv_->mtx_};
auto down = g_ascii_strdown (ci.email.c_str(), -1);
std::string email{down};
g_free(down);
auto it = priv_->contacts_.find(ci.email);
auto it = priv_->contacts_.find(email);
if (it != priv_->contacts_.end()) {
auto& ci2 = it->second;
++ci2.freq;
if (ci.last_seen > ci2.last_seen) {
ci2.last_seen = ci.last_seen;
wash(ci.email);
ci2.email = std::move(ci.email);
if (!ci.name.empty()) {
wash(ci.name);
ci2.name = std::move(ci.name);
}
if (it == priv_->contacts_.end()) { // completely new contact
wash(ci.name);
wash(ci.full_address);
ci.freq = 1;
ci.personal = is_personal(ci.email);
auto email{ci.email};
priv_->contacts_.emplace(ContactUMap::value_type(email, std::move(ci)));
} else { // existing contact.
auto& ci_existing{it->second};
++ci_existing.freq;
if (ci.last_seen > ci_existing.last_seen) {
// update.
wash(ci.name);
ci_existing.name = std::move(ci.name);
ci_existing.email = std::move(ci.email);
wash(ci.full_address);
ci_existing.full_address = std::move(ci.full_address);
ci_existing.tstamp = g_get_monotonic_time();
}
}
wash(ci.name);
wash(ci.email);
wash(ci.full_address);
priv_->contacts_.emplace(
ContactUMap::value_type(std::move(email), std::move(ci)));
}
@ -216,8 +263,7 @@ Contacts::_find (const std::string& email) const
{
std::lock_guard<std::mutex> l_{priv_->mtx_};
ContactInfo ci{"", email, "", false, 0};
const auto it = priv_->contacts_.find(ci.email);
const auto it = priv_->contacts_.find(email);
if (it == priv_->contacts_.end())
return {};
else
@ -260,6 +306,23 @@ Contacts::for_each(const EachContactFunc& each_contact) const
each_contact (ci);
}
/**
 * Does @p addr look like one of the 'personal' addresses?
 *
 * The address is first compared, ASCII-case-insensitively, with each of
 * the plain personal addresses; after that, it is tried against each of
 * the personal regexps, which have to match the address as a whole.
 *
 * @param addr some e-mail address
 *
 * @return true if some plain address or regexp matches, false otherwise
 */
bool
Contacts::is_personal(const std::string& addr) const
{
	for (auto&& p: priv_->personal_plain_)
		if (g_ascii_strcasecmp(addr.c_str(), p.c_str()) == 0)
			return true;

	// we only need a yes/no answer, so use the regex_match overload that
	// does not fill in match results.
	// (perhaps cache addr in personal_plain_?)
	for (auto&& rx: priv_->personal_rx_)
		if (std::regex_match(addr, rx))
			return true;

	return false;
}
/// C binding
size_t

View File

@ -34,6 +34,7 @@ typedef struct _MuContacts MuContacts;
#include <string>
#include <time.h>
#include <inttypes.h>
#include <utils/mu-utils.hh>
namespace Mu {
@ -46,25 +47,38 @@ struct ContactInfo {
* @param _full_address the full email address + name.
* @param _email email address
* @param _name name or empty
* @param _personal is this a personal contact?
* @param _last_seen when was this contact last seen?
* @param _freq how often was this contact seen?
*
* @return
*/
ContactInfo (const std::string& _full_address,
const std::string& _email,
const std::string& _name,
bool _personal, time_t _last_seen, size_t _freq=1);
time_t _last_seen);
/**
* Construct a new ContactInfo
*
* @param _full_address the full email address + name.
* @param _email email address
* @param _name name or empty
* @param _personal is this a personal contact?
* @param _last_seen when was this contact last seen?
* @param _freq how often was this contact seen?
*/
ContactInfo (const std::string& _full_address,
const std::string& _email,
const std::string& _name,
bool personal,
time_t _last_seen,
size_t freq);
std::string full_address; /**< Full name <email> */
std::string email; /**< email address */
std::string name; /**< name (or empty) */
bool personal; /**< is this a personal contact? */
time_t last_seen; /**< when was this contact last seen? */
std::size_t freq; /**< how often was this contact seen? */
bool personal{}; /**< is this a personal contact? */
time_t last_seen{}; /**< when was this contact last seen? */
std::size_t freq{}; /**< how often was this contact seen? */
int64_t tstamp; /**< Time-stamp, as per g_get_monotonic_time */
int64_t tstamp{}; /**< Time-stamp, as per g_get_monotonic_time */
};
/// All contacts
@ -74,8 +88,10 @@ public:
* Construct a new contacts object
*
* @param serialized serialized contacts
* @param personal personal addresses
*/
Contacts (const std::string& serialized = "");
Contacts (const std::string& serialized = "",
const StringVec& personal={});
/**
* DTOR
@ -118,6 +134,16 @@ public:
*/
std::string serialize() const;
/**
* Does this look like a 'personal' address?
*
* @param addr some e-mail address
*
* @return true or false
*/
bool is_personal(const std::string& addr) const;
/**
* Find a contact based on the email address. This is not safe, since
* the returned ptr can be invalidated at any time; only for unit-tests.

View File

@ -114,7 +114,7 @@ struct Store::Private {
Private (const std::string& path, bool readonly):
db_{make_xapian(path, readonly ? XapianOpts::ReadOnly : XapianOpts::Open)},
mdata_{make_metadata(path)},
contacts_{db()->get_metadata(ContactsKey)} {
contacts_{db()->get_metadata(ContactsKey), mdata_.personal_addresses} {
if (!readonly)
wdb()->begin_transaction();
@ -123,7 +123,8 @@ struct Store::Private {
Private (const std::string& path, const std::string& root_maildir,
const StringVec& personal_addresses, const Store::Config& conf):
db_{make_xapian(path, XapianOpts::CreateOverwrite)},
mdata_{init_metadata(conf, path, root_maildir, personal_addresses)} {
mdata_{init_metadata(conf, path, root_maildir, personal_addresses)},
contacts_{"", mdata_.personal_addresses} {
wdb()->begin_transaction();
}
@ -307,7 +308,6 @@ Store::metadata() const
const Contacts&
Store::contacts() const
{
LOCKED;
return priv_->contacts_;
}
@ -1045,32 +1045,11 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
contacts.add(Mu::ContactInfo(contact->full_address,
contact->email,
contact->name ? contact->name : "",
msgdoc->_personal,
mu_msg_get_date(msgdoc->_msg)));
}
return TRUE;
}
// Contact-foreach callback: flag the message as 'personal' (msgdoc->_personal)
// when the contact's e-mail address equals, ASCII-case-insensitively, one of
// the addresses in msgdoc->_my_addresses. Always returns TRUE.
static gboolean
each_contact_check_if_personal (MuMsgContact *contact, MsgDoc *msgdoc)
{
	// only look further if not yet deemed personal and there is an address
	const bool need_check = !msgdoc->_personal && contact->email;

	if (need_check) {
		for (const auto& my_addr : *msgdoc->_my_addresses) {
			if (g_ascii_strcasecmp (contact->email, my_addr.c_str()) != 0)
				continue;
			msgdoc->_personal = TRUE;
			break;
		}
	}

	return TRUE;
}
static Xapian::Document
new_doc_from_message (MuStore *store, MuMsg *msg)
{
@ -1079,17 +1058,20 @@ new_doc_from_message (MuStore *store, MuMsg *msg)
mu_msg_field_foreach ((MuMsgFieldForeachFunc)add_terms_values, &docinfo);
/* determine whether this is 'personal' email, ie. one of my
* e-mail addresses is explicitly mentioned -- it's not a
* mailing list message. Callback will update docinfo->_personal */
const auto& personal_addresses = self(store)->metadata().personal_addresses;
if (personal_addresses.size()) {
docinfo._my_addresses = &personal_addresses;
mu_msg_contact_foreach
(msg,
(MuMsgContactForeachFunc)each_contact_check_if_personal,
&docinfo);
}
mu_msg_contact_foreach
(msg, [](auto contact, gpointer msgdocptr)->gboolean {
auto msgdoc{reinterpret_cast<MsgDoc*>(msgdocptr)};
if (!contact->email)
return FALSE; // invalid contact
else if (msgdoc->_personal)
return TRUE; // already deemed personal
if (msgdoc->_store->contacts().is_personal(contact->email))
msgdoc->_personal = true; // this one's personal.
return TRUE;
}, &docinfo);
/* also store the contact-info as separate terms, and add it
* to the cache */

View File

@ -96,8 +96,6 @@ public:
* @return the metadata
*/
const Metadata& metadata() const;
/**
* Get the Contacts object for this store
*
@ -105,7 +103,6 @@ public:
*/
const Contacts& contacts() const;
/**
* Get the Indexer associated with this store. It is an error
* to call this on a read-only store.
@ -177,7 +174,6 @@ public:
*/
bool contains_message (const std::string& path) const;
/**
* Prototype for the ForEachFunc
*

View File

@ -33,25 +33,21 @@ test_mu_contacts_01()
g_assert_cmpuint (contacts.size(), ==, 0);
contacts.add(std::move(Mu::ContactInfo ("Foo <foo.bar@example.com>",
"foo.bar@example.com", "Foo",
false, 12345)));
"foo.bar@example.com", "Foo", 12345)));
g_assert_false (contacts.empty());
g_assert_cmpuint (contacts.size(), ==, 1);
contacts.add(std::move(Mu::ContactInfo ("Cuux <cuux.fnorb@example.com>",
"cuux@example.com", "Cuux", true,
54321)));
"cuux@example.com", "Cuux", 54321)));
g_assert_cmpuint (contacts.size(), ==, 2);
contacts.add(std::move(Mu::ContactInfo ("foo.bar@example.com",
"foo.bar@example.com", "Foo",
false, 77777)));
"foo.bar@example.com", "Foo", 77777)));
g_assert_cmpuint (contacts.size(), ==, 2);
contacts.add(std::move(Mu::ContactInfo ("Foo.Bar@Example.Com",
"Foo.Bar@Example.Com", "Foo",
false, 88888)));
"Foo.Bar@Example.Com", "Foo", 88888)));
g_assert_cmpuint (contacts.size(), ==, 2);
// note: replaces first.
@ -60,7 +56,6 @@ test_mu_contacts_01()
g_assert_false (info);
}
{
const auto info = contacts._find("foo.BAR@example.com");
g_assert_true (info);
@ -73,6 +68,27 @@ test_mu_contacts_01()
g_assert_cmpuint (contacts.size(), ==, 0);
}
// Check Contacts::is_personal() with two plain personal addresses and one
// regexp entry (the one wrapped in slashes).
static void
test_mu_contacts_02()
{
Mu::StringVec personal = {
"foo@example.com",
"bar@cuux.org",
"/bar-.*@fnorb.f./" // wrapped in /.../, so treated as a regexp
};
// no serialized contacts; only the personal entries matter here
Mu::Contacts contacts{"", personal};
// expected to be personal: exact match, differently-cased match of a
// plain address, and two addresses matching the regexp
g_assert_true (contacts.is_personal("foo@example.com"));
g_assert_true (contacts.is_personal("Bar@CuuX.orG"));
g_assert_true (contacts.is_personal("bar-123abc@fnorb.fi"));
g_assert_true (contacts.is_personal("bar-zzz@fnorb.fr"));
// expected not to be personal: unknown address, non-ASCII variation of a
// plain address, and two addresses the regexp does not fully match
g_assert_false (contacts.is_personal("foo@bar.com"));
g_assert_false (contacts.is_personal("BÂr@CuuX.orG"));
g_assert_false (contacts.is_personal("bar@fnorb.fi"));
g_assert_false (contacts.is_personal("bar-zzz@fnorb.xr"));
}
int
@ -81,6 +97,7 @@ main (int argc, char *argv[])
g_test_init (&argc, &argv, NULL);
g_test_add_func ("/mu-contacts/01", test_mu_contacts_01);
g_test_add_func ("/mu-contacts/02", test_mu_contacts_02);
g_log_set_handler (NULL,
(GLogLevelFlags)

View File

@ -79,7 +79,6 @@ test_store_add_count_remove ()
}
int
main (int argc, char *argv[])
{

View File

@ -1,4 +1,4 @@
.TH MU-INIT 1 "February 2020" "User Manuals"
.TH MU-INIT 1 "October 2020" "User Manuals"
.SH NAME
@ -10,13 +10,14 @@ mu init \- initialize the mu message database
.SH DESCRIPTION
\fBmu init\fR is the \fBmu\fR command for setting up the mu message
database. After \fBmu init\fR has completed, you can run \fBmu index\fR
\fBmu init\fR is the subcommand for setting up the mu message
database. After \fBmu init\fR has completed, you can run \fBmu
index\fR
.SH OPTIONS
Note, some of the general options are described in the \fBmu(1)\fR man-page and
not here, as they apply to multiple mu commands.
Note, some of the general options are described in the \fBmu(1)\fR
man-page and not here, as they apply to multiple mu commands.
.TP
\fB\-\-muhome\fR
@ -34,7 +35,6 @@ are not supported.
.TP
\fB\-\-my-address\fR=\fI<my-email-address>\fR
specifies that some e-mail addresses are 'my-address' (\fB\-\-my-address\fR can
be used multiple times). This is used by \fBmu cfind\fR -- any e-mail address
found in the address fields of a message which also has \fI<my-email-address>\fR
@ -42,6 +42,10 @@ in one of its address fields is considered a \fIpersonal\fR e-mail address. This
allows you, for example, to filter out (\fBmu cfind --personal\fR) addresses
which were merely seen in mailing list messages.
\fI<my-email-address>\fR can be either a plain e-mail address (such as
\fBfoo@example.com\fR), or a regular-expression (of the 'Basic POSIX'
flavor), wrapped in \fB/\fR (such as \fB/foo-.*@example\\.com/\fR).
.SH ENVIRONMENT
\fBmu init\fR uses \fBMAILDIR\fR to find the user's Maildir if it has not been