From dbff5671dd5cc3d01f6d3e7644166a4db0208a8f Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Tue, 13 Oct 2020 23:38:26 +0300 Subject: [PATCH] lib: support 'personal' regexp, move to mu-contacts Move the determination of "personal" to MuContacts; add support for regexps (POSIX-basic, in //) --- lib/mu-contacts.cc | 131 +++++++++++++++++++++++++++++----------- lib/mu-contacts.hh | 46 +++++++++++--- lib/mu-store.cc | 52 ++++++---------- lib/mu-store.hh | 4 -- lib/test-mu-contacts.cc | 35 ++++++++--- lib/test-mu-store.cc | 1 - man/mu-init.1 | 16 +++-- 7 files changed, 186 insertions(+), 99 deletions(-) diff --git a/lib/mu-contacts.cc b/lib/mu-contacts.cc index e5bb21f6..b59e0590 100644 --- a/lib/mu-contacts.cc +++ b/lib/mu-contacts.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,21 @@ using namespace Mu; ContactInfo::ContactInfo (const std::string& _full_address, const std::string& _email, const std::string& _name, - bool _personal, time_t _last_seen, size_t _freq): + time_t _last_seen): + full_address{_full_address}, + email{_email}, + name{_name}, + last_seen{_last_seen}, + freq{1}, + tstamp{g_get_monotonic_time()} {} + + +ContactInfo::ContactInfo (const std::string& _full_address, + const std::string& _email, + const std::string& _name, + bool _personal, + time_t _last_seen, + size_t _freq): full_address{_full_address}, email{_email}, name{_name}, @@ -43,7 +58,6 @@ ContactInfo::ContactInfo (const std::string& _full_address, freq{_freq}, tstamp{g_get_monotonic_time()} {} - struct EmailHash { std::size_t operator()(const std::string& email) const { std::size_t djb = 5381; // djb hash @@ -95,19 +109,55 @@ using ContactUMap = std::unordered_map, ContactInfoLessThan>; struct Contacts::Private { - Private(const std::string& serialized): - contacts_{deserialize(serialized)} - {} + Private(const std::string& serialized, + const StringVec& personal): + contacts_{deserialize(serialized)} { + make_personal(personal); + } + 
void make_personal(const StringVec& personal); ContactUMap deserialize(const std::string&) const; std::string serialize() const; ContactUMap contacts_; std::mutex mtx_; + + StringVec personal_plain_; + std::vector personal_rx_; }; constexpr auto Separator = "\xff"; // Invalid in UTF-8 +void +Contacts::Private::make_personal (const StringVec& personal) +{ + for (auto&& p: personal) { + + if (p.empty()) + continue; // invalid + + if (p.size() < 2 || p.at(0) != '/' || p.at(p.length() - 1) != '/') + personal_plain_.emplace_back(p); // normal address + else { + // a regex pattern. + try { + const auto rxstr{p.substr(1, p.length()-2)}; + personal_rx_.emplace_back( + std::regex(rxstr, + std::regex::basic | + std::regex::optimize | + std::regex::icase)); + + } catch (const std::regex_error& rex) { + g_warning ("invalid personal address regexp '%s': %s", + p.c_str(), rex.what()); + } + } + } +} + + + ContactUMap Contacts::Private::deserialize(const std::string& serialized) const { @@ -131,15 +181,14 @@ Contacts::Private::deserialize(const std::string& serialized) const (std::size_t)g_ascii_strtoll(parts[5].c_str(), NULL, 10)); // freq contacts.emplace(std::move(parts[1]), std::move(ci)); - } return contacts; } -Contacts::Contacts (const std::string& serialized) : - priv_{std::make_unique(serialized)} +Contacts::Contacts (const std::string& serialized, const StringVec& personal) : + priv_{std::make_unique(serialized, personal)} {} Contacts::~Contacts() = default; @@ -170,44 +219,42 @@ Contacts::serialize() const } -// for now, we only care about _not_ having newlines. 
static void wash (std::string& str) { str.erase(std::remove(str.begin(), str.end(), '\n'), str.end()); } - void Contacts::add (ContactInfo&& ci) { std::lock_guard l_{priv_->mtx_}; - auto down = g_ascii_strdown (ci.email.c_str(), -1); - std::string email{down}; - g_free(down); + auto it = priv_->contacts_.find(ci.email); - auto it = priv_->contacts_.find(email); - if (it != priv_->contacts_.end()) { - auto& ci2 = it->second; - ++ci2.freq; - if (ci.last_seen > ci2.last_seen) { - ci2.last_seen = ci.last_seen; - wash(ci.email); - ci2.email = std::move(ci.email); - if (!ci.name.empty()) { - wash(ci.name); - ci2.name = std::move(ci.name); - } + if (it == priv_->contacts_.end()) { // completely new contact + wash(ci.name); + wash(ci.full_address); + ci.freq = 1; + ci.personal = is_personal(ci.email); + auto email{ci.email}; + priv_->contacts_.emplace(ContactUMap::value_type(email, std::move(ci))); + } else { // existing contact. + auto& ci_existing{it->second}; + ++ci_existing.freq; + + if (ci.last_seen > ci_existing.last_seen) { + // update. 
+ wash(ci.name); + ci_existing.name = std::move(ci.name); + + ci_existing.email = std::move(ci.email); + + wash(ci.full_address); + ci_existing.full_address = std::move(ci.full_address); + ci_existing.tstamp = g_get_monotonic_time(); } } - - wash(ci.name); - wash(ci.email); - wash(ci.full_address); - - priv_->contacts_.emplace( - ContactUMap::value_type(std::move(email), std::move(ci))); } @@ -216,8 +263,7 @@ Contacts::_find (const std::string& email) const { std::lock_guard l_{priv_->mtx_}; - ContactInfo ci{"", email, "", false, 0}; - const auto it = priv_->contacts_.find(ci.email); + const auto it = priv_->contacts_.find(email); if (it == priv_->contacts_.end()) return {}; else @@ -260,6 +306,23 @@ Contacts::for_each(const EachContactFunc& each_contact) const each_contact (ci); } +bool +Contacts::is_personal(const std::string& addr) const +{ + for (auto&& p: priv_->personal_plain_) + if (g_ascii_strcasecmp(addr.c_str(), p.c_str()) == 0) + return true; + + for (auto&& rx: priv_->personal_rx_) { + std::smatch m; // perhaps cache addr in personal_plain_? + if (std::regex_match(addr, m, rx)) + return true; + } + + return false; +} + + /// C binding size_t diff --git a/lib/mu-contacts.hh b/lib/mu-contacts.hh index 7873cd62..c6e4b01e 100644 --- a/lib/mu-contacts.hh +++ b/lib/mu-contacts.hh @@ -34,6 +34,7 @@ typedef struct _MuContacts MuContacts; #include #include #include +#include namespace Mu { @@ -46,25 +47,38 @@ struct ContactInfo { * @param _full_address the full email address + name. * @param _email email address * @param _name name or empty - * @param _personal is this a personal contact? * @param _last_seen when was this contact last seen? - * @param _freq how often was this contact seen? 
- * - * @return */ ContactInfo (const std::string& _full_address, const std::string& _email, const std::string& _name, - bool _personal, time_t _last_seen, size_t _freq=1); + time_t _last_seen); + + /** + * Construct a new ContactInfo + * + * @param _full_address the full email address + name. + * @param _email email address + * @param _name name or empty + * @param _personal is this a personal contact? + * @param _last_seen when was this contact last seen? + * @param _freq how often was this contact seen? + */ + ContactInfo (const std::string& _full_address, + const std::string& _email, + const std::string& _name, + bool personal, + time_t _last_seen, + size_t freq); std::string full_address; /**< Full name */ std::string email; /**< email address */ std::string name; /**< name (or empty) */ - bool personal; /**< is this a personal contact? */ - time_t last_seen; /**< when was this contact last seen? */ - std::size_t freq; /**< how often was this contact seen? */ + bool personal{}; /**< is this a personal contact? */ + time_t last_seen{}; /**< when was this contact last seen? */ + std::size_t freq{}; /**< how often was this contact seen? */ - int64_t tstamp; /**< Time-stamp, as per g_get_monotonic_time */ + int64_t tstamp{}; /**< Time-stamp, as per g_get_monotonic_time */ }; /// All contacts @@ -74,8 +88,10 @@ public: * Construct a new contacts objects * * @param serialized serialized contacts + * @param personal personal addresses */ - Contacts (const std::string& serialized = ""); + Contacts (const std::string& serialized = "", + const StringVec& personal={}); /** * DTOR @@ -118,6 +134,16 @@ public: */ std::string serialize() const; + + /** + * Does this look like a 'personal' address? + * + * @param addr some e-mail address + * + * @return true or false + */ + bool is_personal(const std::string& addr) const; + /** * Find a contact based on the email address. This is not safe, since * the returned ptr can be invalidated at any time; only for unit-tests. 
diff --git a/lib/mu-store.cc b/lib/mu-store.cc index cc3b9eac..d0e7cc7a 100644 --- a/lib/mu-store.cc +++ b/lib/mu-store.cc @@ -114,7 +114,7 @@ struct Store::Private { Private (const std::string& path, bool readonly): db_{make_xapian(path, readonly ? XapianOpts::ReadOnly : XapianOpts::Open)}, mdata_{make_metadata(path)}, - contacts_{db()->get_metadata(ContactsKey)} { + contacts_{db()->get_metadata(ContactsKey), mdata_.personal_addresses} { if (!readonly) wdb()->begin_transaction(); @@ -123,7 +123,8 @@ struct Store::Private { Private (const std::string& path, const std::string& root_maildir, const StringVec& personal_addresses, const Store::Config& conf): db_{make_xapian(path, XapianOpts::CreateOverwrite)}, - mdata_{init_metadata(conf, path, root_maildir, personal_addresses)} { + mdata_{init_metadata(conf, path, root_maildir, personal_addresses)}, + contacts_{"", mdata_.personal_addresses} { wdb()->begin_transaction(); } @@ -307,7 +308,6 @@ Store::metadata() const const Contacts& Store::contacts() const { - LOCKED; return priv_->contacts_; } @@ -1045,32 +1045,11 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc) contacts.add(Mu::ContactInfo(contact->full_address, contact->email, contact->name ? contact->name : "", - msgdoc->_personal, mu_msg_get_date(msgdoc->_msg))); } return TRUE; } - - -static gboolean -each_contact_check_if_personal (MuMsgContact *contact, MsgDoc *msgdoc) -{ - if (msgdoc->_personal || !contact->email) - return TRUE; - - for (const auto& cur : *msgdoc->_my_addresses) { - if (g_ascii_strcasecmp - (contact->email, - (const char*)cur.c_str()) == 0) { - msgdoc->_personal = TRUE; - break; - } - } - - return TRUE; -} - static Xapian::Document new_doc_from_message (MuStore *store, MuMsg *msg) { @@ -1079,17 +1058,20 @@ new_doc_from_message (MuStore *store, MuMsg *msg) mu_msg_field_foreach ((MuMsgFieldForeachFunc)add_terms_values, &docinfo); - /* determine whether this is 'personal' email, ie. 
one of my - * e-mail addresses is explicitly mentioned -- it's not a - * mailing list message. Callback will update docinfo->_personal */ - const auto& personal_addresses = self(store)->metadata().personal_addresses; - if (personal_addresses.size()) { - docinfo._my_addresses = &personal_addresses; - mu_msg_contact_foreach - (msg, - (MuMsgContactForeachFunc)each_contact_check_if_personal, - &docinfo); - } + mu_msg_contact_foreach + (msg, [](auto contact, gpointer msgdocptr)->gboolean { + auto msgdoc{reinterpret_cast(msgdocptr)}; + + if (!contact->email) + return FALSE; // invalid contact + else if (msgdoc->_personal) + return TRUE; // already deemed personal + + if (msgdoc->_store->contacts().is_personal(contact->email)) + msgdoc->_personal = true; // this one's personal. + + return TRUE; + }, &docinfo); /* also store the contact-info as separate terms, and add it * to the cache */ diff --git a/lib/mu-store.hh b/lib/mu-store.hh index 76413520..17942989 100644 --- a/lib/mu-store.hh +++ b/lib/mu-store.hh @@ -96,8 +96,6 @@ public: * @return the metadata */ const Metadata& metadata() const; - - /** * Get the Contacts object for this store * @@ -105,7 +103,6 @@ public: */ const Contacts& contacts() const; - /** * Get the Indexer associated with this store. It is an error * to call this on a read-only store. 
@@ -177,7 +174,6 @@ public: */ bool contains_message (const std::string& path) const; - /** * Prototype for the ForEachFunc * diff --git a/lib/test-mu-contacts.cc b/lib/test-mu-contacts.cc index 141f8b94..0d598c28 100644 --- a/lib/test-mu-contacts.cc +++ b/lib/test-mu-contacts.cc @@ -33,25 +33,21 @@ test_mu_contacts_01() g_assert_cmpuint (contacts.size(), ==, 0); contacts.add(std::move(Mu::ContactInfo ("Foo ", - "foo.bar@example.com", "Foo", - false, 12345))); + "foo.bar@example.com", "Foo", 12345))); g_assert_false (contacts.empty()); g_assert_cmpuint (contacts.size(), ==, 1); contacts.add(std::move(Mu::ContactInfo ("Cuux ", - "cuux@example.com", "Cuux", true, - 54321))); + "cuux@example.com", "Cuux", 54321))); g_assert_cmpuint (contacts.size(), ==, 2); contacts.add(std::move(Mu::ContactInfo ("foo.bar@example.com", - "foo.bar@example.com", "Foo", - false, 77777))); + "foo.bar@example.com", "Foo", 77777))); g_assert_cmpuint (contacts.size(), ==, 2); contacts.add(std::move(Mu::ContactInfo ("Foo.Bar@Example.Com", - "Foo.Bar@Example.Com", "Foo", - false, 88888))); + "Foo.Bar@Example.Com", "Foo", 88888))); g_assert_cmpuint (contacts.size(), ==, 2); // note: replaces first. 
@@ -60,7 +56,6 @@ test_mu_contacts_01() g_assert_false (info); } - { const auto info = contacts._find("foo.BAR@example.com"); g_assert_true (info); @@ -73,6 +68,27 @@ test_mu_contacts_01() g_assert_cmpuint (contacts.size(), ==, 0); } +static void +test_mu_contacts_02() +{ + Mu::StringVec personal = { + "foo@example.com", + "bar@cuux.org", + "/bar-.*@fnorb.f./" + }; + Mu::Contacts contacts{"", personal}; + + g_assert_true (contacts.is_personal("foo@example.com")); + g_assert_true (contacts.is_personal("Bar@CuuX.orG")); + g_assert_true (contacts.is_personal("bar-123abc@fnorb.fi")); + g_assert_true (contacts.is_personal("bar-zzz@fnorb.fr")); + + g_assert_false (contacts.is_personal("foo@bar.com")); + g_assert_false (contacts.is_personal("BÂr@CuuX.orG")); + g_assert_false (contacts.is_personal("bar@fnorb.fi")); + g_assert_false (contacts.is_personal("bar-zzz@fnorb.xr")); +} + int @@ -81,6 +97,7 @@ main (int argc, char *argv[]) g_test_init (&argc, &argv, NULL); g_test_add_func ("/mu-contacts/01", test_mu_contacts_01); + g_test_add_func ("/mu-contacts/02", test_mu_contacts_02); g_log_set_handler (NULL, (GLogLevelFlags) diff --git a/lib/test-mu-store.cc b/lib/test-mu-store.cc index cdafd5fe..9b1a1d24 100644 --- a/lib/test-mu-store.cc +++ b/lib/test-mu-store.cc @@ -79,7 +79,6 @@ test_store_add_count_remove () } - int main (int argc, char *argv[]) { diff --git a/man/mu-init.1 b/man/mu-init.1 index 34f95d8d..66aea7f0 100644 --- a/man/mu-init.1 +++ b/man/mu-init.1 @@ -1,4 +1,4 @@ -.TH MU-INIT 1 "February 2020" "User Manuals" +.TH MU-INIT 1 "October 2020" "User Manuals" .SH NAME @@ -10,13 +10,14 @@ mu init \- initialize the mu message database .SH DESCRIPTION -\fBmu init\fR is the \fBmu\fR command for setting up the mu message -database. After \fBmu init\fR has completed, you can run \fBmu index\fR +\fBmu init\fR is the subcommand for setting up the mu message +database. 
After \fBmu init\fR has completed, you can run \fBmu +index\fR .SH OPTIONS -Note, some of the general options are described in the \fBmu(1)\fR man-page and -not here, as they apply to multiple mu commands. +Note, some of the general options are described in the \fBmu(1)\fR +man-page and not here, as they apply to multiple mu commands. .TP \fB\-\-muhome\fR @@ -34,7 +35,6 @@ are not supported. .TP \fB\-\-my-address\fR=\fI\fR - specifies that some e-mail addresses are 'my-address' (\fB\-\-my-address\fR can be used multiple times). This is used by \fBmu cfind\fR -- any e-mail address found in the address fields of a message which also has \fI\fR @@ -42,6 +42,10 @@ in one of its address fields is considered a \fIpersonal\fR e-mail address. This allows you, for example, to filter out (\fBmu cfind --personal\fR) addresses which were merely seen in mailing list messages. +\fI\fR can be either a plain e-mail address (such as +\fBfoo@example.com\fR), or a regular-expression (of the 'Basic POSIX' +flavor), wrapped in \fB/\fR (such as \fB/foo-.*@example\\.com/\fR). + .SH ENVIRONMENT \fBmu init\fR uses \fBMAILDIR\fR to find the user's Maildir if it has not been