utils: handle "unbroken" scripts

Do not removing combining characters from scripts without explicit word boundaries, such as those for CJK. Reuse some Xapian code for that.
2023-09-09 11:37:53 +03:00 · 2023-09-09 11:37:53 +03:00 · 9c28c65d45
parent 080cf43b4a
commit 9c28c65d45
4 changed files with 162 additions and 9 deletions
--- a/lib/utils/mu-unbroken.hh
+++ b/lib/utils/mu-unbroken.hh
@ -0,0 +1,127 @@
+// borrowed from Xapian; slightly adapted
+
+/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
+ * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
+ * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
+ * Copyright (c) 2011,2018,2019,2023 Olly Betts
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef MU_UNBROKEN_HH__
+#define MU_UNBROKEN_HH__
+
+#include <algorithm>
+#include <iterator>
+
+/**
+ * Does unichar p belong to a script without explicit word separators?
+ *
+ * @param p
+ *
+ * @return true or false
+ */
+constexpr bool
+is_unbroken_script(unsigned p)
+{
+	// Array containing the last value in each range of codepoints which
+	// are either all in scripts which are written without explicit word
+	// breaks, or all not in such scripts.
+	//
+	// We only include scripts here which ICU has dictionaries for.	 The
+	// same list is currently also used to decide which languages to do
+	// ngrams for, though perhaps that should use a separate list.
+	constexpr unsigned splits[] = {
+		// 0E00..0E7F; Thai, Lanna Tai, Pali
+		// 0E80..0EFF; Lao
+		0x0E00 - 1, 0x0EFF,
+		// 1000..109F; Myanmar (Burmese)
+		0x1000 - 1, 0x109F,
+		// 1100..11FF; Hangul Jamo
+		0x1100 - 1, 0x11FF,
+		// 1780..17FF; Khmer
+		0x1780 - 1, 0x17FF,
+		// 19E0..19FF; Khmer Symbols
+		0x19E0 - 1, 0x19FF,
+		// 2E80..2EFF; CJK Radicals Supplement
+		// 2F00..2FDF; Kangxi Radicals
+		// 2FE0..2FFF; Ideographic Description Characters
+		// 3000..303F; CJK Symbols and Punctuation
+		// 3040..309F; Hiragana
+		// 30A0..30FF; Katakana
+		// 3100..312F; Bopomofo
+		// 3130..318F; Hangul Compatibility Jamo
+		// 3190..319F; Kanbun
+		// 31A0..31BF; Bopomofo Extended
+		// 31C0..31EF; CJK Strokes
+		// 31F0..31FF; Katakana Phonetic Extensions
+		// 3200..32FF; Enclosed CJK Letters and Months
+		// 3300..33FF; CJK Compatibility
+		// 3400..4DBF; CJK Unified Ideographs Extension A
+		// 4DC0..4DFF; Yijing Hexagram Symbols
+		// 4E00..9FFF; CJK Unified Ideographs
+		0x2E80 - 1, 0x9FFF,
+		// A700..A71F; Modifier Tone Letters
+		0xA700 - 1, 0xA71F,
+		// A960..A97F; Hangul Jamo Extended-A
+		0xA960 - 1, 0xA97F,
+		// A9E0..A9FF; Myanmar Extended-B (Burmese)
+		0xA9E0 - 1, 0xA9FF,
+		// AA60..AA7F; Myanmar Extended-A (Burmese)
+		0xAA60 - 1, 0xAA7F,
+		// AC00..D7AF; Hangul Syllables
+		// D7B0..D7FF; Hangul Jamo Extended-B
+		0xAC00 - 1, 0xD7FF,
+		// F900..FAFF; CJK Compatibility Ideographs
+		0xF900 - 1, 0xFAFF,
+		// FE30..FE4F; CJK Compatibility Forms
+		0xFE30 - 1, 0xFE4F,
+		// FF00..FFEF; Halfwidth and Fullwidth Forms
+		0xFF00 - 1, 0xFFEF,
+		// 1AFF0..1AFFF; Kana Extended-B
+		// 1B000..1B0FF; Kana Supplement
+		// 1B100..1B12F; Kana Extended-A
+		// 1B130..1B16F; Small Kana Extension
+		0x1AFF0 - 1, 0x1B16F,
+		// 1F200..1F2FF; Enclosed Ideographic Supplement
+		0x1F200 - 1, 0x1F2FF,
+		// 20000..2A6DF; CJK Unified Ideographs Extension B
+		0x20000 - 1, 0x2A6DF,
+		// 2A700..2B73F; CJK Unified Ideographs Extension C
+		// 2B740..2B81F; CJK Unified Ideographs Extension D
+		// 2B820..2CEAF; CJK Unified Ideographs Extension E
+		// 2CEB0..2EBEF; CJK Unified Ideographs Extension F
+		0x2A700 - 1, 0x2EBEF,
+		// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
+		0x2F800 - 1, 0x2FA1F,
+		// 30000..3134F; CJK Unified Ideographs Extension G
+		// 31350..323AF; CJK Unified Ideographs Extension H
+		0x30000 - 1, 0x323AF
+	};
+	// Binary chop to find the first entry which is >= p.  If it's an odd
+	// offset then the codepoint is in a script which needs splitting; if it's
+	// an even offset then it's not.
+	auto it = std::lower_bound(std::begin(splits),
+				   std::end(splits), p);
+
+	return ((it - splits) & 1);
+}
+
+
+#endif /* MU_UNBROKEN_HH__ */
--- a/lib/utils/mu-utils.cc
+++ b/lib/utils/mu-utils.cc
@ -44,6 +44,8 @@
 #include <glib/gprintf.h>

 #include "mu-utils.hh"
+#include "mu-unbroken.hh"
+
 #include "mu-error.hh"
 #include "mu-option.hh"

@ -112,12 +114,28 @@ gx_utf8_flatten(const gchar* str, gssize len)

 } // namespace

+bool
+Mu::contains_unbroken_script(const char *str)
+{
+	while (str && *str) {
+		auto uc = g_utf8_get_char(str);
+		if (is_unbroken_script(uc))
+			return true;
+		str = g_utf8_next_char(str);
+	}
+
+	return false;
+}
+
 std::string // gx_utf8_flatten
 Mu::utf8_flatten(const char* str)
 {
 	if (!str)
 		return {};

+	if (contains_unbroken_script(str))
+		return std::string{str};
+
 	// the pure-ascii case
 	if (g_str_is_ascii(str)) {
 		auto    l = g_ascii_strdown(str, -1);
--- a/lib/utils/mu-utils.hh
+++ b/lib/utils/mu-utils.hh
@ -154,7 +154,19 @@ std::tm mu_time(T t={}, bool use_utc=false) {
 using StringVec = std::vector<std::string>;

 /**
- * Flatten a string -- downcase and fold diacritics etc.
+ * Does the string contain script without explicit word separators?
+ *
+ * @param str a string
+ *
+ * @return true or false
+ */
+bool contains_unbroken_script(const char* str);
+static inline bool contains_unbroken_script(const std::string& str) {
+	return contains_unbroken_script(str.c_str());
+}
+
+/**
+ * Flatten a string -- down-case and fold diacritics.
 *
 * @param str a string
 *
--- a/lib/utils/tests/test-utils.cc
+++ b/lib/utils/tests/test-utils.cc
@ -45,14 +45,8 @@ test_cases(const CaseVec& cases, ProcFunc proc)
 {
 	for (const auto& casus : cases) {
 		const auto res = proc(casus.expr, casus.is_first);
-		if (g_test_verbose()) {
-			std::cout << "\n";
-			std::cout << casus.expr << ' ' << casus.is_first << std::endl;
-			std::cout << "exp: '" << casus.expected << "'" << std::endl;
-			std::cout << "got: '" << res << "'" << std::endl;
-		}
-
-		g_assert_true(casus.expected == res);
+		//mu_println("'{}'\n'{}'", casus.expected, res);
+		assert_equal(casus.expected, res);
 	}
 }

@ -161,6 +155,8 @@ test_flatten()
 	    {"Менделе́ев", true, "менделеев"},
 	    {"", false, ""},
 	    {"Ångström", true, "angstrom"},
+	    // don't touch combining characters in CJK etc.
+	    {"スポンサーシップ募集",true, "スポンサーシップ募集"}
 	};

 	test_cases(cases, [](auto s, auto f) { return utf8_flatten(s); });