diff --git a/lib/utils/mu-unbroken.hh b/lib/utils/mu-unbroken.hh new file mode 100644 index 00000000..c1b1d6bc --- /dev/null +++ b/lib/utils/mu-unbroken.hh @@ -0,0 +1,127 @@ +// borrowed from Xapian; slightly adapted + +/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) + * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) + * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) + * Copyright (c) 2011,2018,2019,2023 Olly Betts + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef MU_UNBROKEN_HH__ +#define MU_UNBROKEN_HH__ + +#include <algorithm> +#include <iterator> + +/** + * Does unichar p belong to a script without explicit word separators? 
/**
 * Does unichar p belong to a script without explicit word separators?
 *
 * The lookup is a binary search over a sorted table holding the last
 * codepoint of each consecutive range; ranges alternate between "not an
 * unbroken script" (even table index) and "unbroken script" (odd index).
 *
 * The search is written out by hand instead of using std::lower_bound,
 * because std::lower_bound only became constexpr in C++20; this way the
 * function is genuinely usable in constant expressions under C++17 too.
 *
 * @param p a Unicode codepoint
 *
 * @return true if p falls in one of the listed ranges, false otherwise
 */
constexpr bool
is_unbroken_script(unsigned p)
{
	// Array containing the last value in each range of codepoints which
	// are either all in scripts which are written without explicit word
	// breaks, or all not in such scripts.
	//
	// We only include scripts here which ICU has dictionaries for.  The
	// same list is currently also used to decide which languages to do
	// ngrams for, though perhaps that should use a separate list.
	constexpr unsigned splits[] = {
		// 0E00..0E7F; Thai, Lanna Tai, Pali
		// 0E80..0EFF; Lao
		0x0E00 - 1, 0x0EFF,
		// 1000..109F; Myanmar (Burmese)
		0x1000 - 1, 0x109F,
		// 1100..11FF; Hangul Jamo
		0x1100 - 1, 0x11FF,
		// 1780..17FF; Khmer
		0x1780 - 1, 0x17FF,
		// 19E0..19FF; Khmer Symbols
		0x19E0 - 1, 0x19FF,
		// 2E80..2EFF; CJK Radicals Supplement
		// 2F00..2FDF; Kangxi Radicals
		// 2FE0..2FFF; Ideographic Description Characters
		// 3000..303F; CJK Symbols and Punctuation
		// 3040..309F; Hiragana
		// 30A0..30FF; Katakana
		// 3100..312F; Bopomofo
		// 3130..318F; Hangul Compatibility Jamo
		// 3190..319F; Kanbun
		// 31A0..31BF; Bopomofo Extended
		// 31C0..31EF; CJK Strokes
		// 31F0..31FF; Katakana Phonetic Extensions
		// 3200..32FF; Enclosed CJK Letters and Months
		// 3300..33FF; CJK Compatibility
		// 3400..4DBF; CJK Unified Ideographs Extension A
		// 4DC0..4DFF; Yijing Hexagram Symbols
		// 4E00..9FFF; CJK Unified Ideographs
		0x2E80 - 1, 0x9FFF,
		// A700..A71F; Modifier Tone Letters
		0xA700 - 1, 0xA71F,
		// A960..A97F; Hangul Jamo Extended-A
		0xA960 - 1, 0xA97F,
		// A9E0..A9FF; Myanmar Extended-B (Burmese)
		0xA9E0 - 1, 0xA9FF,
		// AA60..AA7F; Myanmar Extended-A (Burmese)
		0xAA60 - 1, 0xAA7F,
		// AC00..D7AF; Hangul Syllables
		// D7B0..D7FF; Hangul Jamo Extended-B
		0xAC00 - 1, 0xD7FF,
		// F900..FAFF; CJK Compatibility Ideographs
		0xF900 - 1, 0xFAFF,
		// FE30..FE4F; CJK Compatibility Forms
		0xFE30 - 1, 0xFE4F,
		// FF00..FFEF; Halfwidth and Fullwidth Forms
		0xFF00 - 1, 0xFFEF,
		// 1AFF0..1AFFF; Kana Extended-B
		// 1B000..1B0FF; Kana Supplement
		// 1B100..1B12F; Kana Extended-A
		// 1B130..1B16F; Small Kana Extension
		0x1AFF0 - 1, 0x1B16F,
		// 1F200..1F2FF; Enclosed Ideographic Supplement
		0x1F200 - 1, 0x1F2FF,
		// 20000..2A6DF; CJK Unified Ideographs Extension B
		0x20000 - 1, 0x2A6DF,
		// 2A700..2B73F; CJK Unified Ideographs Extension C
		// 2B740..2B81F; CJK Unified Ideographs Extension D
		// 2B820..2CEAF; CJK Unified Ideographs Extension E
		// 2CEB0..2EBEF; CJK Unified Ideographs Extension F
		0x2A700 - 1, 0x2EBEF,
		// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
		0x2F800 - 1, 0x2FA1F,
		// 30000..3134F; CJK Unified Ideographs Extension G
		// 31350..323AF; CJK Unified Ideographs Extension H
		0x30000 - 1, 0x323AF
	};

	// Binary chop to find the index of the first entry which is >= p
	// (the number of entries if there is none).
	unsigned lo = 0;
	unsigned hi = sizeof(splits) / sizeof(splits[0]);
	while (lo < hi) {
		const unsigned mid = lo + (hi - lo) / 2;
		if (splits[mid] < p)
			lo = mid + 1;
		else
			hi = mid;
	}

	// If it's an odd offset then the codepoint is in a script which needs
	// splitting; if it's an even offset then it's not.
	return (lo & 1u) != 0;
}
+ auto it = std::lower_bound(std::begin(splits), + std::end(splits), p); + + return ((it - splits) & 1); +} + + +#endif /* MU_UNBROKEN_HH__ */ diff --git a/lib/utils/mu-utils.cc b/lib/utils/mu-utils.cc index 1702ddca..e60b65ff 100644 --- a/lib/utils/mu-utils.cc +++ b/lib/utils/mu-utils.cc @@ -44,6 +44,8 @@ #include #include "mu-utils.hh" +#include "mu-unbroken.hh" + #include "mu-error.hh" #include "mu-option.hh" @@ -112,12 +114,28 @@ gx_utf8_flatten(const gchar* str, gssize len) } // namespace +bool +Mu::contains_unbroken_script(const char *str) +{ + while (str && *str) { + auto uc = g_utf8_get_char(str); + if (is_unbroken_script(uc)) + return true; + str = g_utf8_next_char(str); + } + + return false; +} + std::string // gx_utf8_flatten Mu::utf8_flatten(const char* str) { if (!str) return {}; + if (contains_unbroken_script(str)) + return std::string{str}; + // the pure-ascii case if (g_str_is_ascii(str)) { auto l = g_ascii_strdown(str, -1); diff --git a/lib/utils/mu-utils.hh b/lib/utils/mu-utils.hh index 73f32dfd..6ca0e85f 100644 --- a/lib/utils/mu-utils.hh +++ b/lib/utils/mu-utils.hh @@ -154,7 +154,19 @@ std::tm mu_time(T t={}, bool use_utc=false) { using StringVec = std::vector; /** - * Flatten a string -- downcase and fold diacritics etc. + * Does the string contain script without explicit word separators? + * + * @param str a string + * + * @return true or false + */ +bool contains_unbroken_script(const char* str); +static inline bool contains_unbroken_script(const std::string& str) { + return contains_unbroken_script(str.c_str()); +} + +/** + * Flatten a string -- down-case and fold diacritics. 
* * @param str a string * diff --git a/lib/utils/tests/test-utils.cc b/lib/utils/tests/test-utils.cc index 56bb95a5..f0e98412 100644 --- a/lib/utils/tests/test-utils.cc +++ b/lib/utils/tests/test-utils.cc @@ -45,14 +45,8 @@ test_cases(const CaseVec& cases, ProcFunc proc) { for (const auto& casus : cases) { const auto res = proc(casus.expr, casus.is_first); - if (g_test_verbose()) { - std::cout << "\n"; - std::cout << casus.expr << ' ' << casus.is_first << std::endl; - std::cout << "exp: '" << casus.expected << "'" << std::endl; - std::cout << "got: '" << res << "'" << std::endl; - } - - g_assert_true(casus.expected == res); + //mu_println("'{}'\n'{}'", casus.expected, res); + assert_equal(casus.expected, res); } } @@ -161,6 +155,8 @@ test_flatten() {"Менделе́ев", true, "менделеев"}, {"", false, ""}, {"Ångström", true, "angstrom"}, + // don't touch combining characters in CJK etc. + {"スポンサーシップ募集",true, "スポンサーシップ募集"} }; test_cases(cases, [](auto s, auto f) { return utf8_flatten(s); });