1
0
mirror of https://github.com/djcb/mu.git synced 2024-06-20 06:46:50 +02:00

tokenizer: clean unicode-aware

This commit is contained in:
djcb 2017-10-28 14:13:09 +03:00
parent 0e5e8b6bce
commit 55ffb524db
3 changed files with 40 additions and 18 deletions

View File

@ -18,6 +18,8 @@
*/
#include "tokenizer.hh"
#include "utils.hh"
#include <cctype>
#include <iostream>
#include <algorithm>
@ -113,29 +115,12 @@ eat_token (std::string& food, size_t& pos)
}
/**
 * Normalize a string before tokenizing: turn every ASCII control character
 * (any byte below 0x20) into a plain space, then strip leading and trailing
 * spaces.
 *
 * Bytes >= 0x80 are left alone, so multi-byte UTF-8 sequences survive intact.
 *
 * @param dirty the string to clean up
 *
 * @return the cleaned-up copy of @p dirty
 */
static std::string
cleanup (const std::string& dirty)
{
	auto clean = dirty;

	// only accept spc as whitespace. Compare as unsigned char: where plain
	// char is signed, bytes >= 0x80 are negative and would otherwise be
	// mistaken for control characters and blanked out.
	for (auto& c : clean)
		if (static_cast<unsigned char>(c) < ' ')
			c = ' ';

	clean.erase (0, clean.find_first_not_of(" ")); // remove leading space
	// find_last_not_of yields npos for an all-space string; npos + 1 wraps to
	// 0, so the whole string is erased in that case.
	clean.erase (clean.find_last_not_of(" ") + 1); // remove trailing space

	return clean;
}
Mux::Tokens
Mux::tokenize (const std::string& s)
{
Tokens tokens{};
std::string food = cleanup(s);
std::string food = utf8_clean(s);
size_t pos{0};
if (s.empty())

View File

@ -110,6 +110,32 @@ Mux::utf8_flatten (const std::string& str)
}
/**
 * Replace every Unicode control character in a UTF-8 string with a single
 * space, then strip leading and trailing spaces.
 *
 * Processing stops at the first NUL byte in @p dirty. Bytes that do not form
 * a valid UTF-8 sequence are each replaced by a space as well, so the input
 * does not have to be validated beforehand.
 *
 * @param dirty an unclean string, expected to be UTF-8 encoded
 *
 * @return a cleaned-up string.
 */
std::string
Mux::utf8_clean (const std::string& dirty)
{
	GString *gstr = g_string_sized_new (dirty.length());

	const char *cur       = dirty.c_str();
	const char *const end = cur + dirty.length();

	// stop at the end of the buffer or at the first NUL byte, whichever
	// comes first
	while (cur < end && *cur) {
		// Decode with bounds and validity checking: the unchecked
		// g_utf8_get_char / g_utf8_next_char pair requires valid UTF-8
		// and would step past the terminator on a truncated trailing
		// multi-byte sequence.
		const gunichar uc = g_utf8_get_char_validated (cur, end - cur);

		if (uc == static_cast<gunichar>(-1) || uc == static_cast<gunichar>(-2)) {
			// invalid or incomplete sequence: blank out this one
			// byte and resynchronize on the next one
			g_string_append_c (gstr, ' ');
			++cur;
			continue;
		}

		if (g_unichar_iscntrl (uc))
			g_string_append_c (gstr, ' ');
		else
			g_string_append_unichar (gstr, uc);

		// safe here: the sequence starting at cur was just validated
		cur = g_utf8_next_char (cur);
	}

	std::string clean(gstr->str, gstr->len);
	g_string_free (gstr, TRUE);

	clean.erase (0, clean.find_first_not_of(" ")); // remove leading space
	// npos + 1 wraps to 0, so an all-space string is erased completely
	clean.erase (clean.find_last_not_of(" ") + 1); // remove trailing space

	return clean;
}
std::vector<std::string>
Mux::split (const std::string& str, const std::string& sepa)
{

View File

@ -34,6 +34,17 @@ namespace Mux {
*/
std::string utf8_flatten (const std::string& str);
/**
 * Replace all control characters with spaces, and remove leading and trailing space.
 *
 * Each Unicode control character (this includes tab, CR and LF) is replaced
 * by exactly one space character; consecutive spaces are not collapsed.
 * Only the plain space character is stripped from the start and the end of
 * the result. The input is processed up to its first NUL byte.
 *
 * @param dirty an unclean string, expected to be UTF-8 encoded
 *
 * @return a cleaned-up string.
 */
std::string utf8_clean (const std::string& dirty);
/**
* Split a string in parts
*