/* MIT License * * Copyright (c) 2025 Tyge Løvset * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef STC_UCD_PRV_C_INCLUDED #define STC_UCD_PRV_C_INCLUDED #include // ------------------------------------------------------ // The following requires linking with utf8 symbols. // To call them, either define i_import before including // one of cstr, csview, zsview, or link with src/libstc.o. enum { U8G_Cc, U8G_Lt, U8G_Nd, U8G_Nl, U8G_Pc, U8G_Pd, U8G_Pf, U8G_Pi, U8G_Sc, U8G_Zl, U8G_Zp, U8G_Zs, U8G_Arabic, U8G_Bengali, U8G_Cyrillic, U8G_Devanagari, U8G_Georgian, U8G_Greek, U8G_Han, U8G_Hiragana, U8G_Katakana, U8G_Latin, U8G_Thai, U8G_SIZE }; static bool utf8_isgroup(int group, uint32_t c); static bool utf8_isalpha(uint32_t c) { static int16_t groups[] = {U8G_Latin, U8G_Nl, U8G_Cyrillic, U8G_Han, U8G_Devanagari, U8G_Arabic, U8G_Bengali, U8G_Hiragana, U8G_Katakana, U8G_Thai, U8G_Greek, U8G_Georgian}; if (c < 128) return isalpha((int)c) != 0; for (int j=0; j < (int)(sizeof groups/sizeof groups[0]); ++j) if (utf8_isgroup(groups[j], c)) return true; return false; } static bool utf8_iscased(uint32_t c) { if (c < 128) return isalpha((int)c) != 0; return utf8_islower(c) || utf8_isupper(c) || utf8_isgroup(U8G_Lt, c); } static bool utf8_isalnum(uint32_t c) { if (c < 128) return isalnum((int)c) != 0; return utf8_isalpha(c) || utf8_isgroup(U8G_Nd, c); } static bool utf8_isword(uint32_t c) { if (c < 128) return (isalnum((int)c) != 0) | (c == '_'); return utf8_isalpha(c) || utf8_isgroup(U8G_Nd, c) || utf8_isgroup(U8G_Pc, c); } static bool utf8_isblank(uint32_t c) { if (c < 128) return (c == ' ') | (c == '\t'); return utf8_isgroup(U8G_Zs, c); } static bool utf8_isspace(uint32_t c) { if (c < 128) return isspace((int)c) != 0; return ((c == 8232) | (c == 8233)) || utf8_isgroup(U8G_Zs, c); } /* The tables below are extracted from the RE2 library */ typedef struct { uint16_t lo; uint16_t hi; } URange16; static const URange16 Cc_range16[] = { // Control { 0, 31 }, { 127, 159 }, }; static const URange16 Lt_range16[] = { // Title case { 453, 453 }, { 456, 456 }, { 459, 459 }, { 498, 498 }, { 8072, 8079 }, { 8088, 8095 }, { 8104, 8111 }, { 8124, 8124 }, { 8140, 8140 }, { 8188, 8188 }, }; static const URange16 Nd_range16[] = { // Decimal number { 48, 57 }, { 1632, 1641 }, { 1776, 1785 }, { 1984, 1993 }, { 2406, 2415 }, { 2534, 2543 }, { 2662, 2671 }, { 2790, 2799 }, { 2918, 2927 }, { 3046, 3055 }, { 3174, 3183 }, { 3302, 3311 }, { 3430, 3439 }, { 3558, 3567 }, { 3664, 3673 }, { 3792, 3801 }, { 3872, 3881 }, { 4160, 4169 }, { 4240, 4249 }, { 6112, 6121 }, { 6160, 6169 }, { 6470, 6479 }, { 6608, 6617 }, { 6784, 6793 }, { 6800, 6809 }, { 6992, 7001 }, { 7088, 7097 }, { 7232, 7241 }, { 7248, 7257 }, { 42528, 42537 }, { 43216, 43225 }, { 43264, 43273 }, { 43472, 43481 }, { 43504, 43513 }, { 43600, 43609 }, { 44016, 44025 }, { 65296, 65305 }, }; static const URange16 Nl_range16[] = { // Number letter { 5870, 5872 }, { 8544, 8578 }, { 8581, 8584 }, { 12295, 12295 }, { 12321, 12329 }, { 12344, 12346 }, { 42726, 42735 }, }; static const URange16 Pc_range16[] = { // Connector punctuation { 95, 95 }, { 8255, 8256 }, { 8276, 8276 }, { 65075, 65076 }, { 65101, 65103 }, { 65343, 65343 }, }; static const URange16 Pd_range16[] = { // Dash punctuation { 45, 45 }, { 1418, 1418 }, { 1470, 1470 }, { 5120, 5120 }, { 6150, 6150 }, { 8208, 8213 }, { 11799, 11799 }, { 11802, 11802 }, { 11834, 11835 }, { 11840, 11840 }, { 11869, 11869 }, { 12316, 12316 }, { 12336, 12336 }, { 12448, 12448 }, { 65073, 65074 }, { 65112, 65112 }, { 65123, 65123 }, { 65293, 65293 }, }; static const URange16 Pf_range16[] = { // Final punctuation { 187, 187 }, { 8217, 8217 }, { 8221, 8221 }, { 8250, 8250 }, { 11779, 11779 }, { 11781, 11781 }, { 11786, 11786 }, { 11789, 11789 }, { 11805, 11805 }, { 11809, 11809 }, }; static const URange16 Pi_range16[] = { // Initial punctuation { 171, 171 }, { 8216, 8216 }, { 8219, 8220 }, { 8223, 8223 }, { 8249, 8249 }, { 11778, 11778 }, { 11780, 11780 }, { 11785, 11785 }, { 11788, 11788 }, { 11804, 11804 }, { 11808, 11808 }, }; static const URange16 Sc_range16[] = { // Currency symbol { 36, 36 }, { 162, 165 }, { 1423, 1423 }, { 1547, 1547 }, { 2046, 2047 }, { 2546, 2547 }, { 2555, 2555 }, { 2801, 2801 }, { 3065, 3065 }, { 3647, 3647 }, { 6107, 6107 }, { 8352, 8384 }, { 43064, 43064 }, { 65020, 65020 }, { 65129, 65129 }, { 65284, 65284 }, { 65504, 65505 }, { 65509, 65510 }, }; static const URange16 Zl_range16[] = { // Line separator { 8232, 8232 }, }; static const URange16 Zp_range16[] = { // Paragraph separator { 8233, 8233 }, }; static const URange16 Zs_range16[] = { // Space separator { 32, 32 }, { 160, 160 }, { 5760, 5760 }, { 8192, 8202 }, { 8239, 8239 }, { 8287, 8287 }, { 12288, 12288 }, }; static const URange16 Arabic_range16[] = { { 1536, 1540 }, { 1542, 1547 }, { 1549, 1562 }, { 1564, 1566 }, { 1568, 1599 }, { 1601, 1610 }, { 1622, 1647 }, { 1649, 1756 }, { 1758, 1791 }, { 1872, 1919 }, { 2160, 2190 }, { 2192, 2193 }, { 2200, 2273 }, { 2275, 2303 }, { 64336, 64450 }, { 64467, 64829 }, { 64832, 64911 }, { 64914, 64967 }, { 64975, 64975 }, { 65008, 65023 }, { 65136, 65140 }, { 65142, 65276 }, }; static const URange16 Bengali_range16[] = { { 2432, 2435 }, { 2437, 2444 }, { 2447, 2448 }, { 2451, 2472 }, { 2474, 2480 }, { 2482, 2482 }, { 2486, 2489 }, { 2492, 2500 }, { 2503, 2504 }, { 2507, 2510 }, { 2519, 2519 }, { 2524, 2525 }, { 2527, 2531 }, { 2534, 2558 }, }; static const URange16 Cyrillic_range16[] = { { 1024, 1156 }, { 1159, 1327 }, { 7296, 7304 }, { 7467, 7467 }, { 7544, 7544 }, { 11744, 11775 }, { 42560, 42655 }, { 65070, 65071 }, }; static const URange16 Devanagari_range16[] = { { 2304, 2384 }, { 2389, 2403 }, { 2406, 2431 }, { 43232, 43263 }, }; static const URange16 Georgian_range16[] = { { 4256, 4293 }, { 4295, 4295 }, { 4301, 4301 }, { 4304, 4346 }, { 4348, 4351 }, { 7312, 7354 }, { 7357, 7359 }, { 11520, 11557 }, { 11559, 11559 }, { 11565, 11565 }, }; static const URange16 Greek_range16[] = { { 880, 883 }, { 885, 887 }, { 890, 893 }, { 895, 895 }, { 900, 900 }, { 902, 902 }, { 904, 906 }, { 908, 908 }, { 910, 929 }, { 931, 993 }, { 1008, 1023 }, { 7462, 7466 }, { 7517, 7521 }, { 7526, 7530 }, { 7615, 7615 }, { 7936, 7957 }, { 7960, 7965 }, { 7968, 8005 }, { 8008, 8013 }, { 8016, 8023 }, { 8025, 8025 }, { 8027, 8027 }, { 8029, 8029 }, { 8031, 8061 }, { 8064, 8116 }, { 8118, 8132 }, { 8134, 8147 }, { 8150, 8155 }, { 8157, 8175 }, { 8178, 8180 }, { 8182, 8190 }, { 8486, 8486 }, { 43877, 43877 }, }; static const URange16 Han_range16[] = { { 11904, 11929 }, { 11931, 12019 }, { 12032, 12245 }, { 12293, 12293 }, { 12295, 12295 }, { 12321, 12329 }, { 12344, 12347 }, { 13312, 19903 }, { 19968, 40959 }, { 63744, 64109 }, { 64112, 64217 }, }; static const URange16 Hiragana_range16[] = { { 12353, 12438 }, { 12445, 12447 }, }; static const URange16 Katakana_range16[] = { { 12449, 12538 }, { 12541, 12543 }, { 12784, 12799 }, { 13008, 13054 }, { 13056, 13143 }, { 65382, 65391 }, { 65393, 65437 }, }; static const URange16 Latin_range16[] = { { 65, 90 }, { 97, 122 }, { 170, 170 }, { 186, 186 }, { 192, 214 }, { 216, 246 }, { 248, 696 }, { 736, 740 }, { 7424, 7461 }, { 7468, 7516 }, { 7522, 7525 }, { 7531, 7543 }, { 7545, 7614 }, { 7680, 7935 }, { 8305, 8305 }, { 8319, 8319 }, { 8336, 8348 }, { 8490, 8491 }, { 8498, 8498 }, { 8526, 8526 }, { 8544, 8584 }, { 11360, 11391 }, { 42786, 42887 }, { 42891, 42954 }, { 42960, 42961 }, { 42963, 42963 }, { 42965, 42969 }, { 42994, 43007 }, { 43824, 43866 }, { 43868, 43876 }, { 43878, 43881 }, { 64256, 64262 }, { 65313, 65338 }, { 65345, 65370 }, }; static const URange16 Thai_range16[] = { { 3585, 3642 }, { 3648, 3675 }, }; #ifdef __cplusplus #define _e_arg(k, v) v #else #define _e_arg(k, v) [k] = v #endif #define UNI_ENTRY(Code) { Code##_range16, sizeof(Code##_range16)/sizeof(URange16) } typedef struct { const URange16 *r16; int nr16; } UGroup; static const UGroup _utf8_unicode_groups[U8G_SIZE] = { _e_arg(U8G_Cc, UNI_ENTRY(Cc)), _e_arg(U8G_Lt, UNI_ENTRY(Lt)), _e_arg(U8G_Nd, UNI_ENTRY(Nd)), _e_arg(U8G_Nl, UNI_ENTRY(Nl)), _e_arg(U8G_Pc, UNI_ENTRY(Pc)), _e_arg(U8G_Pd, UNI_ENTRY(Pd)), _e_arg(U8G_Pf, UNI_ENTRY(Pf)), _e_arg(U8G_Pi, UNI_ENTRY(Pi)), _e_arg(U8G_Sc, UNI_ENTRY(Sc)), _e_arg(U8G_Zl, UNI_ENTRY(Zl)), _e_arg(U8G_Zp, UNI_ENTRY(Zp)), _e_arg(U8G_Zs, UNI_ENTRY(Zs)), _e_arg(U8G_Arabic, UNI_ENTRY(Arabic)), _e_arg(U8G_Bengali, UNI_ENTRY(Bengali)), _e_arg(U8G_Cyrillic, UNI_ENTRY(Cyrillic)), _e_arg(U8G_Devanagari, UNI_ENTRY(Devanagari)), _e_arg(U8G_Georgian, UNI_ENTRY(Georgian)), _e_arg(U8G_Greek, UNI_ENTRY(Greek)), _e_arg(U8G_Han, UNI_ENTRY(Han)), _e_arg(U8G_Hiragana, UNI_ENTRY(Hiragana)), _e_arg(U8G_Katakana, UNI_ENTRY(Katakana)), _e_arg(U8G_Latin, UNI_ENTRY(Latin)), _e_arg(U8G_Thai, UNI_ENTRY(Thai)), }; static bool utf8_isgroup(int group, uint32_t c) { for (int j=0; j<_utf8_unicode_groups[group].nr16; ++j) { if (c < _utf8_unicode_groups[group].r16[j].lo) return false; if (c <= _utf8_unicode_groups[group].r16[j].hi) return true; } return false; } #endif // STC_UCD_PRV_C_INCLUDED