impotent/stc/priv/utf8_prv.c
2025-08-31 16:22:38 +03:00

178 lines
6.2 KiB
C

/* MIT License
*
* Copyright (c) 2025 Tyge Løvset
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef STC_UTF8_PRV_C_INCLUDED
#define STC_UTF8_PRV_C_INCLUDED
#include "utf8_tab.c"
/* Lookup table for UTF-8 decoding: 256 + 108 = 364 entries.
 *
 * Part 1 (indices 0..255, eight rows of 32): one entry per byte value,
 * holding a small number in the range 0..11.
 * Part 2 (indices 256..363, rows of 24): values that are all multiples of
 * 12 (0, 12, 24, ... 96).
 *
 * NOTE(review): the consumer (presumably utf8_decode()) is not defined in
 * this file. The values match the table of Bjoern Hoehrmann's DFA-based
 * UTF-8 decoder, where part 1 maps a byte to a character class and part 2
 * maps (state + class) to the next state, with 9 states of 12 classes each.
 * Confirm against the decoder before relying on this description.
 */
const uint8_t utf8_dtab[] = {
/* Part 1, indices 0x00..0x7F: all zero. */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/* Part 1, indices 0x80..0xBF. */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
/* Part 1, indices 0xC0..0xFF. */
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
/* Part 2, indices 256..363: 108 entries, i.e. 9 groups of 12. */
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
};
/* Encode the code point `c` as UTF-8 into `out`.
 *
 * Writes 1 to 4 bytes (no terminator is appended) and returns the number of
 * bytes written. Returns 0 and leaves `out` untouched when `c` is a
 * surrogate (0xD800..0xDFFF) or is 0x110000 or larger.
 * `out` must have room for the bytes written (at most 4).
 */
int utf8_encode(char *out, uint32_t c) {
    int len;
    uint32_t lead;

    /* Values that have no UTF-8 encoding. */
    if (c >= 0x110000U || (c >= 0xD800U && c < 0xE000U))
        return 0;

    /* Pick the sequence length and the marker bits of the first byte. */
    if (c < 0x80U)          { len = 1; lead = 0x00U; }
    else if (c < 0x0800U)   { len = 2; lead = 0xC0U; }
    else if (c < 0x010000U) { len = 3; lead = 0xE0U; }
    else                    { len = 4; lead = 0xF0U; }

    /* Fill the continuation bytes back to front, six payload bits each. */
    for (int i = len - 1; i > 0; --i) {
        out[i] = (char) ((c & 0x3FU) | 0x80U);
        c >>= 6;
    }

    /* Whatever is left of `c` fits in the payload bits of the first byte. */
    out[0] = (char) (c | lead);
    return len;
}
/* Returns utf8_peek() applied to the position that utf8_offset(s, offset)
 * yields.
 * NOTE(review): both helpers are defined outside this file. Presumably
 * `offset` counts code points (not bytes) and the result is the code point
 * decoded at that position - confirm against their definitions.
 */
uint32_t utf8_peek_at(const char* s, isize offset) {
return utf8_peek(utf8_offset(s, offset));
}
/* Check a NUL-terminated string with the UTF-8 decoder.
 *
 * Every byte, including the terminating NUL, is fed to utf8_decode().
 * Scanning stops at the first byte the decoder rejects or right after the
 * NUL has been fed. Returns true when the decoder is in the accepting state
 * at that point, false otherwise.
 */
bool utf8_valid(const char* s) {
    utf8_decode_t d = {.state=0};

    for (;; ++s) {
        /* The byte is decoded before the NUL test, so a NUL that arrives
         * in the middle of a multi-byte sequence is seen by the decoder. */
        if (utf8_decode(&d, (uint8_t)*s) == utf8_REJECT)
            break;
        if (*s == '\0')
            break;
    }
    return d.state == utf8_ACCEPT;
}
/* Check up to `nbytes` bytes of `s` with the UTF-8 decoder.
 *
 * Bytes are fed to utf8_decode() one at a time. Scanning stops when
 * `nbytes` bytes have been consumed, when the decoder rejects a byte, or
 * right after a NUL byte has been fed. Returns true when the decoder is in
 * the accepting state at that point, false otherwise (for instance when
 * the input ends in the middle of a multi-byte sequence).
 * The counter is tested with `!= 0`, so `nbytes` is expected to be
 * non-negative.
 */
bool utf8_valid_n(const char* s, isize nbytes) {
    utf8_decode_t d = {.state=0};

    while (nbytes-- != 0) {
        const char ch = *s++;
        /* Decode first, then test for NUL: the terminator is decoded too. */
        if (utf8_decode(&d, (uint8_t)ch) == utf8_REJECT || ch == '\0')
            break;
    }
    return d.state == utf8_ACCEPT;
}
/* _binsearch(c, at, N, ret): lower-bound search over an ascending sequence.
 *
 *   c   - key to look for; expanded more than once, so it must be free of
 *         side effects (it is parenthesised, so any expression is safe)
 *   at  - function-like macro or function; at(k) yields the k-th key
 *   N   - number of elements; evaluated once and converted to int
 *   ret - lvalue that receives the index of the first element for which
 *         at(k) >= c, or N when every element is smaller than c
 *
 * The probe is deliberately skewed rather than centred: after stepping
 * right the next probe sits at 7/8 of the remaining range, after stepping
 * left at 1/8 of it. The probe offset is always smaller than the remaining
 * length, so at() is only called with indices in [0, N).
 */
#define _binsearch(c, at, N, ret) do { \
    const int _len = (N); \
    int _n = _len, _i = 0, _mid = _n/2; \
    while (_n > 0) { \
        if (at(_i + _mid) < (c)) { \
            /* Key lies to the right of the probe. */ \
            _i += _mid + 1; \
            _n -= _mid + 1; \
            _mid = _n*7/8; \
        } else { \
            /* Key is at the probe or to its left. */ \
            _n = _mid; \
            _mid = _n/8; \
        } \
    } \
    (ret) = (_i >= _len || at(_i) < (c)) ? _len : _i; \
} while (0)
uint32_t utf8_casefold(uint32_t c) {
#define _at_fold(idx) casemappings[idx].c2
int i;
_binsearch(c, _at_fold, casefold_len, i);
if (i < casefold_len && casemappings[i].c1 <= c && c <= casemappings[i].c2) {
const struct CaseMapping entry = casemappings[i];
int d = entry.m2 - entry.c2;
if (d == 1) return c + ((entry.c2 & 1U) == (c & 1U));
return (uint32_t)((int)c + d);
}
return c;
}
/* Map `c` through the `casemappings` entries selected by `upcase_ind`.
 *
 * Searches the entries in `upcase_ind` order for the first one whose `c2`
 * is not below `c`. When `c` lies in that entry's [c1, c2] range it is
 * shifted by (m2 - c2); otherwise `c` is returned unchanged. With an
 * offset of exactly 1 only code points sharing the parity of `c2` are
 * moved (by one).
 * NOTE(review): both tables come from utf8_tab.c, which is not shown here;
 * presumably [c1, c2] holds upper-case code points and m2 the lower-case
 * counterpart of c2 - confirm.
 */
uint32_t utf8_tolower(uint32_t c) {
    #define _at_upper(idx) casemappings[upcase_ind[idx]].c2
    const int count = c_countof(upcase_ind);
    int pos;
    _binsearch(c, _at_upper, count, pos);

    if (pos >= count)
        return c;                       /* every range ends below c */

    const struct CaseMapping entry = casemappings[upcase_ind[pos]];
    if (c < entry.c1 || c > entry.c2)
        return c;                       /* c falls between two ranges */

    const int delta = entry.m2 - entry.c2;
    if (delta != 1)
        return (uint32_t)((int)c + delta);

    /* Offset 1: add one only when c has the same parity as c2. */
    return c + ((entry.c2 & 1U) == (c & 1U));
}
/* Map `c` backwards through the `casemappings` entries selected by
 * `lowcase_ind`.
 *
 * Searches the entries in `lowcase_ind` order for the first one whose `m2`
 * is not below `c`. With d = m2 - c2, the entry matches when `c` lies in
 * [c1 + d, m2]; `c` is then shifted by -d. Otherwise `c` is returned
 * unchanged. With d == 1 only code points sharing the parity of `m2` are
 * moved (down by one).
 * NOTE(review): both tables come from utf8_tab.c, which is not shown here;
 * presumably [c1 + d, m2] is the lower-case image of the range [c1, c2] -
 * confirm.
 */
uint32_t utf8_toupper(uint32_t c) {
    #define _at_lower(idx) casemappings[lowcase_ind[idx]].m2
    const int count = c_countof(lowcase_ind);
    int pos;
    _binsearch(c, _at_lower, count, pos);

    if (pos >= count)
        return c;                       /* every mapped range ends below c */

    const struct CaseMapping entry = casemappings[lowcase_ind[pos]];
    const int delta = entry.m2 - entry.c2;

    /* The lower bound is computed in unsigned arithmetic, so a negative
     * delta wraps around to the intended value. */
    if (c < entry.c1 + (uint32_t)delta || c > entry.m2)
        return c;

    if (delta != 1)
        return (uint32_t)((int)c - delta);

    /* Offset 1: subtract one only when c has the same parity as m2. */
    return c - ((entry.m2 & 1U) == (c & 1U));
}
/* Feed bytes from [s, end) to the decoder until one code point is complete.
 *
 * Precondition: s < end (at least one byte is always read).
 *
 * Returns the number of bytes the caller should advance by:
 *  - when the decoder accepts, the number of bytes consumed;
 *  - when it rejects a byte, or `end` is reached in the middle of a
 *    sequence, the decoder is reset to the accepting state, d->codep is
 *    set to 0xFFFD, and the result is (consumed - 1) if more than two
 *    bytes were consumed, otherwise 1.
 * The result is therefore always at least 1.
 */
int utf8_decode_codepoint(utf8_decode_t* d, const char* s, const char* end) { // s < end
    const char* const first = s;

    for (;;) {
        const uint32_t status = utf8_decode(d, (uint8_t)*s++);
        if (status == utf8_ACCEPT)
            return (int)(s - first);
        /* An input that stops mid-sequence is handled like a rejection. */
        if (status == utf8_REJECT || s == end)
            break;
    }

    /* Recover: report the replacement character and restart the decoder. */
    d->state = utf8_ACCEPT;
    d->codep = 0xFFFD;

    const int consumed = (int)(s - first);
    if (consumed > 2)
        return consumed - 1;
    return 1;
}
/* Compare two string views code point by code point after case folding.
 *
 * Both views are decoded with utf8_decode_codepoint() and each pair of code
 * points is passed through utf8_casefold() before being compared.
 *
 * Returns:
 *  - the difference of the first pair of folded code points that differ;
 *  - if a NUL byte is reached in s2 first: the current byte of s1 as an
 *    unsigned value, i.e. 0 when s1 has a NUL there too and a positive
 *    number otherwise;
 *  - otherwise, once either view is exhausted, s1.size - s2.size
 *    converted to int.
 */
int utf8_icompare(const csview s1, const csview s2) {
    utf8_decode_t d1 = {.state=0}, d2 = {.state=0};
    const char *e1 = s1.buf + s1.size, *e2 = s2.buf + s2.size;
    isize j1 = 0, j2 = 0;

    while ((j1 < s1.size) & (j2 < s2.size)) {
        /* A NUL inside s2 ends the comparison. The byte of s1 is read as
         * unsigned: with a signed plain char a UTF-8 byte >= 0x80 would
         * turn into a negative result and wrongly order s1 before s2. */
        if (s2.buf[j2] == '\0') return (uint8_t)s1.buf[j1];

        j1 += utf8_decode_codepoint(&d1, s1.buf + j1, e1);
        j2 += utf8_decode_codepoint(&d2, s2.buf + j2, e2);

        int32_t c = (int32_t)utf8_casefold(d1.codep) - (int32_t)utf8_casefold(d2.codep);
        if (c != 0) return (int)c;
    }
    /* All compared code points matched: fall back to the size difference. */
    return (int)(s1.size - s2.size);
}
#endif // STC_UTF8_PRV_C_INCLUDED