178 lines
6.2 KiB
C
178 lines
6.2 KiB
C
/* MIT License
|
|
*
|
|
* Copyright (c) 2025 Tyge Løvset
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
* in the Software without restriction, including without limitation the rights
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in all
|
|
* copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
* SOFTWARE.
|
|
*/
|
|
#ifndef STC_UTF8_PRV_C_INCLUDED
|
|
#define STC_UTF8_PRV_C_INCLUDED
|
|
|
|
#include "utf8_tab.c"
|
|
|
|
/*
 * Lookup table with 364 entries in two parts.
 *
 *   [0..255]    256 entries; the row comments give the array index range.
 *   [256..363]  nine rows of twelve entries; every value is a multiple of 12.
 *
 * NOTE(review): the values match the table of Bjoern Hoehrmann's DFA-based
 * UTF-8 decoder (byte -> character class, then state+class -> next state).
 * It is presumably consumed by utf8_decode(), which is not defined in this
 * file -- confirm against that function before changing any value.
 */
const uint8_t utf8_dtab[] = {
    /* 0x00..0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10..0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20..0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30..0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40..0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50..0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60..0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70..0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80..0x8F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x90..0x9F */ 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    /* 0xA0..0xAF */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0xB0..0xBF */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0xC0..0xCF */ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xD0..0xDF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xE0..0xEF */ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
    /* 0xF0..0xFF */ 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    /* 256 +  0 */  0,12,24,36,60,96,84,12,12,12,48,72,
    /* 256 + 12 */ 12,12,12,12,12,12,12,12,12,12,12,12,
    /* 256 + 24 */ 12, 0,12,12,12,12,12, 0,12, 0,12,12,
    /* 256 + 36 */ 12,24,12,12,12,12,12,24,12,24,12,12,
    /* 256 + 48 */ 12,12,12,12,12,12,12,24,12,12,12,12,
    /* 256 + 60 */ 12,24,12,12,12,12,12,12,12,24,12,12,
    /* 256 + 72 */ 12,12,12,12,12,12,12,36,12,36,12,12,
    /* 256 + 84 */ 12,36,12,12,12,12,12,36,12,36,12,12,
    /* 256 + 96 */ 12,36,12,12,12,12,12,12,12,12,12,12,
};
|
|
|
|
/*
 * Encode code point `c` as UTF-8 into `out`.
 *
 * Writes 1 to 4 bytes (no NUL terminator is appended) and returns the number
 * of bytes written. Returns 0 and leaves `out` untouched when `c` is in the
 * surrogate range 0xD800..0xDFFF or is 0x110000 or above.
 * `out` must have room for the bytes written (at most 4).
 */
int utf8_encode(char *out, uint32_t c) {
    if (c < 0x80U) {
        // Single byte: the value itself.
        out[0] = (char)c;
        return 1;
    }

    if (c < 0x0800U) {
        // Two bytes: 110xxxxx 10xxxxxx
        out[0] = (char)(0xC0U | ((c >> 6) & 0x1FU));
        out[1] = (char)(0x80U | (c & 0x3FU));
        return 2;
    }

    if (c < 0x010000U) {
        // Surrogate halves are not encodable.
        if (c >= 0xD800U && c < 0xE000U)
            return 0;
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        out[0] = (char)(0xE0U | ((c >> 12) & 0x0FU));
        out[1] = (char)(0x80U | ((c >> 6) & 0x3FU));
        out[2] = (char)(0x80U | (c & 0x3FU));
        return 3;
    }

    if (c < 0x110000U) {
        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        out[0] = (char)(0xF0U | ((c >> 18) & 0x07U));
        out[1] = (char)(0x80U | ((c >> 12) & 0x3FU));
        out[2] = (char)(0x80U | ((c >> 6) & 0x3FU));
        out[3] = (char)(0x80U | (c & 0x3FU));
        return 4;
    }

    return 0;
}
|
|
|
|
/*
 * Return utf8_peek() applied to the position that utf8_offset(s, offset)
 * yields. Both helpers are declared outside this file.
 * NOTE(review): presumably `offset` counts code points rather than bytes --
 * confirm against utf8_offset().
 */
uint32_t utf8_peek_at(const char* s, isize offset) {
    const char* pos = utf8_offset(s, offset);
    return utf8_peek(pos);
}
|
|
|
|
bool utf8_valid(const char* s) {
|
|
utf8_decode_t d = {.state=0};
|
|
while ((utf8_decode(&d, (uint8_t)*s) != utf8_REJECT) & (*s != '\0'))
|
|
++s;
|
|
return d.state == utf8_ACCEPT;
|
|
}
|
|
|
|
/*
 * Check at most `nbytes` bytes of `s` with the utf8_decode() state machine.
 *
 * Scanning stops early at the first utf8_REJECT or after a NUL byte has been
 * fed to the decoder. Returns true when the decoder state is utf8_ACCEPT once
 * scanning stops (including when nbytes is 0).
 */
bool utf8_valid_n(const char* s, isize nbytes) {
    utf8_decode_t d = {.state=0};

    while (nbytes-- != 0) {
        const char byte = *s++;
        if (utf8_decode(&d, (uint8_t)byte) == utf8_REJECT)
            break;
        if (byte == '\0')
            break;
    }
    return d.state == utf8_ACCEPT;
}
|
|
|
|
/*
 * _binsearch(c, at, N, ret): lower-bound search over N ascending elements.
 *
 *   c    key to look for (any expression; expanded more than once, so it
 *        should be free of side effects)
 *   at   function-like macro or function: at(i) yields element i
 *   N    number of elements (evaluated once, converted to int)
 *   ret  int lvalue; receives the first index i with !(at(i) < c),
 *        or N when every element is less than c
 *
 * The probe position is deliberately uneven: after moving right the next
 * probe sits 7/8 into the remaining range, after moving left it sits 1/8 in.
 * The result is the same as for a midpoint search.
 *
 * `c` and `N` are parenthesized so that keys such as `x & 0xFF` compare as a
 * whole instead of being split by operator precedence.
 */
#define _binsearch(c, at, N, ret) do { \
    const int _len = (int)(N); \
    int _n = _len, _i = 0, _mid = _n/2; \
    while (_n > 0) { \
        if (at(_i + _mid) < (c)) { \
            _i += _mid + 1; \
            _n -= _mid + 1; \
            _mid = _n*7/8; \
        } else { \
            _n = _mid; \
            _mid = _n/8; \
        } \
    } \
    ret = (_i >= _len || at(_i) < (c)) ? _len : _i; \
} while (0)
|
|
|
|
uint32_t utf8_casefold(uint32_t c) {
|
|
#define _at_fold(idx) casemappings[idx].c2
|
|
int i;
|
|
_binsearch(c, _at_fold, casefold_len, i);
|
|
if (i < casefold_len && casemappings[i].c1 <= c && c <= casemappings[i].c2) {
|
|
const struct CaseMapping entry = casemappings[i];
|
|
int d = entry.m2 - entry.c2;
|
|
if (d == 1) return c + ((entry.c2 & 1U) == (c & 1U));
|
|
return (uint32_t)((int)c + d);
|
|
}
|
|
return c;
|
|
}
|
|
|
|
uint32_t utf8_tolower(uint32_t c) {
|
|
#define _at_upper(idx) casemappings[upcase_ind[idx]].c2
|
|
int i, n = c_countof(upcase_ind);
|
|
_binsearch(c, _at_upper, n, i);
|
|
if (i < n) {
|
|
const struct CaseMapping entry = casemappings[upcase_ind[i]];
|
|
if (entry.c1 <= c && c <= entry.c2) {
|
|
int d = entry.m2 - entry.c2;
|
|
if (d == 1) return c + ((entry.c2 & 1U) == (c & 1U));
|
|
return (uint32_t)((int)c + d);
|
|
}
|
|
}
|
|
return c;
|
|
}
|
|
|
|
uint32_t utf8_toupper(uint32_t c) {
|
|
#define _at_lower(idx) casemappings[lowcase_ind[idx]].m2
|
|
int i, n = c_countof(lowcase_ind);
|
|
_binsearch(c, _at_lower, n, i);
|
|
if (i < n) {
|
|
const struct CaseMapping entry = casemappings[lowcase_ind[i]];
|
|
int d = entry.m2 - entry.c2;
|
|
if (entry.c1 + (uint32_t)d <= c && c <= entry.m2) {
|
|
if (d == 1) return c - ((entry.m2 & 1U) == (c & 1U));
|
|
return (uint32_t)((int)c - d);
|
|
}
|
|
}
|
|
return c;
|
|
}
|
|
|
|
/*
 * Decode one code point from [s, end) into `d` and return how many bytes the
 * caller should advance.
 *
 * Precondition: s < end (the first byte is read unconditionally).
 *
 * Bytes are fed to utf8_decode() until it reports utf8_ACCEPT, reports
 * utf8_REJECT, or `end` is reached. On utf8_ACCEPT the number of bytes
 * consumed is returned. Otherwise (reject, or input ran out mid-sequence) the
 * decoder is reset to utf8_ACCEPT, d->codep is set to U+FFFD, and the
 * function returns n - 1 when n > 2 bytes were read, else 1.
 */
int utf8_decode_codepoint(utf8_decode_t* d, const char* s, const char* end) {
    const char* const first = s;
    int rejected = 0;

    do {
        switch (utf8_decode(d, (uint8_t)*s++)) {
            case utf8_ACCEPT:
                return (int)(s - first);
            case utf8_REJECT:
                rejected = 1;
                break;
            default:
                break;
        }
    } while (!rejected && s != end);

    // Rejected or truncated input: substitute the replacement character.
    d->state = utf8_ACCEPT;
    d->codep = 0xFFFD;

    const int consumed = (int)(s - first);
    if (consumed > 2)
        return consumed - 1;
    return 1;
}
|
|
|
|
int utf8_icompare(const csview s1, const csview s2) {
|
|
utf8_decode_t d1 = {.state=0}, d2 = {.state=0};
|
|
const char *e1 = s1.buf + s1.size, *e2 = s2.buf + s2.size;
|
|
isize j1 = 0, j2 = 0;
|
|
while ((j1 < s1.size) & (j2 < s2.size)) {
|
|
if (s2.buf[j2] == '\0') return s1.buf[j1];
|
|
|
|
j1 += utf8_decode_codepoint(&d1, s1.buf + j1, e1);
|
|
j2 += utf8_decode_codepoint(&d2, s2.buf + j2, e2);
|
|
|
|
int32_t c = (int32_t)utf8_casefold(d1.codep) - (int32_t)utf8_casefold(d2.codep);
|
|
if (c != 0) return (int)c;
|
|
}
|
|
return (int)(s1.size - s2.size);
|
|
}
|
|
|
|
#endif // STC_UTF8_PRV_C_INCLUDED
|