impotent/stc/priv/utf8_prv.c

/* MIT License
 *
 * Copyright (c) 2025 Tyge Løvset
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef STC_UTF8_PRV_C_INCLUDED
#define STC_UTF8_PRV_C_INCLUDED

#include "utf8_tab.c"

const uint8_t utf8_dtab[] = {
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

int utf8_encode(char *out, uint32_t c) {
    if (c < 0x80U) {
        out[0] = (char) c;
        return 1;
    } else if (c < 0x0800U) {
        out[0] = (char) ((c>>6  & 0x1F) | 0xC0);
        out[1] = (char) ((c     & 0x3F) | 0x80);
        return 2;
    } else if (c < 0x010000U) {
        if ((c < 0xD800U) | (c >= 0xE000U)) {
            out[0] = (char) ((c>>12 & 0x0F) | 0xE0);
            out[1] = (char) ((c>>6  & 0x3F) | 0x80);
            out[2] = (char) ((c     & 0x3F) | 0x80);
            return 3;
        }
    } else if (c < 0x110000U) {
        out[0] = (char) ((c>>18 & 0x07) | 0xF0);
        out[1] = (char) ((c>>12 & 0x3F) | 0x80);
        out[2] = (char) ((c>>6  & 0x3F) | 0x80);
        out[3] = (char) ((c     & 0x3F) | 0x80);
        return 4;
    }
    return 0;
}

uint32_t utf8_peek_at(const char* s, isize offset) {
    return utf8_peek(utf8_offset(s, offset));
}

bool utf8_valid(const char* s) {
    utf8_decode_t d = {.state=0};
    while ((utf8_decode(&d, (uint8_t)*s) != utf8_REJECT) & (*s != '\0'))
        ++s;
    return d.state == utf8_ACCEPT;
}

bool utf8_valid_n(const char* s, isize nbytes) {
    utf8_decode_t d = {.state=0};
    for (; nbytes-- != 0; ++s)
        if ((utf8_decode(&d, (uint8_t)*s) == utf8_REJECT) | (*s == '\0'))
            break;
    return d.state == utf8_ACCEPT;
}

#define _binsearch(c, at, N, ret) do { \
    int _n = N, _i = 0, _mid = _n/2; \
    while (_n > 0) { \
        if (at(_i + _mid) < c) { \
            _i += _mid + 1; \
            _n -= _mid + 1; \
            _mid = _n*7/8; \
        } else { \
            _n = _mid; \
            _mid = _n/8; \
        } \
    } \
    ret = (_i >= N || at(_i) < c) ? N : _i; \
} while (0)

uint32_t utf8_casefold(uint32_t c) {
    #define _at_fold(idx) casemappings[idx].c2
    int i;
    _binsearch(c, _at_fold, casefold_len, i);
    if (i < casefold_len && casemappings[i].c1 <= c && c <= casemappings[i].c2) {
        const struct CaseMapping entry = casemappings[i];
        int d = entry.m2 - entry.c2;
        if (d == 1) return c + ((entry.c2 & 1U) == (c & 1U));
        return (uint32_t)((int)c + d);
    }
    return c;
}

uint32_t utf8_tolower(uint32_t c) {
    #define _at_upper(idx) casemappings[upcase_ind[idx]].c2
    int i, n = c_countof(upcase_ind);
    _binsearch(c, _at_upper, n, i);
    if (i < n) {
        const struct CaseMapping entry = casemappings[upcase_ind[i]];
        if (entry.c1 <= c && c <= entry.c2) {
            int d = entry.m2 - entry.c2;
            if (d == 1) return c + ((entry.c2 & 1U) == (c & 1U));
            return (uint32_t)((int)c + d);
        }
    }
    return c;
}

uint32_t utf8_toupper(uint32_t c) {
    #define _at_lower(idx) casemappings[lowcase_ind[idx]].m2
    int i, n = c_countof(lowcase_ind);
    _binsearch(c, _at_lower, n, i);
    if (i < n) {
        const struct CaseMapping entry = casemappings[lowcase_ind[i]];
        int d = entry.m2 - entry.c2;
        if (entry.c1 + (uint32_t)d <= c && c <= entry.m2) {
            if (d == 1) return c - ((entry.m2 & 1U) == (c & 1U));
            return (uint32_t)((int)c - d);
        }
    }
    return c;
}

int utf8_decode_codepoint(utf8_decode_t* d, const char* s, const char* end) { // s < end
    const char* start = s;
    do switch (utf8_decode(d, (uint8_t)*s++)) {
        case utf8_ACCEPT: return (int)(s - start);
        case utf8_REJECT: goto recover;
    } while (s != end);

    recover: // non-complete utf8 is also treated as utf8_REJECT
    d->state = utf8_ACCEPT;
    d->codep = 0xFFFD;
    //return 1;
    int n = (int)(s - start);
    return n > 2 ? n - 1 : 1;
}

int utf8_icompare(const csview s1, const csview s2) {
    utf8_decode_t d1 = {.state=0}, d2 = {.state=0};
    const char *e1 = s1.buf + s1.size, *e2 = s2.buf + s2.size;
    isize j1 = 0, j2 = 0;
    while ((j1 < s1.size) & (j2 < s2.size)) {
        if (s2.buf[j2] == '\0') return s1.buf[j1];

        j1 += utf8_decode_codepoint(&d1, s1.buf + j1, e1);
        j2 += utf8_decode_codepoint(&d2, s2.buf + j2, e2);

        int32_t c = (int32_t)utf8_casefold(d1.codep) - (int32_t)utf8_casefold(d2.codep);
        if (c != 0) return (int)c;
    }
    return (int)(s1.size - s2.size);
}

#endif // STC_UTF8_PRV_C_INCLUDED