impotent/lexer.c
2025-09-03 01:03:45 +03:00

160 lines
3.7 KiB
C

#define _GNU_SOURCE
#define _POSIX_C_SOURCE 200809L
#define i_implement
#include"lexer.h"
// One reserved word: its spelling paired with the token type it lexes to.
// KWS initializes these entries positionally, so the field order matters.
typedef struct {
const char *txt; // keyword spelling that is_kw compares against source text
TokenType type; // token type is_kw returns when the spelling matches
} KW;
// Reserved words of the language, scanned linearly by is_kw.
// The table is read-only lookup data, so it is declared const.
// Entry order is irrelevant: is_kw requires an exact-length match, which
// keeps "else" from shadowing "elseif".
static const KW KWS[] = {
    {"return", TOK_RETURN},
    {"if", TOK_IF},
    {"break", TOK_BREAK},
    {"goto", TOK_GOTO},
    {"end", TOK_END},
    {"do", TOK_DO},
    {"for", TOK_FOR},
    {"repeat", TOK_REPEAT},
    {"until", TOK_UNTIL},
    {"local", TOK_LOCAL},
    {"then", TOK_THEN},
    {"else", TOK_ELSE},
    {"elseif", TOK_ELSEIF},
    {"false", TOK_FALSE},
    {"true", TOK_TRUE},
    {"nil", TOK_NIL},
    {"function", TOK_FUNCTION},
    {"while", TOK_WHILE},
};
/**
 * Classify an identifier-shaped lexeme.
 *
 * @param str  start of the lexeme; it does not need to be NUL-terminated
 * @param len  number of bytes of str that make up the lexeme
 * @return the matching entry's token type when the lexeme is exactly one of
 *         the spellings in KWS, otherwise TOK_NAME
 */
static TokenType is_kw(const char *str, size_t len) {
    // sizeof yields size_t, so the index is size_t as well to avoid a
    // signed/unsigned comparison.
    const size_t count = sizeof(KWS) / sizeof(*KWS);

    for (size_t i = 0; i < count; i++) {
        const char *kw = KWS[i].txt;
        // Check the length first so a keyword that is merely a prefix of the
        // lexeme (or the other way round) does not match.
        if (len == strlen(kw) && strncmp(kw, str, len) == 0) {
            return KWS[i].type;
        }
    }
    return TOK_NAME;
}
/**
 * Split the first `len` bytes at `buf` into a vector of tokens.
 *
 * `buf` does not have to be NUL-terminated; nothing at or past `buf + len`
 * is read.
 *
 * Token contents:
 *   - TOK_NAME and TOK_NUMBER carry a strndup'd copy of their lexeme in .text.
 *     A '-' immediately followed by a digit is lexed as part of the number.
 *   - TOK_STRING carries the raw bytes between the quotes in .text. Escape
 *     sequences are copied verbatim, not decoded. A string that is still open
 *     when the input ends takes everything up to the end of the input.
 *   - Keyword and punctuation tokens leave .text NULL.
 *
 * strndup results are not checked for allocation failure.
 * A byte that cannot start any token trips an assert.
 *
 * @return the tokens in source order; lfreetoks releases the vector together
 *         with the text it owns
 */
vec_Token ltokenize(const char *buf, size_t len) {
    vec_Token tokens = {0};

    while (len) {
        // <ctype.h> classifiers are only defined for unsigned char values
        // (or EOF), so never hand them a plain, possibly negative, char.
        const unsigned char c = (unsigned char) buf[0];

        if (isspace(c)) {
            buf++, len--;
        } else if (isalpha(c)) {
            size_t idlen = 0;
            while (idlen < len && isalnum((unsigned char) buf[idlen])) {
                idlen++;
            }
            TokenType tt = is_kw(buf, idlen);
            // Only plain names keep their spelling; keywords are fully
            // described by their token type.
            vec_Token_push(&tokens, (Token) {.text = tt == TOK_NAME ? strndup(buf, idlen) : NULL, .type = tt});
            buf += idlen, len -= idlen;
        } else if (c == '+') {
            vec_Token_push(&tokens, (Token) {.type = TOK_PLUS});
            buf++, len--;
        } else if (c == '=') {
            if (len > 1 && buf[1] == '=') {
                vec_Token_push(&tokens, (Token) {.type = TOK_DOUBLE_EQUAL});
                buf += 2, len -= 2;
            } else {
                vec_Token_push(&tokens, (Token) {.type = TOK_EQUAL});
                buf++, len--;
            }
        } else if (c == '(') {
            vec_Token_push(&tokens, (Token) {.type = TOK_PAREN_L});
            buf++, len--;
        } else if (c == ')') {
            vec_Token_push(&tokens, (Token) {.type = TOK_PAREN_R});
            buf++, len--;
        } else if (c == '[') {
            vec_Token_push(&tokens, (Token) {.type = TOK_SQUAREN_L});
            buf++, len--;
        } else if (c == ']') {
            vec_Token_push(&tokens, (Token) {.type = TOK_SQUAREN_R});
            buf++, len--;
        } else if (c == '.') {
            vec_Token_push(&tokens, (Token) {.type = TOK_DOT});
            buf++, len--;
        } else if (c == ',') {
            vec_Token_push(&tokens, (Token) {.type = TOK_COMMA});
            buf++, len--;
        } else if (c == '%') {
            vec_Token_push(&tokens, (Token) {.type = TOK_PERCENT});
            buf++, len--;
        } else if (c == '{') {
            vec_Token_push(&tokens, (Token) {.type = TOK_SQUIGGLY_L});
            buf++, len--;
        } else if (c == '}') {
            vec_Token_push(&tokens, (Token) {.type = TOK_SQUIGGLY_R});
            buf++, len--;
        } else if (len > 1 && c == '~' && buf[1] == '=') {
            vec_Token_push(&tokens, (Token) {.type = TOK_NOT_EQUAL});
            buf += 2, len -= 2;
        } else if (isdigit(c) || (len > 1 && c == '-' && isdigit((unsigned char) buf[1]))) {
            size_t numlen = 0;
            if (c == '-') {
                numlen++;  // the sign is part of the lexeme
            }
            while (numlen < len && isdigit((unsigned char) buf[numlen])) {
                numlen++;
            }
            vec_Token_push(&tokens, (Token) {.text = strndup(buf, numlen), .type = TOK_NUMBER});
            buf += numlen, len -= numlen;
        } else if (c == '\'' || c == '\"') {
            const char quote = buf[0];
            buf++, len--;  // step over the opening quote

            // Scan from the very first body byte so that "" and a leading
            // escape are handled, and never let `body` run past `len`.
            size_t body = 0;
            bool closed = false;
            while (body < len) {
                if (buf[body] == '\\' && body + 1 < len) {
                    body += 2;  // keep the escape pair verbatim
                } else if (buf[body] == quote) {
                    closed = true;
                    break;
                } else {
                    body++;
                }
            }
            char *str = strndup(buf, body);
            // TODO: unescaping
            vec_Token_push(&tokens, (Token) {.text = str, .type = TOK_STRING});

            // Consume the closing quote only when there actually was one;
            // otherwise `len` would wrap around below zero.
            size_t consumed = body + (closed ? 1 : 0);
            buf += consumed, len -= consumed;
        } else {
            assert(false);
            // With NDEBUG the assert vanishes; step over the byte so the
            // loop still terminates instead of spinning on it forever.
            buf++, len--;
        }
    }
    return tokens;
}
/**
 * Free the text of every token in `toks`, then release the vector's own
 * storage with vec_Token_drop.
 *
 * @param toks  vector to destroy; passing NULL is a no-op
 */
void lfreetoks(vec_Token *toks) {
    if (!toks) {
        return;
    }
    for (size_t i = 0; i < toks->size; i++) {
        // Tokens without text hold NULL here; free(NULL) does nothing, so no
        // guard is needed.
        free(toks->data[i].text);
    }
    vec_Token_drop(toks);
}