nctref/src/lexer.c
2025-05-03 10:00:20 +03:00

381 lines
6.9 KiB
C

#include"lexer.h"
#include<stdlib.h>
#include<assert.h>
#include<string.h>
#include"reporting.h"
/*
 * Printable name for every token kind.
 * Entries must stay in the same order as the TokenKind enum in src/lexer.h;
 * presumably the table is indexed by a token's kind when building
 * diagnostics (confirm against the users of TOKEN_NAMES).
 * Keywords and punctuation carry their own single quotes; token classes
 * (identifier, EOF, number, string) are left bare.
 */
char *TOKEN_NAMES[] = {
"identifier",
"'local'",
"EOF",
"number",
"';'",
"':'",
"'if'",
"'('",
"')'",
"'{'",
"'}'",
"'='",
"'+'",
"'-'",
"'*'",
"'/'",
"'extern'",
"'loop'",
"'break'",
"','",
"'&'",
"'|'",
"'^'",
"'~'",
"'=='",
"'['",
"']'",
"'?'",
"string",
"'!='",
"'!'",
"'continue'",
"'return'",
"'->'",
"'<='",
"'>='",
"'<'",
"'>'",
"'*^'",
"'record'",
"'.'",
"'as'",
"'use'",
};
/* Returns 1 when c is an unaccented Latin letter (a-z or A-Z), 0 otherwise. */
static int isAlpha(int c) {
	if('a' <= c && c <= 'z') {
		return 1;
	}
	if('A' <= c && c <= 'Z') {
		return 1;
	}
	return 0;
}
/* Returns 1 when c is a decimal digit ('0'..'9'), 0 otherwise. */
static int isNum(int c) {
	if(c < '0') {
		return 0;
	}
	return c <= '9';
}
/* Returns 1 when c is a letter (per isAlpha) or a decimal digit (per isNum), 0 otherwise. */
static int isAlphanum(int c) {
	if(isAlpha(c)) {
		return 1;
	}
	return isNum(c) != 0;
}
/*
 * Returns 1 when c is one of the characters the lexer skips between tokens:
 * space, newline, carriage return, backspace ('\b') or tab; 0 otherwise.
 * Vertical tab and form feed are not in the set.
 */
static int isWS(int c) {
	switch(c) {
	case ' ':
	case '\n':
	case '\r':
	case '\b':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
/*
 * Reader state. These are file-level statics, so they are shared by every
 * call to nextc/pushc regardless of which FILE is passed in.
 */
/* Line counter: starts at 1 and is incremented by nextc each time it reads '\n'. */
static size_t currentRow = 1;
/* Count of characters nextc has read since the last '\n'; reset to 0 on each '\n'. */
static size_t currentColumn = 0;
/* One-character pushback slot written by pushc and drained by nextc; EOF means "empty". */
static int ungetted = EOF;
/*
 * Return the next input character, or EOF at end of input.
 *
 * A character stored by pushc takes priority and is handed back without
 * touching the position counters (it was already counted when first read).
 * Otherwise a character is read from f: '\n' advances currentRow and resets
 * currentColumn, any other non-EOF character advances currentColumn.
 */
int nextc(FILE *f) {
	int ch = ungetted;
	if(ch != EOF) {
		ungetted = EOF;
		return ch;
	}

	ch = fgetc(f);
	if(ch == EOF) {
		return ch;
	}

	if(ch == '\n') {
		currentRow++;
		currentColumn = 0;
	} else {
		currentColumn++;
	}
	return ch;
}
/*
 * Store c in the one-character pushback slot so that the next nextc call
 * returns it. Any character already in the slot is overwritten, and pushing
 * EOF leaves the slot empty. The stream argument is accepted but not used.
 */
void pushc(int c, FILE *f) {
	(void) f;
	ungetted = c;
}
Token nct_tokenize(FILE *f) {
Token tok;
tok.content = NULL;
tok.row = currentRow;
tok.column = currentColumn;
int c = nextc(f);
if(c == EOF) {
tok.type = TOKEN_EOF;
return tok;
}
if(c == ';') {
tok.type = TOKEN_SEMICOLON;
return tok;
} else if(c == ':') {
tok.type = TOKEN_COLON;
return tok;
} else if(c == '(') {
tok.type = TOKEN_PAREN_L;
return tok;
} else if(c == ')') {
tok.type = TOKEN_PAREN_R;
return tok;
} else if(c == '{') {
tok.type = TOKEN_SQUIGGLY_L;
return tok;
} else if(c == '}') {
tok.type = TOKEN_SQUIGGLY_R;
return tok;
} else if(c == '+') {
tok.type = TOKEN_PLUS;
return tok;
} else if(c == '-') {
tok.type = TOKEN_MINUS;
int c = nextc(f);
if(c == '>') {
tok.type = TOKEN_ARROW;
} else ungetc(c, f);
return tok;
} else if(c == '*') {
tok.type = TOKEN_STAR;
int c = nextc(f);
if(c == '^') {
tok.type = TOKEN_STAR_CARET;
} else ungetc(c, f);
return tok;
} else if(c == '&') {
tok.type = TOKEN_AMPERSAND;
return tok;
} else if(c == '|') {
tok.type = TOKEN_VERTICAL_BAR;
return tok;
} else if(c == '^') {
tok.type = TOKEN_CARET;
return tok;
} else if(c == '~') {
tok.type = TOKEN_TILDE;
return tok;
} else if(c == '[') {
tok.type = TOKEN_SQUAREN_L;
return tok;
} else if(c == ']') {
tok.type = TOKEN_SQUAREN_R;
return tok;
} else if(c == '?') {
tok.type = TOKEN_QUESTION_MARK;
return tok;
} else if(c == '!') {
tok.type = TOKEN_EXCLAMATION;
int c = nextc(f);
if(c == '=') {
tok.type = TOKEN_EXCLAMATION_EQUALS;
} else ungetc(c, f);
return tok;
} else if(c == '<') {
tok.type = TOKEN_LESS;
int c = nextc(f);
if(c == '=') {
tok.type = TOKEN_LEQUAL;
} else ungetc(c, f);
return tok;
} else if(c == '>') {
tok.type = TOKEN_GREATER;
int c = nextc(f);
if(c == '=') {
tok.type = TOKEN_GEQUAL;
} else ungetc(c, f);
return tok;
} else if(c == '/') {
int c = nextc(f);
if(c == '*') { /* This is a comment; skip. */
while(1) {
while((c = nextc(f)) != '*');
if(nextc(f) == '/') {
return nct_tokenize(f);
}
}
} else {
ungetc(c, f);
tok.type = TOKEN_SLASH;
return tok;
}
} else if(c == '=') {
tok.type = TOKEN_EQUALS;
int c = nextc(f);
if(c == '=') {
tok.type = TOKEN_DOUBLE_EQUALS;
} else ungetc(c, f);
return tok;
} else if(c == ',') {
tok.type = TOKEN_COMMA;
return tok;
} else if(c == '.') {
tok.type = TOKEN_DOT;
return tok;
} else if(c == '"') {
int capacity = 5;
char *content = malloc(capacity);
size_t i = 0;
int c;
while(c = nextc(f), c != '"') {
if(i == capacity - 1) {
content = realloc(content, capacity += 4);
}
if(c == '\\') {
c = nextc(f);
if(c == '0') c = 0;
else if(c == 'n') c = '\n';
else if(c == 't') c = '\t';
}
content[i++] = c;
}
content[i] = 0;
tok.type = TOKEN_STRING;
tok.content = content;
tok.length = i;
return tok;
} else if(isAlpha(c) || c == '@' || c == '_') {
int capacity = 5;
char *content = malloc(capacity);
size_t i = 0;
content[i++] = c;
while(c = nextc(f), (isAlphanum(c) || c == '@' || c == '_')) {
if(i == capacity - 1) {
content = realloc(content, capacity += 4);
}
content[i++] = c;
}
pushc(c, f);
content[i] = 0;
if(!strcmp(content, "local")) {
free(content);
tok.type = TOKEN_LOCAL;
return tok;
} else if(!strcmp(content, "if")) {
free(content);
tok.type = TOKEN_IF;
return tok;
} else if(!strcmp(content, "extern")) {
free(content);
tok.type = TOKEN_EXTERN;
return tok;
} else if(!strcmp(content, "loop")) {
free(content);
tok.type = TOKEN_LOOP;
return tok;
} else if(!strcmp(content, "break")) {
free(content);
tok.type = TOKEN_BREAK;
return tok;
} else if(!strcmp(content, "continue")) {
free(content);
tok.type = TOKEN_CONTINUE;
return tok;
} else if(!strcmp(content, "return")) {
free(content);
tok.type = TOKEN_RETURN;
return tok;
} else if(!strcmp(content, "record")) {
free(content);
tok.type = TOKEN_RECORD;
return tok;
} else if(!strcmp(content, "as")) {
free(content);
tok.type = TOKEN_AS;
return tok;
} else if(!strcmp(content, "use")) {
free(content);
tok.type = TOKEN_USE;
return tok;
}
tok.type = TOKEN_IDENTIFIER;
tok.content = content;
return tok;
} else if(isNum(c)) {
int capacity = 32;
char *content = malloc(capacity);
size_t i = 0;
content[i++] = c;
while(c = nextc(f), isNum(c)) {
if(i == capacity - 1) {
content = realloc(content, capacity += 4);
}
content[i++] = c;
}
content[i] = 0;
int base = strtol(content, NULL, 10);
if(c == 'r') {
content[i++] = c;
while(c = nextc(f), (isNum(c) || (base > 10 && c >= 'A' && c < ('A' + base - 10)))) {
if(i == 31) {
stahp_token(&tok, "Numbers have a maximum size of 31.");
}
content[i++] = c;
}
}
pushc(c, f);
tok.type = TOKEN_NUMBER;
tok.content = content;
return tok;
} else if(isWS(c)) {
int c;
while(c = nextc(f), isWS(c)) {
}
pushc(c, f);
return nct_tokenize(f);
}
stahp(currentRow, currentColumn, "Invalid character '%c' (byte %i)", c, c);
}
/*
 * Tokenize the whole of f.
 *
 * Returns a heap-allocated array of tokens whose last element has type
 * TOKEN_EOF; release it with nct_lex_free. The array starts with room for
 * 8 tokens and doubles whenever it fills up. The process is aborted if
 * memory cannot be allocated, so the function never returns NULL.
 */
Token *nct_lex(FILE *f) {
	size_t capacity = 8, count = 0;
	Token *list = malloc(sizeof(*list) * capacity);
	if(!list) {
		fputs("nct_lex: out of memory\n", stderr);
		abort();
	}

	for(;;) {
		list[count] = nct_tokenize(f);
		if(list[count].type == TOKEN_EOF) {
			return list;
		}
		count++;

		if(count == capacity) {
			/* Grow through a temporary so a failed realloc does not lose the array. */
			Token *grown = realloc(list, sizeof(*list) * capacity * 2);
			if(!grown) {
				fputs("nct_lex: out of memory\n", stderr);
				abort();
			}
			list = grown;
			capacity *= 2;
		}
	}
}
/*
 * Release a token array: frees the content of every token up to (not
 * including) the terminating TOKEN_EOF entry, then the array itself.
 * Passing NULL is a no-op, mirroring free().
 */
void nct_lex_free(Token *tokens) {
	if(!tokens) {
		return;
	}
	for(Token *t = tokens; t->type != TOKEN_EOF; t++) {
		/* free(NULL) is defined to do nothing, so tokens without content need no guard. */
		free(t->content);
	}
	free(tokens);
}