381 lines
6.9 KiB
C
381 lines
6.9 KiB
C
#include"lexer.h"
|
|
#include<stdlib.h>
|
|
#include<assert.h>
|
|
#include<string.h>
|
|
#include"reporting.h"
|
|
|
|
/*
 * Printable names of the token kinds.
 * The entries must stay in the same order as the TokenKind enum in
 * src/lexer.h; the number in front of each row is the index of its first entry.
 */
char *TOKEN_NAMES[] = {
	/*  0 */ "identifier", "'local'", "EOF", "number",
	/*  4 */ "';'", "':'", "'if'", "'('",
	/*  8 */ "')'", "'{'", "'}'", "'='",
	/* 12 */ "'+'", "'-'", "'*'", "'/'",
	/* 16 */ "'extern'", "'loop'", "'break'", "','",
	/* 20 */ "'&'", "'|'", "'^'", "'~'",
	/* 24 */ "'=='", "'['", "']'", "'?'",
	/* 28 */ "string", "'!='", "'!'", "'continue'",
	/* 32 */ "'return'", "'->'", "'<='", "'>='",
	/* 36 */ "'<'", "'>'", "'*^'", "'record'",
	/* 40 */ "'.'", "'as'", "'use'",
};
|
|
|
|
/* Returns 1 when `c` is an ASCII letter (a-z or A-Z), 0 otherwise. */
static int isAlpha(int c) {
	int lower = 'a' <= c && c <= 'z';
	int upper = 'A' <= c && c <= 'Z';

	return lower || upper;
}
|
|
|
|
/* Returns 1 when `c` is an ASCII decimal digit (0-9), 0 otherwise. */
static int isNum(int c) {
	if(c < '0' || c > '9') {
		return 0;
	}

	return 1;
}
|
|
|
|
/* Returns 1 when `c` is an ASCII letter or decimal digit, 0 otherwise. */
static int isAlphanum(int c) {
	if(isAlpha(c)) {
		return 1;
	}

	return isNum(c) ? 1 : 0;
}
|
|
|
|
/*
 * Returns 1 when `c` is one of the characters the lexer skips between tokens:
 * space, newline, carriage return, backspace or horizontal tab.
 *
 * NOTE(review): '\b' is backspace; confirm it is intended (rather than
 * '\v' or '\f') before changing it.
 */
static int isWS(int c) {
	switch(c) {
	case ' ':
	case '\n':
	case '\r':
	case '\b':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
|
|
|
|
/* One-character pushback slot filled by pushc(); EOF means "empty". */
static int ungetted = EOF;

/* Line counter: starts at 1 and is bumped by nextc() on every '\n' it reads. */
static size_t currentRow = 1;

/* Characters read on the current line; nextc() resets it to 0 on '\n'. */
static size_t currentColumn = 0;
|
|
|
|
int nextc(FILE *f) {
|
|
if(ungetted != EOF) {
|
|
int ret = ungetted;
|
|
ungetted = EOF;
|
|
return ret;
|
|
}
|
|
|
|
int c = fgetc(f);
|
|
if(c == '\n') {
|
|
currentRow++;
|
|
currentColumn = 0;
|
|
} else if(c != EOF) {
|
|
currentColumn++;
|
|
}
|
|
return c;
|
|
}
|
|
|
|
void pushc(int c, FILE *f) {
|
|
ungetted = c;
|
|
}
|
|
|
|
Token nct_tokenize(FILE *f) {
|
|
Token tok;
|
|
tok.content = NULL;
|
|
tok.row = currentRow;
|
|
tok.column = currentColumn;
|
|
|
|
int c = nextc(f);
|
|
|
|
if(c == EOF) {
|
|
tok.type = TOKEN_EOF;
|
|
return tok;
|
|
}
|
|
|
|
if(c == ';') {
|
|
tok.type = TOKEN_SEMICOLON;
|
|
return tok;
|
|
} else if(c == ':') {
|
|
tok.type = TOKEN_COLON;
|
|
return tok;
|
|
} else if(c == '(') {
|
|
tok.type = TOKEN_PAREN_L;
|
|
return tok;
|
|
} else if(c == ')') {
|
|
tok.type = TOKEN_PAREN_R;
|
|
return tok;
|
|
} else if(c == '{') {
|
|
tok.type = TOKEN_SQUIGGLY_L;
|
|
return tok;
|
|
} else if(c == '}') {
|
|
tok.type = TOKEN_SQUIGGLY_R;
|
|
return tok;
|
|
} else if(c == '+') {
|
|
tok.type = TOKEN_PLUS;
|
|
return tok;
|
|
} else if(c == '-') {
|
|
tok.type = TOKEN_MINUS;
|
|
int c = nextc(f);
|
|
if(c == '>') {
|
|
tok.type = TOKEN_ARROW;
|
|
} else ungetc(c, f);
|
|
return tok;
|
|
} else if(c == '*') {
|
|
tok.type = TOKEN_STAR;
|
|
int c = nextc(f);
|
|
if(c == '^') {
|
|
tok.type = TOKEN_STAR_CARET;
|
|
} else ungetc(c, f);
|
|
return tok;
|
|
} else if(c == '&') {
|
|
tok.type = TOKEN_AMPERSAND;
|
|
return tok;
|
|
} else if(c == '|') {
|
|
tok.type = TOKEN_VERTICAL_BAR;
|
|
return tok;
|
|
} else if(c == '^') {
|
|
tok.type = TOKEN_CARET;
|
|
return tok;
|
|
} else if(c == '~') {
|
|
tok.type = TOKEN_TILDE;
|
|
return tok;
|
|
} else if(c == '[') {
|
|
tok.type = TOKEN_SQUAREN_L;
|
|
return tok;
|
|
} else if(c == ']') {
|
|
tok.type = TOKEN_SQUAREN_R;
|
|
return tok;
|
|
} else if(c == '?') {
|
|
tok.type = TOKEN_QUESTION_MARK;
|
|
return tok;
|
|
} else if(c == '!') {
|
|
tok.type = TOKEN_EXCLAMATION;
|
|
int c = nextc(f);
|
|
if(c == '=') {
|
|
tok.type = TOKEN_EXCLAMATION_EQUALS;
|
|
} else ungetc(c, f);
|
|
return tok;
|
|
} else if(c == '<') {
|
|
tok.type = TOKEN_LESS;
|
|
int c = nextc(f);
|
|
if(c == '=') {
|
|
tok.type = TOKEN_LEQUAL;
|
|
} else ungetc(c, f);
|
|
return tok;
|
|
} else if(c == '>') {
|
|
tok.type = TOKEN_GREATER;
|
|
int c = nextc(f);
|
|
if(c == '=') {
|
|
tok.type = TOKEN_GEQUAL;
|
|
} else ungetc(c, f);
|
|
return tok;
|
|
} else if(c == '/') {
|
|
int c = nextc(f);
|
|
if(c == '*') { /* This is a comment; skip. */
|
|
while(1) {
|
|
while((c = nextc(f)) != '*');
|
|
if(nextc(f) == '/') {
|
|
return nct_tokenize(f);
|
|
}
|
|
}
|
|
} else {
|
|
ungetc(c, f);
|
|
tok.type = TOKEN_SLASH;
|
|
return tok;
|
|
}
|
|
} else if(c == '=') {
|
|
tok.type = TOKEN_EQUALS;
|
|
int c = nextc(f);
|
|
if(c == '=') {
|
|
tok.type = TOKEN_DOUBLE_EQUALS;
|
|
} else ungetc(c, f);
|
|
return tok;
|
|
} else if(c == ',') {
|
|
tok.type = TOKEN_COMMA;
|
|
return tok;
|
|
} else if(c == '.') {
|
|
tok.type = TOKEN_DOT;
|
|
return tok;
|
|
} else if(c == '"') {
|
|
int capacity = 5;
|
|
char *content = malloc(capacity);
|
|
|
|
size_t i = 0;
|
|
int c;
|
|
while(c = nextc(f), c != '"') {
|
|
if(i == capacity - 1) {
|
|
content = realloc(content, capacity += 4);
|
|
}
|
|
|
|
if(c == '\\') {
|
|
c = nextc(f);
|
|
|
|
if(c == '0') c = 0;
|
|
else if(c == 'n') c = '\n';
|
|
else if(c == 't') c = '\t';
|
|
}
|
|
|
|
content[i++] = c;
|
|
}
|
|
|
|
content[i] = 0;
|
|
|
|
tok.type = TOKEN_STRING;
|
|
tok.content = content;
|
|
tok.length = i;
|
|
return tok;
|
|
} else if(isAlpha(c) || c == '@' || c == '_') {
|
|
int capacity = 5;
|
|
char *content = malloc(capacity);
|
|
|
|
size_t i = 0;
|
|
content[i++] = c;
|
|
|
|
while(c = nextc(f), (isAlphanum(c) || c == '@' || c == '_')) {
|
|
if(i == capacity - 1) {
|
|
content = realloc(content, capacity += 4);
|
|
}
|
|
|
|
content[i++] = c;
|
|
}
|
|
|
|
pushc(c, f);
|
|
|
|
content[i] = 0;
|
|
if(!strcmp(content, "local")) {
|
|
free(content);
|
|
tok.type = TOKEN_LOCAL;
|
|
return tok;
|
|
} else if(!strcmp(content, "if")) {
|
|
free(content);
|
|
tok.type = TOKEN_IF;
|
|
return tok;
|
|
} else if(!strcmp(content, "extern")) {
|
|
free(content);
|
|
tok.type = TOKEN_EXTERN;
|
|
return tok;
|
|
} else if(!strcmp(content, "loop")) {
|
|
free(content);
|
|
tok.type = TOKEN_LOOP;
|
|
return tok;
|
|
} else if(!strcmp(content, "break")) {
|
|
free(content);
|
|
tok.type = TOKEN_BREAK;
|
|
return tok;
|
|
} else if(!strcmp(content, "continue")) {
|
|
free(content);
|
|
tok.type = TOKEN_CONTINUE;
|
|
return tok;
|
|
} else if(!strcmp(content, "return")) {
|
|
free(content);
|
|
tok.type = TOKEN_RETURN;
|
|
return tok;
|
|
} else if(!strcmp(content, "record")) {
|
|
free(content);
|
|
tok.type = TOKEN_RECORD;
|
|
return tok;
|
|
} else if(!strcmp(content, "as")) {
|
|
free(content);
|
|
tok.type = TOKEN_AS;
|
|
return tok;
|
|
} else if(!strcmp(content, "use")) {
|
|
free(content);
|
|
tok.type = TOKEN_USE;
|
|
return tok;
|
|
}
|
|
|
|
tok.type = TOKEN_IDENTIFIER;
|
|
tok.content = content;
|
|
return tok;
|
|
} else if(isNum(c)) {
|
|
int capacity = 32;
|
|
char *content = malloc(capacity);
|
|
|
|
size_t i = 0;
|
|
content[i++] = c;
|
|
|
|
while(c = nextc(f), isNum(c)) {
|
|
if(i == capacity - 1) {
|
|
content = realloc(content, capacity += 4);
|
|
}
|
|
|
|
content[i++] = c;
|
|
}
|
|
|
|
content[i] = 0;
|
|
|
|
int base = strtol(content, NULL, 10);
|
|
|
|
if(c == 'r') {
|
|
content[i++] = c;
|
|
|
|
while(c = nextc(f), (isNum(c) || (base > 10 && c >= 'A' && c < ('A' + base - 10)))) {
|
|
if(i == 31) {
|
|
stahp_token(&tok, "Numbers have a maximum size of 31.");
|
|
}
|
|
|
|
content[i++] = c;
|
|
}
|
|
}
|
|
|
|
pushc(c, f);
|
|
|
|
tok.type = TOKEN_NUMBER;
|
|
tok.content = content;
|
|
return tok;
|
|
} else if(isWS(c)) {
|
|
int c;
|
|
|
|
while(c = nextc(f), isWS(c)) {
|
|
}
|
|
|
|
pushc(c, f);
|
|
|
|
return nct_tokenize(f);
|
|
}
|
|
|
|
stahp(currentRow, currentColumn, "Invalid character '%c' (byte %i)", c, c);
|
|
}
|
|
|
|
Token *nct_lex(FILE *f) {
|
|
size_t length = 8, index = 0;
|
|
Token *list = malloc(sizeof(*list) * length);
|
|
|
|
while(1) {
|
|
list[index] = nct_tokenize(f);
|
|
|
|
if(list[index].type == TOKEN_EOF) {
|
|
return list;
|
|
}
|
|
|
|
index++;
|
|
|
|
if(index == length) {
|
|
length *= 2;
|
|
list = realloc(list, sizeof(*list) * length);
|
|
}
|
|
}
|
|
|
|
return NULL; /* Doesn't reach here. */
|
|
}
|
|
|
|
/*
 * Frees a token array produced by nct_lex(): the text of every token before
 * the terminating TOKEN_EOF entry, then the array itself.
 * Passing NULL is a no-op, mirroring free(NULL).
 */
void nct_lex_free(Token *tokens) {
	if(!tokens) {
		return;
	}

	for(Token *t = tokens; t->type != TOKEN_EOF; t++) {
		/* free(NULL) is harmless, so tokens without text need no special case. */
		free(t->content);
	}

	free(tokens);
}
|