nctref/src/lexer.c

#include"lexer.h"
#include<stdlib.h>
#include<assert.h>
#include<string.h>
#include"reporting.h"

// Display names of the tokens, one entry per token kind.
// The entries must stay in the same order as the TokenKind enum in
// src/lexer.h (presumably so that a TokenKind value can index this table).
// Punctuation and keywords are shown quoted; token classes (identifier,
// number, string, EOF) are shown bare.
char *TOKEN_NAMES[] = {
	"identifier",
	"'local'",
	"EOF",
	"number",
	"';'",
	"':'",
	"'if'",
	"'('",
	"')'",
	"'{'",
	"'}'",
	"'='",
	"'+'",
	"'-'",
	"'*'",
	"'/'",
	"'extern'",
	"'loop'",
	"'break'",
	"','",
	"'&'",
	"'|'",
	"'^'",
	"'~'",
	"'=='",
	"'['",
	"']'",
	"'?'",
	"string",
	"'!='",
	"'!'",
	"'continue'",
	"'return'",
	"'->'",
	"'<='",
	"'>='",
	"'<'",
	"'>'",
	"'*^'",
	"'record'",
	"'.'",
	"'as'",
	"'use'",
};

/* Returns 1 when c is an ASCII letter (A-Z or a-z), 0 otherwise. */
static int isAlpha(int c) {
	if(c >= 'A' && c <= 'Z') {
		return 1;
	}

	return c >= 'a' && c <= 'z';
}

/* Returns 1 when c is an ASCII decimal digit (0-9), 0 otherwise. */
static int isNum(int c) {
	return !(c < '0' || c > '9');
}

/* Returns 1 when c is an ASCII letter or decimal digit, 0 otherwise. */
static int isAlphanum(int c) {
	if(isAlpha(c)) {
		return 1;
	}

	return isNum(c) ? 1 : 0;
}

/*
 * Returns 1 when c is one of the characters the lexer skips between tokens:
 * space, newline, carriage return, backspace or horizontal tab.
 */
static int isWS(int c) {
	switch(c) {
	case ' ':
	case '\n':
	case '\r':
	case '\b':
	case '\t':
		return 1;
	default:
		return 0;
	}
}

/* Row of the read position; starts at 1 and is bumped by nextc() on every '\n'. */
static size_t currentRow = 1;
/* Count of characters nextc() has read on the current row; reset to 0 on '\n'. */
static size_t currentColumn = 0;
/* One-character pushback slot shared by pushc() and nextc(); EOF means "empty". */
static int ungetted = EOF;

/*
 * Returns the next character of the input.
 *
 * A character previously handed to pushc() is returned first (and the
 * pushback slot is emptied) without touching the position counters, since it
 * was already counted when it was first read. Otherwise a character is read
 * from f: a newline advances currentRow and resets currentColumn, any other
 * non-EOF character advances currentColumn. EOF is passed through unchanged.
 */
int nextc(FILE *f) {
	int ch;

	if(ungetted == EOF) {
		ch = fgetc(f);

		if(ch == '\n') {
			currentRow += 1;
			currentColumn = 0;
		} else if(ch != EOF) {
			currentColumn += 1;
		}

		return ch;
	}

	ch = ungetted;
	ungetted = EOF;
	return ch;
}

/*
 * Hands c back to the lexer so that the next nextc() call returns it again.
 * Only one character can be pending at a time; a second push overwrites the
 * first. The stream argument is accepted but not used.
 */
void pushc(int c, FILE *f) {
	(void) f;
	ungetted = c;
}

Token nct_tokenize(FILE *f) {
	Token tok;
	tok.content = NULL;
	tok.row = currentRow;
	tok.column = currentColumn;

	int c = nextc(f);

	if(c == EOF) {
		tok.type = TOKEN_EOF;
		return tok;
	}

	if(c == ';') {
		tok.type = TOKEN_SEMICOLON;
		return tok;
	} else if(c == ':') {
		tok.type = TOKEN_COLON;
		return tok;
	} else if(c == '(') {
		tok.type = TOKEN_PAREN_L;
		return tok;
	} else if(c == ')') {
		tok.type = TOKEN_PAREN_R;
		return tok;
	} else if(c == '{') {
		tok.type = TOKEN_SQUIGGLY_L;
		return tok;
	} else if(c == '}') {
		tok.type = TOKEN_SQUIGGLY_R;
		return tok;
	} else if(c == '+') {
		tok.type = TOKEN_PLUS;
		return tok;
	} else if(c == '-') {
		tok.type = TOKEN_MINUS;
		int c = nextc(f);
		if(c == '>') {
			tok.type = TOKEN_ARROW;
		} else ungetc(c, f);
		return tok;
	} else if(c == '*') {
		tok.type = TOKEN_STAR;
		int c = nextc(f);
		if(c == '^') {
			tok.type = TOKEN_STAR_CARET;
		} else ungetc(c, f);
		return tok;
	} else if(c == '&') {
		tok.type = TOKEN_AMPERSAND;
		return tok;
	} else if(c == '|') {
		tok.type = TOKEN_VERTICAL_BAR;
		return tok;
	} else if(c == '^') {
		tok.type = TOKEN_CARET;
		return tok;
	} else if(c == '~') {
		tok.type = TOKEN_TILDE;
		return tok;
	} else if(c == '[') {
		tok.type = TOKEN_SQUAREN_L;
		return tok;
	} else if(c == ']') {
		tok.type = TOKEN_SQUAREN_R;
		return tok;
	} else if(c == '?') {
		tok.type = TOKEN_QUESTION_MARK;
		return tok;
	} else if(c == '!') {
		tok.type = TOKEN_EXCLAMATION;
		int c = nextc(f);
		if(c == '=') {
			tok.type = TOKEN_EXCLAMATION_EQUALS;
		} else ungetc(c, f);
		return tok;
	} else if(c == '<') {
		tok.type = TOKEN_LESS;
		int c = nextc(f);
		if(c == '=') {
			tok.type = TOKEN_LEQUAL;
		} else ungetc(c, f);
		return tok;
	} else if(c == '>') {
		tok.type = TOKEN_GREATER;
		int c = nextc(f);
		if(c == '=') {
			tok.type = TOKEN_GEQUAL;
		} else ungetc(c, f);
		return tok;
	} else if(c == '/') {
		int c = nextc(f);
		if(c == '*') { /* This is a comment; skip. */
			while(1) {
				while((c = nextc(f)) != '*');
				if(nextc(f) == '/') {
					return nct_tokenize(f);
				}
			}
		} else {
			ungetc(c, f);
			tok.type = TOKEN_SLASH;
			return tok;
		}
	} else if(c == '=') {
		tok.type = TOKEN_EQUALS;
		int c = nextc(f);
		if(c == '=') {
			tok.type = TOKEN_DOUBLE_EQUALS;
		} else ungetc(c, f);
		return tok;
	} else if(c == ',') {
		tok.type = TOKEN_COMMA;
		return tok;
	} else if(c == '.') {
		tok.type = TOKEN_DOT;
		return tok;
	} else if(c == '"') {
		int capacity = 5;
		char *content = malloc(capacity);

		size_t i = 0;
		int c;
		while(c = nextc(f), c != '"') {
			if(i == capacity - 1) {
				content = realloc(content, capacity += 4);
			}

			if(c == '\\') {
				c = nextc(f);

				if(c == '0') c = 0;
				else if(c == 'n') c = '\n';
				else if(c == 'r') c = '\r';
				else if(c == 't') c = '\t';
			}

			content[i++] = c;
		}

		content[i] = 0;

		tok.type = TOKEN_STRING;
		tok.content = content;
		tok.length = i;
		return tok;
	} else if(isAlpha(c) || c == '@' || c == '_') {
		int capacity = 5;
		char *content = malloc(capacity);

		size_t i = 0;
		content[i++] = c;

		while(c = nextc(f), (isAlphanum(c) || c == '@' || c == '_')) {
			if(i == capacity - 1) {
				content = realloc(content, capacity += 4);
			}

			content[i++] = c;
		}

		pushc(c, f);

		content[i] = 0;
		if(!strcmp(content, "local")) {
			free(content);
			tok.type = TOKEN_LOCAL;
			return tok;
		} else if(!strcmp(content, "if")) {
			free(content);
			tok.type = TOKEN_IF;
			return tok;
		} else if(!strcmp(content, "extern")) {
			free(content);
			tok.type = TOKEN_EXTERN;
			return tok;
		} else if(!strcmp(content, "loop")) {
			free(content);
			tok.type = TOKEN_LOOP;
			return tok;
		} else if(!strcmp(content, "break")) {
			free(content);
			tok.type = TOKEN_BREAK;
			return tok;
		} else if(!strcmp(content, "continue")) {
			free(content);
			tok.type = TOKEN_CONTINUE;
			return tok;
		} else if(!strcmp(content, "return")) {
			free(content);
			tok.type = TOKEN_RETURN;
			return tok;
		} else if(!strcmp(content, "record")) {
			free(content);
			tok.type = TOKEN_RECORD;
			return tok;
		} else if(!strcmp(content, "as")) {
			free(content);
			tok.type = TOKEN_AS;
			return tok;
		} else if(!strcmp(content, "use")) {
			free(content);
			tok.type = TOKEN_USE;
			return tok;
		}

		tok.type = TOKEN_IDENTIFIER;
		tok.content = content;
		return tok;
	} else if(isNum(c)) {
		int capacity = 32;
		char *content = malloc(capacity);

		size_t i = 0;
		content[i++] = c;

		while(c = nextc(f), isNum(c)) {
			if(i == capacity - 1) {
				content = realloc(content, capacity += 4);
			}

			content[i++] = c;
		}

		content[i] = 0;

		int base = strtol(content, NULL, 10);

		if(c == 'r') {
			content[i++] = c;

			while(c = nextc(f), (isNum(c) || (base > 10 && c >= 'A' && c < ('A' + base - 10)))) {
				if(i == 31) {
					stahp_token(&tok, "Numbers have a maximum size of 31.");
				}

				content[i++] = c;
			}
		}

		pushc(c, f);

		tok.type = TOKEN_NUMBER;
		tok.content = content;
		return tok;
	} else if(isWS(c)) {
		int c;

		while(c = nextc(f), isWS(c)) {
		}

		pushc(c, f);

		return nct_tokenize(f);
	}

	stahp(currentRow, currentColumn, "Invalid character '%c' (byte %i)", c, c);
}

Token *nct_lex(FILE *f) {
	size_t length = 8, index = 0;
	Token *list = malloc(sizeof(*list) * length);

	currentRow = 1;
	currentColumn = 0;

	while(1) {
		list[index] = nct_tokenize(f);

		if(list[index].type == TOKEN_EOF) {
			return list;
		}

		index++;

		if(index == length) {
			length *= 2;
			list = realloc(list, sizeof(*list) * length);
		}
	}

	return NULL; /* Doesn't reach here. */
}

/*
 * Releases a token array whose last element is a TOKEN_EOF token, together
 * with the text owned by each token before that terminator.
 * Passing NULL is a no-op, mirroring free().
 */
void nct_lex_free(Token *tokens) {
	if(!tokens) {
		return;
	}

	for(Token *t = tokens; t->type != TOKEN_EOF; t++) {
		/* free(NULL) is defined to do nothing, so tokens without text need no guard. */
		free(t->content);
	}

	free(tokens);
}