kuroko/scanner.c

#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "kuroko.h"
#include "scanner.h"

typedef struct {
	const char * start;
	const char * cur;
	const char * linePtr;
	size_t line;
	int startOfLine;
	int hasUnget;
	KrkToken unget;
} KrkScanner;

KrkScanner scanner;

void krk_initScanner(const char * src) {
	scanner.start = src;
	scanner.cur   = src;
	scanner.line  = 1;
	scanner.linePtr   = src;
	scanner.startOfLine = 1;
	scanner.hasUnget = 0;
	/* file, etc. ? */
}

static int isAtEnd() {
	return *scanner.cur == '\0';
}

static void nextLine() {
	scanner.line++;
	scanner.linePtr = scanner.cur;
}

static KrkToken makeToken(KrkTokenType type) {
	return (KrkToken){
		.type = type,
		.start = scanner.start,
		.length = (type == TOKEN_EOL) ? 0 : (size_t)(scanner.cur - scanner.start),
		.line = scanner.line,
		.linePtr = scanner.linePtr,
		.col = (scanner.start - scanner.linePtr) + 1,
	};
}

static KrkToken errorToken(const char * errorStr) {
	return (KrkToken){
		.type = TOKEN_ERROR,
		.start = errorStr,
		.length = strlen(errorStr),
		.line = scanner.line,
		.linePtr = scanner.linePtr,
		.col = (scanner.start - scanner.linePtr) + 1,
	};
}

static char advance() {
	return (*scanner.cur == '\0') ? '\0' : *(scanner.cur++);
}

static int match(char expected) {
	if (isAtEnd()) return 0;
	if (*scanner.cur != expected) return 0;
	scanner.cur++;
	return 1;
}

static char peek() {
	return *scanner.cur;
}

static char peekNext() {
	if (isAtEnd()) return '\0';
	return scanner.cur[1];
}

static void skipWhitespace() {
	for (;;) {
		char c = peek();
		switch (c) {
			case ' ':
			case '\t':
				advance();
				break;
			default:
				return;
		}
	}
}

static KrkToken makeIndentation() {
	while (!isAtEnd() && peek() == ' ') advance();
	if (isAtEnd()) return makeToken(TOKEN_EOF);
	if (peek() == '\n') {
		/* Pretend we didn't see this line */
		return makeToken(TOKEN_INDENTATION);
	}
	if (peek() == '#') {
		KrkToken out = makeToken(TOKEN_INDENTATION);
		while (!isAtEnd() && peek() != '\n') advance();
		return out;
	}
	return makeToken(TOKEN_INDENTATION);
}

static KrkToken string() {
	while (peek() != '"' && !isAtEnd()) {
		if (peek() == '\\') advance(); /* Advance twice */
		if (peek() == '\n') nextLine();
		advance();
	}

	if (isAtEnd()) return errorToken("Unterminated string.");

	assert(peek() == '"');
	advance();

	return makeToken(TOKEN_STRING);
}

static KrkToken codepoint() {
	while (peek() != '\'' && !isAtEnd()) {
		if (peek() == '\\') advance();
		if (peek() == '\n') return makeToken(TOKEN_RETRY);
		advance();
	}

	if (isAtEnd()) return errorToken("Unterminated codepoint literal.");

	assert(peek() == '\'');
	advance();

	return makeToken(TOKEN_CODEPOINT);
}

static int isDigit(char c) {
	return c >= '0' && c <= '9';
}

static KrkToken number(char c) {
	if (c == '0') {
		if (peek() == 'x' || peek() == 'X') {
			/* Hexadecimal */
			advance();
			while (isDigit(peek()) || (peek() >= 'a' && peek() <= 'f') ||
			       (peek() >= 'A' && peek() <= 'F')) advance();
			return makeToken(TOKEN_NUMBER);
		} else if (peek() == 'b' || peek() == 'B') {
			/* Binary */
			advance();
			while (peek() == '0' || peek() == '1') advance();
			return makeToken(TOKEN_NUMBER);
		} if (peek() == 'o' || peek() == 'O') {
			/* Octal - must be 0o, none of those silly 0123 things */
			advance();
			while (peek() >= '0' && peek() <= '7') advance();
			return makeToken(TOKEN_NUMBER);
		}
		/* Otherwise, decimal and maybe 0.123 floating */
	}

	/* Decimal */
	while (isDigit(peek())) advance();

	/* Floating point */
	if (peek() == '.' && isDigit(peekNext())) {
		advance();
		while (isDigit(peek())) advance();
	}

	return makeToken(TOKEN_NUMBER);
}

static int isAlpha(char c) {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c == '_');
}

static int checkKeyword(size_t start, const char * rest, KrkTokenType type) {
	size_t length = strlen(rest);
	if ((size_t)(scanner.cur - scanner.start) == start + length &&
		memcmp(scanner.start + start, rest, length) == 0) return type;
	return TOKEN_IDENTIFIER;
}

static KrkTokenType identifierType() {
#define MORE(i) (scanner.cur - scanner.start > i)
	switch (*scanner.start) {
		case 'a': return checkKeyword(1, "nd", TOKEN_AND);
		case 'c': return checkKeyword(1, "lass", TOKEN_CLASS);
		case 'd': return checkKeyword(1, "ef", TOKEN_DEF);
		case 'e': if (MORE(1)) switch(scanner.start[1]) {
			case 'l': return checkKeyword(2, "se", TOKEN_ELSE);
			case 'x': if MORE(2) switch(scanner.start[2]) {
				case 'p': return checkKeyword(3, "ort", TOKEN_EXPORT);
				case 'c': return checkKeyword(3, "ept", TOKEN_EXCEPT);
			} break;
		} break;
		case 'f': return checkKeyword(1, "or", TOKEN_FOR);
		case 'F': return checkKeyword(1, "alse", TOKEN_FALSE);
		case 'i': if (MORE(1)) switch (scanner.start[1]) {
			case 'f': return checkKeyword(2, "", TOKEN_IF);
			case 'n': return checkKeyword(2, "", TOKEN_IN);
			case 'm': return checkKeyword(2, "port", TOKEN_IMPORT);
		} break;
		case 'l': return checkKeyword(1, "et", TOKEN_LET);
		case 'n': return checkKeyword(1, "ot", TOKEN_NOT);
		case 'N': return checkKeyword(1, "one", TOKEN_NONE);
		case 'o': return checkKeyword(1, "r", TOKEN_OR);
		case 'p': return checkKeyword(1, "rint", TOKEN_PRINT);
		case 'r': if (MORE(1)) switch (scanner.start[1]) {
			case 'e': return checkKeyword(2, "turn", TOKEN_RETURN);
			case 'a': return checkKeyword(2, "ise", TOKEN_RAISE);
		} break;
		case 's': if (MORE(1)) switch(scanner.start[1]) {
			case 'e': return checkKeyword(2, "lf", TOKEN_SELF);
			case 'u': return checkKeyword(2, "per", TOKEN_SUPER);
		} break;
		case 't': return checkKeyword(1, "ry", TOKEN_TRY);
		case 'T': return checkKeyword(1, "rue", TOKEN_TRUE);
		case 'w': return checkKeyword(1, "hile", TOKEN_WHILE);
	}
	return TOKEN_IDENTIFIER;
}

static KrkToken identifier() {
	while (isAlpha(peek()) || isDigit(peek())) advance();

	return makeToken(identifierType());
}

void krk_ungetToken(KrkToken token) {
	if (scanner.hasUnget) {
		fprintf(stderr, "(internal error) Tried to unget multiple times, this is not valid.\n");
		exit(1);
	}
	scanner.hasUnget = 1;
	scanner.unget = token;
}


KrkToken krk_scanToken() {

	if (scanner.hasUnget) {
		scanner.hasUnget = 0;
		return scanner.unget;
	}

	/* If at start of line, do thing */
	if (scanner.startOfLine && peek() == ' ') {
		scanner.start = scanner.cur;
		scanner.startOfLine = 0;
		return makeIndentation();
	}

	/* Eat whitespace */
	skipWhitespace();

	/* Skip comments */
	if (peek() == '#') while (peek() != '\n' && !isAtEnd()) advance();

	scanner.start = scanner.cur;
	if (isAtEnd()) return makeToken(TOKEN_EOF);

	char c = advance();

	if (c == '\n') {
		KrkToken out;
		if (scanner.startOfLine) {
			/* Ignore completely blank lines */
			out = makeToken(TOKEN_RETRY);
		} else {
			scanner.startOfLine = 1;
			out = makeToken(TOKEN_EOL);
		}
		nextLine();
		return out;
	}

	/* Not indentation, not a linefeed on an empty line, must be not be start of line any more */
	scanner.startOfLine = 0;

	if (isAlpha(c)) return identifier();
	if (isDigit(c)) return number(c);

	switch (c) {
		case '(': return makeToken(TOKEN_LEFT_PAREN);
		case ')': return makeToken(TOKEN_RIGHT_PAREN);
		case '{': return makeToken(TOKEN_LEFT_BRACE);
		case '}': return makeToken(TOKEN_RIGHT_BRACE);
		case '[': return makeToken(TOKEN_LEFT_SQUARE);
		case ']': return makeToken(TOKEN_RIGHT_SQUARE);
		case ':': return makeToken(TOKEN_COLON);
		case ',': return makeToken(TOKEN_COMMA);
		case '.': return makeToken(TOKEN_DOT);
		case '-': return makeToken(TOKEN_MINUS);
		case '+': return makeToken(TOKEN_PLUS);
		case ';': return makeToken(TOKEN_SEMICOLON);
		case '/': return makeToken(TOKEN_SOLIDUS);
		case '*': return makeToken(TOKEN_ASTERISK);
		case '%': return makeToken(TOKEN_MODULO);

		case '!': return makeToken(match('=') ? TOKEN_BANG_EQUAL : TOKEN_BANG);
		case '=': return makeToken(match('=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL);
		case '<': return makeToken(match('=') ? TOKEN_LESS_EQUAL : TOKEN_LESS);
		case '>': return makeToken(match('=') ? TOKEN_GREATER_EQUAL : TOKEN_GREATER);

		case '"': return string();
		case '\'': return codepoint();
	}


	return errorToken("Unexpected character.");
}