Debugger: Improvements to C-style expression tokenizer.

- Add recognition of several additional token types that, while not necessary for single-line expressions, will be needed for parsing/highlighting source files. Also rename the open/close paren tokens to reflect their content more accurately, and adjust callers accordingly.
2014-11-28 18:14:34 -05:00 · 2014-11-28 18:14:34 -05:00 · 15758369f2
commit 15758369f2
parent 4671959310
3 changed files with 112 additions and 26 deletions
--- a/src/apps/debugger/source_language/expression_evaluators/CLanguageExpressionEvaluator.cpp
+++ b/src/apps/debugger/source_language/expression_evaluators/CLanguageExpressionEvaluator.cpp
@ -69,11 +69,11 @@ static BString TokenTypeToString(int32 type)
 			token = "**";
 			break;

-		case TOKEN_OPENING_BRACKET:
+		case TOKEN_OPENING_PAREN:
 			token = "(";
 			break;

-		case TOKEN_CLOSING_BRACKET:
+		case TOKEN_CLOSING_PAREN:
 			token = ")";
 			break;

@ -1734,11 +1734,11 @@ CLanguageExpressionEvaluator::_ParseAtom()
 	else {
 		fTokenizer->RewindToken();

-		_EatToken(TOKEN_OPENING_BRACKET);
+		_EatToken(TOKEN_OPENING_PAREN);

 		value = _ParseSum();

-		_EatToken(TOKEN_CLOSING_BRACKET);
+		_EatToken(TOKEN_CLOSING_PAREN);
 	}

 	return value;
@ -1765,8 +1765,8 @@ CLanguageExpressionEvaluator::_EatToken(int32 type)
 			case TOKEN_STAR:
 			case TOKEN_MODULO:
 			case TOKEN_POWER:
-			case TOKEN_OPENING_BRACKET:
-			case TOKEN_CLOSING_BRACKET:
+			case TOKEN_OPENING_PAREN:
+			case TOKEN_CLOSING_PAREN:
 			case TOKEN_LOGICAL_AND:
 			case TOKEN_BITWISE_AND:
 			case TOKEN_LOGICAL_OR:
--- a/src/apps/debugger/source_language/expression_evaluators/CLanguageTokenizer.cpp
+++ b/src/apps/debugger/source_language/expression_evaluators/CLanguageTokenizer.cpp
@ -108,7 +108,7 @@ Tokenizer::NextToken()
 			TOKEN_END_OF_LINE);
 	}

-	bool decimal = *fCurrentChar == '.' || *fCurrentChar == ',';
+	bool decimal = *fCurrentChar == '.';

 	if (decimal || isdigit(*fCurrentChar)) {
 		if (*fCurrentChar == '0' && fCurrentChar[1] == 'x')
@ -124,14 +124,14 @@ Tokenizer::NextToken()
 			fCurrentChar++;
 		}

-		// optional post comma part
-		// (required if there are no digits before the comma)
-		if (*fCurrentChar == '.' || *fCurrentChar == ',') {
+		// optional post decimal part
+		// (required if there are no digits before the decimal)
+		if (*fCurrentChar == '.') {
 			decimal = true;
 			temp << '.';
 			fCurrentChar++;

-			// optional post comma digits
+			// optional post decimal digits
 			while (isdigit(*fCurrentChar)) {
 				temp << *fCurrentChar;
 				fCurrentChar++;
@ -162,15 +162,27 @@ Tokenizer::NextToken()
 			fCurrentToken.value.SetTo(value);
 		else
 			fCurrentToken.value.SetTo((int64)strtoll(temp.String(), NULL, 10));
-	} else if (isalpha(*fCurrentChar)) {
+	} else if (isalpha(*fCurrentChar) || *fCurrentChar == '_') {
 		const char* begin = fCurrentChar;
 		while (*fCurrentChar != 0 && (isalpha(*fCurrentChar)
-			|| isdigit(*fCurrentChar))) {
+			|| isdigit(*fCurrentChar) || *fCurrentChar == '_')) {
 			fCurrentChar++;
 		}
 		int32 length = fCurrentChar - begin;
 		fCurrentToken = Token(begin, length, _CurrentPos() - length,
 			TOKEN_IDENTIFIER);
+	} else if (*fCurrentChar == '"' || *fCurrentChar == '\'') {
+		const char* begin = fCurrentChar++;
+		while (*fCurrentChar != 0) {
+			if (*fCurrentChar == '\\') {
+				if (*(fCurrentChar++) != 0)
+					fCurrentChar++;
+			} else if (*(fCurrentChar++) == *begin)
+				break;
+		}
+		int32 length = fCurrentChar - begin;
+		fCurrentToken = Token(begin, length, _CurrentPos() - length,
+			TOKEN_STRING_LITERAL);
 	} else {
 		if (!_ParseOperator()) {
 			int32 type = TOKEN_NONE;
@ -180,15 +192,48 @@ Tokenizer::NextToken()
 					break;

 				case '(':
-					type = TOKEN_OPENING_BRACKET;
+					type = TOKEN_OPENING_PAREN;
 					break;
 				case ')':
-					type = TOKEN_CLOSING_BRACKET;
+					type = TOKEN_CLOSING_PAREN;
+					break;
+
+				case '[':
+					type = TOKEN_OPENING_SQUARE_BRACKET;
+					break;
+				case ']':
+					type = TOKEN_CLOSING_SQUARE_BRACKET;
+					break;
+
+				case '{':
+					type = TOKEN_OPENING_CURLY_BRACE;
+					break;
+				case '}':
+					type = TOKEN_CLOSING_CURLY_BRACE;
 					break;

 				case '\\':
+					type = TOKEN_BACKSLASH;
+					break;
+
 				case ':':
-					type = TOKEN_SLASH;
+					type = TOKEN_COLON;
+					break;
+
+				case ';':
+					type = TOKEN_SEMICOLON;
+					break;
+
+				case ',':
+					type = TOKEN_COMMA;
+					break;
+
+				case '.':
+					type = TOKEN_PERIOD;
+					break;
+
+				case '#':
+					type = TOKEN_POUND;
 					break;

 				default:
@ -227,18 +272,37 @@ Tokenizer::_ParseOperator()
 			break;

 		case '*':
-			if (_Peek() == '*')  {
-				type = TOKEN_POWER;
-				length = 2;
-			} else {
-				type = TOKEN_STAR;
-				length = 1;
+			switch (_Peek()) {
+				case '*':
+					type = TOKEN_POWER;
+					length = 2;
+					break;
+				case '/':
+					type = TOKEN_END_COMMENT_BLOCK;
+					length = 2;
+					break;
+				default:
+					type = TOKEN_STAR;
+					length = 1;
+					break;
 			}
 			break;

 		case '/':
-			type = TOKEN_SLASH;
-			length = 1;
+			switch (_Peek()) {
+				case '*':
+					type = TOKEN_BEGIN_COMMENT_BLOCK;
+					length = 2;
+					break;
+				case '/':
+					type = TOKEN_INLINE_COMMENT;
+					length = 2;
+					break;
+				default:
+					type = TOKEN_SLASH;
+					length = 1;
+					break;
+			}
 			break;

 		case '%':
@ -285,6 +349,9 @@ Tokenizer::_ParseOperator()
 			if (_Peek() == '=') {
 				type = TOKEN_EQ;
 				length = 2;
+			} else {
+				type = TOKEN_ASSIGN;
+				length = 1;
 			}
 			break;

--- a/src/apps/debugger/source_language/expression_evaluators/CLanguageTokenizer.h
+++ b/src/apps/debugger/source_language/expression_evaluators/CLanguageTokenizer.h
@ -35,9 +35,16 @@ enum {

 	TOKEN_POWER,

-	TOKEN_OPENING_BRACKET,
-	TOKEN_CLOSING_BRACKET,
+	TOKEN_OPENING_PAREN,
+	TOKEN_CLOSING_PAREN,

+	TOKEN_OPENING_SQUARE_BRACKET,
+	TOKEN_CLOSING_SQUARE_BRACKET,
+
+	TOKEN_OPENING_CURLY_BRACE,
+	TOKEN_CLOSING_CURLY_BRACE,
+
+	TOKEN_ASSIGN,
 	TOKEN_LOGICAL_AND,
 	TOKEN_LOGICAL_OR,
 	TOKEN_LOGICAL_NOT,
@ -52,6 +59,18 @@ enum {
 	TOKEN_LT,
 	TOKEN_LE,

+	TOKEN_BACKSLASH,
+	TOKEN_COLON,
+	TOKEN_SEMICOLON,
+	TOKEN_COMMA,
+	TOKEN_PERIOD,
+	TOKEN_POUND,
+
+	TOKEN_STRING_LITERAL,
+	TOKEN_BEGIN_COMMENT_BLOCK,
+	TOKEN_END_COMMENT_BLOCK,
+	TOKEN_INLINE_COMMENT,
+
 	TOKEN_MEMBER_PTR
 };