Add support for \U escape

This commit is contained in:
K. Lange 2021-01-19 13:58:55 +09:00
parent dd22d86abe
commit 87c99d5c8f
6 changed files with 73 additions and 64 deletions

View File

@ -1688,6 +1688,28 @@ static void string(int type) {
stringBytes = GROW_ARRAY(char, stringBytes, old, stringCapacity); \
} stringBytes[stringLength++] = c; } while (0)
/*
 * PUSH_HEX(n, type): decode an n-digit hexadecimal escape and append it to
 * the string being built.
 *
 *   n    - number of hex digits to read; they are taken from c[2] .. c[n+1],
 *          so `c` is expected to point at the start of the escape sequence.
 *          tmpbuf holds 10 chars, so n must stay below 10 for the buffer to
 *          remain NUL-terminated for strtoul().
 *   type - the escape letter, used only in the error messages.
 *
 * Behaviour:
 *   - If the input ends (`end`) or a non-hex character is met before n
 *     digits were read, a "truncated" error is reported and the ENCLOSING
 *     FUNCTION returns immediately.
 *   - A value of 0x110000 or above reports an "invalid codepoint" error.
 *     NOTE(review): unlike the truncated case there is no return here, so
 *     the value is still pushed afterwards - confirm this is intentional.
 *   - When isBytes is set the value is pushed as one char; the `break`
 *     leaves this macro's own do/while(0), not any outer loop or switch.
 *   - Otherwise the value is encoded with krk_codepointToBytes() and every
 *     resulting byte is pushed with PUSH_CHAR.
 *
 * The macro does not advance `c`; the call sites add n to it afterwards.
 * It relies on `c`, `end`, `isBytes`, isHex(), error() and PUSH_CHAR being
 * available where it is expanded.
 */
#define PUSH_HEX(n, type) do { \
char tmpbuf[10] = {0}; \
for (size_t i = 0; i < n; ++i) { \
if (c + i + 2 == end || !isHex(c[i+2])) { \
error("truncated \\%c escape", type); \
return; \
} \
tmpbuf[i] = c[i+2]; \
} \
unsigned long value = strtoul(tmpbuf, NULL, 16); \
if (value >= 0x110000) { \
error("invalid codepoint in \\%c escape", type); \
} \
if (isBytes) { \
PUSH_CHAR(value); \
break; \
} \
unsigned char bytes[5] = {0}; \
size_t len = krk_codepointToBytes(value, bytes); \
for (size_t i = 0; i < len; i++) PUSH_CHAR(bytes[i]); \
} while (0)
int isBytes = (parser.previous.type == TOKEN_PREFIX_B);
if (isBytes && !(match(TOKEN_STRING) || match(TOKEN_BIG_STRING))) {
error("Expected string after 'b' prefix?");
@ -1713,19 +1735,7 @@ static void string(int type) {
case 'v': PUSH_CHAR('\v'); break;
case '[': PUSH_CHAR('\033'); break;
case 'x': {
if (c+2 == end || c+3 == end || !isHex(c[2]) || !isHex(c[3])) {
error("invalid \\x escape");
return;
}
unsigned long value = strtoul((char[]){c[2],c[3],'\0'}, NULL, 16);
if (isBytes) {
PUSH_CHAR(value);
} else if (value > 127) {
PUSH_CHAR((0xC0 | (value >> 6)));
PUSH_CHAR((0x80 | (value & 0x3F)));
} else {
PUSH_CHAR(value);
}
PUSH_HEX(2,'x');
c += 2;
} break;
case 'u': {
@ -1733,30 +1743,19 @@ static void string(int type) {
PUSH_CHAR(c[0]);
PUSH_CHAR(c[1]);
} else {
if (c+2 == end || c+3 == end || !isHex(c[2]) || !isHex(c[3]) ||
c+4 == end || c+5 == end || !isHex(c[4]) || !isHex(c[5])) {
error("truncated \\u escape");
return;
}
unsigned long value = strtoul((char[]){c[2],c[3],c[4],c[5],'\0'}, NULL, 16);
if (value > 0xFFFF) {
PUSH_CHAR((0xF0 | (value >> 18)));
PUSH_CHAR((0x80 | ((value >> 12) & 0x3F)));
PUSH_CHAR((0x80 | ((value >> 6) & 0x3F)));
PUSH_CHAR((0x80 | ((value) & 0x3F)));
} else if (value > 0x7FF) {
PUSH_CHAR((0xE0 | (value >> 12)));
PUSH_CHAR((0x80 | ((value >> 6) & 0x3F)));
PUSH_CHAR((0x80 | (value & 0x3F)));
} else if (value > 0x7F) {
PUSH_CHAR((0xC0 | (value >> 6)));
PUSH_CHAR((0x80 | (value & 0x3F)));
} else {
PUSH_CHAR(value);
}
PUSH_HEX(4,'u');
c += 4;
}
} break;
case 'U': {
if (isBytes) {
PUSH_CHAR(c[0]);
PUSH_CHAR(c[1]);
} else {
PUSH_HEX(8,'U');
c += 8;
}
} break;
case '\n': break;
default:
/* TODO octal */

View File

@ -175,6 +175,8 @@ class KurokoHighlighter(State):
self.paint(4, FLAG_ESCAPE)
else if self.nextchar() == 'u':
self.paint(6, FLAG_ESCAPE)
else if self.nextchar() == 'U':
self.paint(10, FLAG_ESCAPE)
else if self.nextchar() == None:
self.paint(1, FLAG_ESCAPE)
return int(strType == "'") + 3

View File

@ -20,6 +20,28 @@ static KrkObj * allocateObject(size_t size, ObjType type) {
return object;
}
/**
 * Encode a codepoint as a UTF-8 byte sequence.
 *
 * @param value Codepoint to encode.
 * @param out   Destination buffer; up to four bytes are written, and no
 *              terminating NUL is added.
 * @return Number of bytes stored in @p out (1, 2, 3 or 4).
 *
 * Sequence length is picked from the magnitude of @p value:
 *   value <= 0x7F    -> 1 byte
 *   value <= 0x7FF   -> 2 bytes
 *   value <= 0xFFFF  -> 3 bytes
 *   anything larger  -> 4 bytes
 *
 * NOTE(review): no range validation happens here; values that are not
 * valid codepoints are still run through the same bit arithmetic, so
 * callers are presumably expected to validate first - confirm.
 */
size_t krk_codepointToBytes(krk_integer_type value, unsigned char * out) {
	size_t length;
	unsigned char leadMarker;

	/* Choose the sequence length and the marker bits of the lead byte. */
	if (value > 0xFFFF) {
		length = 4;
		leadMarker = 0xF0;
	} else if (value > 0x7FF) {
		length = 3;
		leadMarker = 0xE0;
	} else if (value > 0x7F) {
		length = 2;
		leadMarker = 0xC0;
	} else {
		/* Single-byte case: the value is stored as-is. */
		out[0] = (unsigned char)value;
		return 1;
	}

	/* Fill continuation bytes back to front, six payload bits at a time. */
	krk_integer_type remaining = value;
	for (size_t pos = length - 1; pos > 0; --pos) {
		out[pos] = (unsigned char)(0x80 | (remaining & 0x3F));
		remaining >>= 6;
	}

	/* Whatever is left belongs in the lead byte alongside its marker. */
	out[0] = (unsigned char)(leadMarker | remaining);
	return length;
}
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

View File

@ -202,3 +202,4 @@ extern uint32_t krk_unicodeCodepoint(KrkString * string, size_t index);
extern KrkBytes * krk_newBytes(size_t length, uint8_t * source);
extern void krk_bytesUpdateHash(KrkBytes * bytes);
/**
 * Encode the codepoint `value` as UTF-8 into `out` and return the number of
 * bytes written (1 to 4). `out` must have room for at least 4 bytes; no NUL
 * terminator is written.
 */
extern size_t krk_codepointToBytes(krk_integer_type value, unsigned char * out);

22
rline.c
View File

@ -498,6 +498,14 @@ int c_keyword_qualifier(int c) {
return isalnum(c) || (c == '_');
}
/**
 * Paint a hexadecimal escape sequence inside a string literal.
 *
 * The first two characters (the escape introducer and its type letter) are
 * painted as FLAG_ESCAPE. Each of the following @p n characters is painted
 * as FLAG_ESCAPE when it is a hex digit and as FLAG_DIFFMINUS otherwise.
 *
 * @param state Highlighter state; not named directly in this body.
 *              NOTE(review): paint()/charat() presumably reach it
 *              implicitly - confirm against their definitions.
 * @param n     Number of hex digits the escape is expected to carry.
 */
void paintNHex(struct syntax_state * state, int n) {
	paint(2, FLAG_ESCAPE);
	/* Why is my FLAG_ERROR not valid in rline? */
	for (int i = 0; i < n; ++i) {
		int ch = charat();
		/*
		 * The <ctype.h> classifiers are only defined for EOF and values
		 * representable as unsigned char. charat() is not guaranteed to
		 * stay in that range (NOTE(review): it appears to yield whole
		 * codepoints or a negative end marker - confirm), so restrict
		 * the call to ASCII, which covers every hex digit.
		 */
		int isHexDigit = (ch >= 0 && ch <= 0x7F && isxdigit(ch));
		paint(1, isHexDigit ? FLAG_ESCAPE : FLAG_DIFFMINUS);
	}
}
void paint_krk_string(struct syntax_state * state, int type) {
/* Assumes you came in from a check of charat() == '"' */
paint(1, FLAG_STRING);
@ -509,17 +517,11 @@ void paint_krk_string(struct syntax_state * state, int type) {
return;
} else if (charat() == '\\') {
if (nextchar() == 'x') {
paint(2, FLAG_ESCAPE);
/* Why is my FLAG_ERROR not valid in rline? */
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
paintNHex(state, 2);
} else if (nextchar() == 'u') {
paint(2, FLAG_ESCAPE);
/* Why is my FLAG_ERROR not valid in rline? */
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
paintNHex(state, 4);
} else if (nextchar() == 'U') {
paintNHex(state, 8);
} else {
paint(2, FLAG_ESCAPE);
}

23
vm.c
View File

@ -1595,26 +1595,9 @@ static KrkValue _int_to_floating(int argc, KrkValue argv[]) {
/* int.__chr__() */
/**
 * Build the string holding the UTF-8 encoding of an integer codepoint.
 *
 * argv[0] is read with AS_INTEGER and handed to krk_codepointToBytes(),
 * which produces at most four bytes; the result is copied into a new
 * string object of exactly that length. No range check is performed on
 * the integer here.
 *
 * The encoding is delegated to the shared helper rather than repeated
 * inline, so this function and the compiler's escape handling cannot
 * drift apart.
 */
static KrkValue _int_to_char(int argc, KrkValue argv[]) {
	krk_integer_type value = AS_INTEGER(argv[0]);
	unsigned char bytes[5] = {0};
	size_t len = krk_codepointToBytes(value, bytes);
	return OBJECT_VAL(krk_copyString((char*)bytes,len));
}
/* str.__ord__() */