Add support for \U escape
This commit is contained in:
parent
dd22d86abe
commit
87c99d5c8f
67
compiler.c
67
compiler.c
@ -1688,6 +1688,28 @@ static void string(int type) {
|
||||
stringBytes = GROW_ARRAY(char, stringBytes, old, stringCapacity); \
|
||||
} stringBytes[stringLength++] = c; } while (0)
|
||||
|
||||
/*
 * PUSH_HEX(n, type): decode an n-digit hexadecimal escape and append it to
 * the string being built.
 *
 *   n     number of hex digits expected after the two-character escape
 *         introducer at c[0], c[1]; must fit in tmpbuf (callers pass 2, 4, 8).
 *   type  escape letter, used only in error messages.
 *
 * Relies on names from the enclosing function: `c` (current position),
 * `end` (end of input), `isBytes`, `isHex`, `error` and PUSH_CHAR.
 *
 * On a missing / non-hex digit, or a value of 0x110000 or above, an error is
 * reported and the enclosing function returns; nothing is appended.
 * For bytes literals the value is pushed as a single raw byte and the
 * `break` leaves this do/while(0); otherwise the value is encoded through
 * krk_codepointToBytes and each resulting byte is pushed.
 * The caller is responsible for advancing `c` past the digits afterwards.
 */
#define PUSH_HEX(n, type) do { \
	char tmpbuf[10] = {0}; \
	for (size_t i = 0; i < (n); ++i) { \
		if (c + i + 2 == end || !isHex(c[i+2])) { \
			error("truncated \\%c escape", type); \
			return; \
		} \
		tmpbuf[i] = c[i+2]; \
	} \
	unsigned long value = strtoul(tmpbuf, NULL, 16); \
	if (value >= 0x110000) { \
		error("invalid codepoint in \\%c escape", type); \
		return; \
	} \
	if (isBytes) { \
		PUSH_CHAR(value); \
		break; \
	} \
	unsigned char bytes[5] = {0}; \
	size_t len = krk_codepointToBytes(value, bytes); \
	for (size_t i = 0; i < len; i++) PUSH_CHAR(bytes[i]); \
} while (0)
|
||||
|
||||
int isBytes = (parser.previous.type == TOKEN_PREFIX_B);
|
||||
if (isBytes && !(match(TOKEN_STRING) || match(TOKEN_BIG_STRING))) {
|
||||
error("Expected string after 'b' prefix?");
|
||||
@ -1713,19 +1735,7 @@ static void string(int type) {
|
||||
case 'v': PUSH_CHAR('\v'); break;
|
||||
case '[': PUSH_CHAR('\033'); break;
|
||||
case 'x': {
|
||||
if (c+2 == end || c+3 == end || !isHex(c[2]) || !isHex(c[3])) {
|
||||
error("invalid \\x escape");
|
||||
return;
|
||||
}
|
||||
unsigned long value = strtoul((char[]){c[2],c[3],'\0'}, NULL, 16);
|
||||
if (isBytes) {
|
||||
PUSH_CHAR(value);
|
||||
} else if (value > 127) {
|
||||
PUSH_CHAR((0xC0 | (value >> 6)));
|
||||
PUSH_CHAR((0x80 | (value & 0x3F)));
|
||||
} else {
|
||||
PUSH_CHAR(value);
|
||||
}
|
||||
PUSH_HEX(2,'x');
|
||||
c += 2;
|
||||
} break;
|
||||
case 'u': {
|
||||
@ -1733,30 +1743,19 @@ static void string(int type) {
|
||||
PUSH_CHAR(c[0]);
|
||||
PUSH_CHAR(c[1]);
|
||||
} else {
|
||||
if (c+2 == end || c+3 == end || !isHex(c[2]) || !isHex(c[3]) ||
|
||||
c+4 == end || c+5 == end || !isHex(c[4]) || !isHex(c[5])) {
|
||||
error("truncated \\u escape");
|
||||
return;
|
||||
}
|
||||
unsigned long value = strtoul((char[]){c[2],c[3],c[4],c[5],'\0'}, NULL, 16);
|
||||
if (value > 0xFFFF) {
|
||||
PUSH_CHAR((0xF0 | (value >> 18)));
|
||||
PUSH_CHAR((0x80 | ((value >> 12) & 0x3F)));
|
||||
PUSH_CHAR((0x80 | ((value >> 6) & 0x3F)));
|
||||
PUSH_CHAR((0x80 | ((value) & 0x3F)));
|
||||
} else if (value > 0x7FF) {
|
||||
PUSH_CHAR((0xE0 | (value >> 12)));
|
||||
PUSH_CHAR((0x80 | ((value >> 6) & 0x3F)));
|
||||
PUSH_CHAR((0x80 | (value & 0x3F)));
|
||||
} else if (value > 0x7F) {
|
||||
PUSH_CHAR((0xC0 | (value >> 6)));
|
||||
PUSH_CHAR((0x80 | (value & 0x3F)));
|
||||
} else {
|
||||
PUSH_CHAR(value);
|
||||
}
|
||||
PUSH_HEX(4,'u');
|
||||
c += 4;
|
||||
}
|
||||
} break;
|
||||
case 'U': {
|
||||
if (isBytes) {
|
||||
PUSH_CHAR(c[0]);
|
||||
PUSH_CHAR(c[1]);
|
||||
} else {
|
||||
PUSH_HEX(8,'U');
|
||||
c += 8;
|
||||
}
|
||||
} break;
|
||||
case '\n': break;
|
||||
default:
|
||||
/* TODO octal */
|
||||
|
@ -175,6 +175,8 @@ class KurokoHighlighter(State):
|
||||
self.paint(4, FLAG_ESCAPE)
|
||||
else if self.nextchar() == 'u':
|
||||
self.paint(6, FLAG_ESCAPE)
|
||||
else if self.nextchar() == 'U':
|
||||
self.paint(10, FLAG_ESCAPE)
|
||||
else if self.nextchar() == None:
|
||||
self.paint(1, FLAG_ESCAPE)
|
||||
return int(strType == "'") + 3
|
||||
|
22
object.c
22
object.c
@ -20,6 +20,28 @@ static KrkObj * allocateObject(size_t size, ObjType type) {
|
||||
return object;
|
||||
}
|
||||
|
||||
/**
 * Encode `value` as a UTF-8 style byte sequence.
 *
 * Writes between one and four bytes to `out` and returns how many were
 * written. The length is picked purely from the magnitude of `value`
 * (<= 0x7F, <= 0x7FF, <= 0xFFFF, anything larger); no range validation is
 * performed here, so callers must reject values they consider invalid.
 * `out` must have room for at least four bytes.
 */
size_t krk_codepointToBytes(krk_integer_type value, unsigned char * out) {
	/* Single byte: stored as-is. */
	if (value <= 0x7F) {
		out[0] = (unsigned char)value;
		return 1;
	}

	/* Two bytes: 110xxxxx 10xxxxxx */
	if (value <= 0x7FF) {
		out[0] = (0xC0 | (value >> 6));
		out[1] = (0x80 | (value & 0x3F));
		return 2;
	}

	/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	if (value <= 0xFFFF) {
		out[0] = (0xE0 | (value >> 12));
		out[1] = (0x80 | ((value >> 6) & 0x3F));
		out[2] = (0x80 | (value & 0x3F));
		return 3;
	}

	/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	out[0] = (0xF0 | (value >> 18));
	out[1] = (0x80 | ((value >> 12) & 0x3F));
	out[2] = (0x80 | ((value >> 6) & 0x3F));
	out[3] = (0x80 | ((value) & 0x3F));
	return 4;
}
|
||||
|
||||
#define UTF8_ACCEPT 0
|
||||
#define UTF8_REJECT 1
|
||||
|
||||
|
1
object.h
1
object.h
@ -202,3 +202,4 @@ extern uint32_t krk_unicodeCodepoint(KrkString * string, size_t index);
|
||||
|
||||
extern KrkBytes * krk_newBytes(size_t length, uint8_t * source);
|
||||
extern void krk_bytesUpdateHash(KrkBytes * bytes);
|
||||
/* Encodes `value` into `out` as a UTF-8 style sequence of 1 to 4 bytes and
 * returns the number of bytes written; `out` must hold at least 4 bytes. */
extern size_t krk_codepointToBytes(krk_integer_type value, unsigned char * out);
|
||||
|
22
rline.c
22
rline.c
@ -498,6 +498,14 @@ int c_keyword_qualifier(int c) {
|
||||
return isalnum(c) || (c == '_');
|
||||
}
|
||||
|
||||
void paintNHex(struct syntax_state * state, int n) {
|
||||
paint(2, FLAG_ESCAPE);
|
||||
/* Why is my FLAG_ERROR not valid in rline? */
|
||||
for (int i = 0; i < n; ++i) {
|
||||
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
|
||||
}
|
||||
}
|
||||
|
||||
void paint_krk_string(struct syntax_state * state, int type) {
|
||||
/* Assumes you came in from a check of charat() == '"' */
|
||||
paint(1, FLAG_STRING);
|
||||
@ -509,17 +517,11 @@ void paint_krk_string(struct syntax_state * state, int type) {
|
||||
return;
|
||||
} else if (charat() == '\\') {
|
||||
if (nextchar() == 'x') {
|
||||
paint(2, FLAG_ESCAPE);
|
||||
/* Why is my FLAG_ERROR not valid in rline? */
|
||||
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
|
||||
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
|
||||
paintNHex(state, 2);
|
||||
} else if (nextchar() == 'u') {
|
||||
paint(2, FLAG_ESCAPE);
|
||||
/* Why is my FLAG_ERROR not valid in rline? */
|
||||
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
|
||||
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
|
||||
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
|
||||
paint(1, isxdigit(charat()) ? FLAG_ESCAPE : FLAG_DIFFMINUS);
|
||||
paintNHex(state, 4);
|
||||
} else if (nextchar() == 'U') {
|
||||
paintNHex(state, 8);
|
||||
} else {
|
||||
paint(2, FLAG_ESCAPE);
|
||||
}
|
||||
|
23
vm.c
23
vm.c
@ -1595,26 +1595,9 @@ static KrkValue _int_to_floating(int argc, KrkValue argv[]) {
|
||||
/* int.__chr__() */
|
||||
static KrkValue _int_to_char(int argc, KrkValue argv[]) {
	/* The receiver (argv[0]) is the integer to convert; argc is not consulted. */
	krk_integer_type value = AS_INTEGER(argv[0]);
	/* The encoder writes at most four bytes; the buffer is zero-filled. */
	unsigned char bytes[5] = {0};
	/* Use the shared encoder instead of a private copy of the same
	 * if/else chain, so there is a single implementation to maintain. */
	size_t len = krk_codepointToBytes(value, bytes);
	return OBJECT_VAL(krk_copyString((char*)bytes,len));
}
|
||||
|
||||
/* str.__ord__() */
|
||||
|
Loading…
Reference in New Issue
Block a user