Allow multibyte UTF-8 character in identifier

This commit is contained in:
Rui Ueyama 2020-05-04 16:53:48 +09:00
parent e4491b8115
commit 0e5d250ebf
4 changed files with 74 additions and 13 deletions

View File

@ -362,6 +362,8 @@ int align_to(int n, int align);
int encode_utf8(char *buf, uint32_t c);
uint32_t decode_utf8(char **new_pos, char *p);
bool is_ident1(uint32_t c);
bool is_ident2(uint32_t c);
//
// main.c

View File

@ -6,6 +6,8 @@ typedef unsigned short char16_t;
typedef unsigned int char32_t;
typedef int wchar_t;
int π = 3;
int main() {
ASSERT(4, sizeof(L'\0'));
ASSERT(97, L'a');
@ -93,6 +95,9 @@ int main() {
ASSERT(L'x', ({ wchar_t x[] = L"🤔x"; x[1]; }));
ASSERT(12, ({ wchar_t x[] = L"🤔x"; sizeof(x); }));
ASSERT(3, π);
ASSERT(3, ({ int β0¾=3; β0¾; }));
printf("OK\n");
return 0;
}

View File

@ -114,14 +114,21 @@ static bool startswith(char *p, char *q) {
return strncmp(p, q, strlen(q)) == 0;
}
// Returns true if c is valid as the first character of an identifier.
static bool is_ident1(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_';
}
// Read an identifier and returns the length of it.
// If p does not point to a valid identifier, 0 is returned.
static int read_ident(char *start) {
char *p = start;
uint32_t c = decode_utf8(&p, p);
if (!is_ident1(c))
return 0;
// Returns true if c is valid as a non-first character of an identifier.
static bool is_ident2(char c) {
return is_ident1(c) || ('0' <= c && c <= '9');
for (;;) {
char *q;
c = decode_utf8(&q, p);
if (!is_ident2(c))
return p - start;
p = q;
}
}
static int from_hex(char c) {
@ -589,12 +596,10 @@ Token *tokenize(File *file) {
}
// Identifier or keyword
if (is_ident1(*p)) {
char *start = p;
do {
p++;
} while (is_ident2(*p));
cur = cur->next = new_token(TK_IDENT, start, p);
int ident_len = read_ident(p);
if (ident_len) {
cur = cur->next = new_token(TK_IDENT, p, p + ident_len);
p += cur->len;
continue;
}

View File

@ -66,3 +66,52 @@ uint32_t decode_utf8(char **new_pos, char *p) {
*new_pos = p + len;
return c;
}
static bool in_range(uint32_t *range, uint32_t c) {
for (int i = 0; range[i] != -1; i += 2)
if (range[i] <= c && c <= range[i + 1])
return true;
return false;
}
// [https://www.sigbus.info/n1570#D] C11 allows not only ASCII but
// some multibyte characters in certan Unicode ranges to be used in an
// identifier.
//
// This function returns true if a given character is acceptable as
// the first character of an identifier.
//
// For example, ¾ (U+00BE) is a valid identifier because characters in
// 0x00BE-0x00C0 are allowed, while neither ⟘ (U+27D8) nor ' '
// (U+3000, full-width space) are allowed because they are out of range.
bool is_ident1(uint32_t c) {
static uint32_t range[] = {
'_', '_', 'a', 'z', 'A', 'Z',
0x00A8, 0x00A8, 0x00AA, 0x00AA, 0x00AD, 0x00AD, 0x00AF, 0x00AF,
0x00B2, 0x00B5, 0x00B7, 0x00BA, 0x00BC, 0x00BE, 0x00C0, 0x00D6,
0x00D8, 0x00F6, 0x00F8, 0x00FF, 0x0100, 0x02FF, 0x0370, 0x167F,
0x1681, 0x180D, 0x180F, 0x1DBF, 0x1E00, 0x1FFF, 0x200B, 0x200D,
0x202A, 0x202E, 0x203F, 0x2040, 0x2054, 0x2054, 0x2060, 0x206F,
0x2070, 0x20CF, 0x2100, 0x218F, 0x2460, 0x24FF, 0x2776, 0x2793,
0x2C00, 0x2DFF, 0x2E80, 0x2FFF, 0x3004, 0x3007, 0x3021, 0x302F,
0x3031, 0x303F, 0x3040, 0xD7FF, 0xF900, 0xFD3D, 0xFD40, 0xFDCF,
0xFDF0, 0xFE1F, 0xFE30, 0xFE44, 0xFE47, 0xFFFD,
0x10000, 0x1FFFD, 0x20000, 0x2FFFD, 0x30000, 0x3FFFD, 0x40000, 0x4FFFD,
0x50000, 0x5FFFD, 0x60000, 0x6FFFD, 0x70000, 0x7FFFD, 0x80000, 0x8FFFD,
0x90000, 0x9FFFD, 0xA0000, 0xAFFFD, 0xB0000, 0xBFFFD, 0xC0000, 0xCFFFD,
0xD0000, 0xDFFFD, 0xE0000, 0xEFFFD, -1,
};
return in_range(range, c);
}
// Returns true if a given character is acceptable as a non-first
// character of an identifier.
bool is_ident2(uint32_t c) {
static uint32_t range[] = {
'0', '9', 0x0300, 0x036F, 0x1DC0, 0x1DFF, 0x20D0, 0x20FF,
0xFE20, 0xFE2F, -1,
};
return is_ident1(c) || in_range(range, c);
}