mirror of
https://github.com/rui314/chibicc
synced 2024-11-22 14:21:18 +03:00
Allow multibyte UTF-8 character in identifier
This commit is contained in:
parent
e4491b8115
commit
0e5d250ebf
@ -362,6 +362,8 @@ int align_to(int n, int align);
|
||||
|
||||
int encode_utf8(char *buf, uint32_t c);
|
||||
uint32_t decode_utf8(char **new_pos, char *p);
|
||||
bool is_ident1(uint32_t c);
|
||||
bool is_ident2(uint32_t c);
|
||||
|
||||
//
|
||||
// main.c
|
||||
|
@ -6,6 +6,8 @@ typedef unsigned short char16_t;
|
||||
typedef unsigned int char32_t;
|
||||
typedef int wchar_t;
|
||||
|
||||
int π = 3;
|
||||
|
||||
int main() {
|
||||
ASSERT(4, sizeof(L'\0'));
|
||||
ASSERT(97, L'a');
|
||||
@ -93,6 +95,9 @@ int main() {
|
||||
ASSERT(L'x', ({ wchar_t x[] = L"🤔x"; x[1]; }));
|
||||
ASSERT(12, ({ wchar_t x[] = L"🤔x"; sizeof(x); }));
|
||||
|
||||
ASSERT(3, π);
|
||||
ASSERT(3, ({ int あβ0¾=3; あβ0¾; }));
|
||||
|
||||
printf("OK\n");
|
||||
return 0;
|
||||
}
|
||||
|
31
tokenize.c
31
tokenize.c
@ -114,14 +114,21 @@ static bool startswith(char *p, char *q) {
|
||||
return strncmp(p, q, strlen(q)) == 0;
|
||||
}
|
||||
|
||||
// Returns true if c is valid as the first character of an identifier.
|
||||
static bool is_ident1(char c) {
|
||||
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_';
|
||||
}
|
||||
// Read an identifier and returns the length of it.
|
||||
// If p does not point to a valid identifier, 0 is returned.
|
||||
static int read_ident(char *start) {
|
||||
char *p = start;
|
||||
uint32_t c = decode_utf8(&p, p);
|
||||
if (!is_ident1(c))
|
||||
return 0;
|
||||
|
||||
// Returns true if c is valid as a non-first character of an identifier.
|
||||
static bool is_ident2(char c) {
|
||||
return is_ident1(c) || ('0' <= c && c <= '9');
|
||||
for (;;) {
|
||||
char *q;
|
||||
c = decode_utf8(&q, p);
|
||||
if (!is_ident2(c))
|
||||
return p - start;
|
||||
p = q;
|
||||
}
|
||||
}
|
||||
|
||||
static int from_hex(char c) {
|
||||
@ -589,12 +596,10 @@ Token *tokenize(File *file) {
|
||||
}
|
||||
|
||||
// Identifier or keyword
|
||||
if (is_ident1(*p)) {
|
||||
char *start = p;
|
||||
do {
|
||||
p++;
|
||||
} while (is_ident2(*p));
|
||||
cur = cur->next = new_token(TK_IDENT, start, p);
|
||||
int ident_len = read_ident(p);
|
||||
if (ident_len) {
|
||||
cur = cur->next = new_token(TK_IDENT, p, p + ident_len);
|
||||
p += cur->len;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
49
unicode.c
49
unicode.c
@ -66,3 +66,52 @@ uint32_t decode_utf8(char **new_pos, char *p) {
|
||||
*new_pos = p + len;
|
||||
return c;
|
||||
}
|
||||
|
||||
static bool in_range(uint32_t *range, uint32_t c) {
|
||||
for (int i = 0; range[i] != -1; i += 2)
|
||||
if (range[i] <= c && c <= range[i + 1])
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// [https://www.sigbus.info/n1570#D] C11 allows not only ASCII but
|
||||
// some multibyte characters in certan Unicode ranges to be used in an
|
||||
// identifier.
|
||||
//
|
||||
// This function returns true if a given character is acceptable as
|
||||
// the first character of an identifier.
|
||||
//
|
||||
// For example, ¾ (U+00BE) is a valid identifier because characters in
|
||||
// 0x00BE-0x00C0 are allowed, while neither ⟘ (U+27D8) nor ' '
|
||||
// (U+3000, full-width space) are allowed because they are out of range.
|
||||
bool is_ident1(uint32_t c) {
|
||||
static uint32_t range[] = {
|
||||
'_', '_', 'a', 'z', 'A', 'Z',
|
||||
0x00A8, 0x00A8, 0x00AA, 0x00AA, 0x00AD, 0x00AD, 0x00AF, 0x00AF,
|
||||
0x00B2, 0x00B5, 0x00B7, 0x00BA, 0x00BC, 0x00BE, 0x00C0, 0x00D6,
|
||||
0x00D8, 0x00F6, 0x00F8, 0x00FF, 0x0100, 0x02FF, 0x0370, 0x167F,
|
||||
0x1681, 0x180D, 0x180F, 0x1DBF, 0x1E00, 0x1FFF, 0x200B, 0x200D,
|
||||
0x202A, 0x202E, 0x203F, 0x2040, 0x2054, 0x2054, 0x2060, 0x206F,
|
||||
0x2070, 0x20CF, 0x2100, 0x218F, 0x2460, 0x24FF, 0x2776, 0x2793,
|
||||
0x2C00, 0x2DFF, 0x2E80, 0x2FFF, 0x3004, 0x3007, 0x3021, 0x302F,
|
||||
0x3031, 0x303F, 0x3040, 0xD7FF, 0xF900, 0xFD3D, 0xFD40, 0xFDCF,
|
||||
0xFDF0, 0xFE1F, 0xFE30, 0xFE44, 0xFE47, 0xFFFD,
|
||||
0x10000, 0x1FFFD, 0x20000, 0x2FFFD, 0x30000, 0x3FFFD, 0x40000, 0x4FFFD,
|
||||
0x50000, 0x5FFFD, 0x60000, 0x6FFFD, 0x70000, 0x7FFFD, 0x80000, 0x8FFFD,
|
||||
0x90000, 0x9FFFD, 0xA0000, 0xAFFFD, 0xB0000, 0xBFFFD, 0xC0000, 0xCFFFD,
|
||||
0xD0000, 0xDFFFD, 0xE0000, 0xEFFFD, -1,
|
||||
};
|
||||
|
||||
return in_range(range, c);
|
||||
}
|
||||
|
||||
// Returns true if a given character is acceptable as a non-first
|
||||
// character of an identifier.
|
||||
bool is_ident2(uint32_t c) {
|
||||
static uint32_t range[] = {
|
||||
'0', '9', 0x0300, 0x036F, 0x1DC0, 0x1DFF, 0x20D0, 0x20FF,
|
||||
0xFE20, 0xFE2F, -1,
|
||||
};
|
||||
|
||||
return is_ident1(c) || in_range(range, c);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user