Skip UTF-8 BOM markers

This commit is contained in:
Rui Ueyama 2020-10-15 14:21:45 +09:00
parent 238277714d
commit 2b2fa25507
2 changed files with 11 additions and 0 deletions

View File

@ -109,4 +109,8 @@ $chibicc -c -O -Wall -g -std=c11 -ffreestanding -fno-builtin \
-m64 -mno-red-zone -w -o /dev/null $tmp/empty.c
check 'ignored options'
# BOM marker: feed input that begins with the UTF-8 byte-order mark
# (bytes EF BB BF) and expect the output line to start with "xyz",
# i.e. the three marker bytes must not appear in front of it.
# Octal escapes are used because POSIX printf does not define "\x"
# hex escapes; octal produces the same bytes in every shell.
printf '\357\273\277xyz\n' | $chibicc -E -o- - | grep -q '^xyz'
check 'BOM marker'
echo OK

View File

@ -772,6 +772,13 @@ Token *tokenize_file(char *path) {
if (!p)
return NULL;
// UTF-8 texts may start with a 3-byte "BOM" marker sequence.
// If it exists, just skip it because those bytes are useless.
// (Adding a BOM marker to UTF-8 text is actually not recommended,
// but it's not uncommon, particularly on Windows.)
if (!memcmp(p, "\xef\xbb\xbf", 3))
p += 3;
canonicalize_newline(p);
remove_backslash_newline(p);
convert_universal_chars(p);