Major overhaul of strings to support Unicode
This commit is contained in:
parent
76b0b50154
commit
10f3c16e40
60
README.md
60
README.md
@ -136,12 +136,16 @@ Strings can be defined with single, double, or Python-style _triple quotes_, the
|
||||
|
||||
The following escape sequences can be embedded in string literals:
|
||||
|
||||
- `\a`: bell
|
||||
- `\b`: backspace
|
||||
- `\f`: formfeed
|
||||
- `\n`: linefeed
|
||||
- `\r`: carriage return
|
||||
- `\t`: horizontal tab
|
||||
- `\v`: vertical tab
|
||||
- `\[`: ANSI escape value (decimal value 27)
|
||||
|
||||
A backslash followed by another character, such as the quoting character used to define the string or another backslash character, will be taken literally.
|
||||
- `\x`: A two-character hexadecimal sequence
|
||||
- `\u`: A four-character hexadecimal sequence
|
||||
|
||||
Strings in Kuroko are immutable; they can not be modified in-place.
|
||||
|
||||
@ -152,7 +156,55 @@ print("Hello, " + 42 + "!")
|
||||
# → Hello, 42!
|
||||
```
|
||||
|
||||
_**Note:** Strings in Kuroko are byte strings, as they were in Python 2. On all platforms currently supported by Kuroko, this means that strings should contain UTF-8 text. It is as-yet undecided if this will change._
|
||||
Much like in Python 3, strings in Kuroko represent sequences of non-normalized Unicode codepoints. Both source files and the terminal in which Kuroko is running are expected to be UTF-8.
|
||||
|
||||
This means that when indexing into a Unicode string, individual codepoints should be expected:
|
||||
|
||||
```py
|
||||
print("日本語"[1])
|
||||
# → 本
|
||||
print("日本語"[2])
|
||||
# → 語
|
||||
```
|
||||
|
||||
The length of a Unicode string is also represented in codepoints:
|
||||
|
||||
```py
|
||||
print(len("日本語"))
|
||||
# → 3
|
||||
```
|
||||
|
||||
The `__ord__` method on a string representing a single codepoint will return the integer representation of that codepoint:
|
||||
|
||||
```py
|
||||
print("本".__ord__())
|
||||
# → 26412
|
||||
print("t".__ord__())
|
||||
# → 116
|
||||
```
|
||||
|
||||
_**Implementation Note:** Generally, the internal representation of strings is their UTF-8 encoded form. When an indexing or slicing operation happens in which a codepoint index needs to be converted to an offset in the string, the most appropriate 'canonical' format will be generated and remain with the interned string until it is garbage collected. For strings containing only ASCII characters, no conversion is done and no additional copy is created. For all other strings, the smallest possible size for representing the largest codepoint is used, among the options of 1, 2, or 4. This approach is similar to CPython post-3.9._
|
||||
|
||||
Strings can be encoded to _bytes_ objects to get their UTF-8 representation:
|
||||
|
||||
```py
|
||||
print('テスト'.encode())
|
||||
# → b'\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88'
|
||||
```
|
||||
|
||||
Bytes objects can also be written as literals in the same format. Note that strings and bytes are not generally compatible with each other.
|
||||
|
||||
```py
|
||||
print(b'test')
|
||||
# → b'test'
|
||||
```
|
||||
|
||||
When indexing into a bytes object, values are returned as positive integers.
|
||||
|
||||
```py
|
||||
print(b'test'[0])
|
||||
# → 116
|
||||
```
|
||||
|
||||
### Variables
|
||||
|
||||
@ -175,6 +227,8 @@ print(a,b,c)
|
||||
# → 1 test <instance of object at ...>
|
||||
```
|
||||
|
||||
_**Note:** Identifier names, including for variables, functions, and classes, can be Unicode sequences. All non-ASCII codepoints are accepted as identifier characters._
|
||||
|
||||
### Assignments
|
||||
|
||||
After a variable is declared, assignments to it are valid as both expressions and statements. Kuroko provides assignment shortcuts like `+=` and `-=` as well as C-style postfix increment (`++`) and decrement (`--`).
|
||||
|
82
compiler.c
82
compiler.c
@ -1521,6 +1521,11 @@ static void string(int type) {
|
||||
stringBytes = GROW_ARRAY(char, stringBytes, old, stringCapacity); \
|
||||
} stringBytes[stringLength++] = c; } while (0)
|
||||
|
||||
int isBytes = (parser.previous.type == TOKEN_PREFIX_B);
|
||||
if (isBytes && !(match(TOKEN_STRING) || match(TOKEN_BIG_STRING))) {
|
||||
error("Expected string after 'b' prefix?");
|
||||
}
|
||||
|
||||
/* This should capture everything but the quotes. */
|
||||
do {
|
||||
int type = parser.previous.type == TOKEN_BIG_STRING ? 3 : 1;
|
||||
@ -1529,28 +1534,94 @@ static void string(int type) {
|
||||
while (c < end) {
|
||||
if (*c == '\\') {
|
||||
switch (c[1]) {
|
||||
case '\\': PUSH_CHAR('\\'); break;
|
||||
case '\'': PUSH_CHAR('\''); break;
|
||||
case '\"': PUSH_CHAR('\"'); break;
|
||||
case 'a': PUSH_CHAR('\a'); break;
|
||||
case 'b': PUSH_CHAR('\b'); break;
|
||||
case 'f': PUSH_CHAR('\f'); break;
|
||||
case 'n': PUSH_CHAR('\n'); break;
|
||||
case 'r': PUSH_CHAR('\r'); break;
|
||||
case 't': PUSH_CHAR('\t'); break;
|
||||
case 'v': PUSH_CHAR('\v'); break;
|
||||
case '[': PUSH_CHAR('\033'); break;
|
||||
case 'x':
|
||||
case 'x': {
|
||||
if (c+2 == end || c+3 == end || !isHex(c[2]) || !isHex(c[3])) {
|
||||
error("invalid \\x escape");
|
||||
return;
|
||||
}
|
||||
PUSH_CHAR(strtoul((char[]){c[2],c[3],'\0'}, NULL, 16));
|
||||
unsigned long value = strtoul((char[]){c[2],c[3],'\0'}, NULL, 16);
|
||||
if (isBytes) {
|
||||
PUSH_CHAR(value);
|
||||
} else if (value > 127) {
|
||||
PUSH_CHAR((0xC0 | (value >> 6)));
|
||||
PUSH_CHAR((0x80 | (value & 0x3F)));
|
||||
} else {
|
||||
PUSH_CHAR(value);
|
||||
}
|
||||
c += 2;
|
||||
break;
|
||||
} break;
|
||||
case 'u': {
|
||||
if (isBytes) {
|
||||
PUSH_CHAR(c[0]);
|
||||
PUSH_CHAR(c[1]);
|
||||
} else {
|
||||
if (c+2 == end || c+3 == end || !isHex(c[2]) || !isHex(c[3]) ||
|
||||
c+4 == end || c+5 == end || !isHex(c[4]) || !isHex(c[5])) {
|
||||
error("truncated \\u escape");
|
||||
return;
|
||||
}
|
||||
unsigned long value = strtoul((char[]){c[2],c[3],c[4],c[5],'\0'}, NULL, 16);
|
||||
if (value > 0xFFFF) {
|
||||
PUSH_CHAR((0xF0 | (value >> 18)));
|
||||
PUSH_CHAR((0x80 | ((value >> 12) & 0x3F)));
|
||||
PUSH_CHAR((0x80 | ((value >> 6) & 0x3F)));
|
||||
PUSH_CHAR((0x80 | ((value) & 0x3F)));
|
||||
} else if (value > 0x7FF) {
|
||||
PUSH_CHAR((0xE0 | (value >> 12)));
|
||||
PUSH_CHAR((0x80 | ((value >> 6) & 0x3F)));
|
||||
PUSH_CHAR((0x80 | (value & 0x3F)));
|
||||
} else if (value > 0x7F) {
|
||||
PUSH_CHAR((0xC0 | (value >> 6)));
|
||||
PUSH_CHAR((0x80 | (value & 0x3F)));
|
||||
} else {
|
||||
PUSH_CHAR(value);
|
||||
}
|
||||
c += 4;
|
||||
}
|
||||
} break;
|
||||
case '\n': break;
|
||||
default: PUSH_CHAR(c[1]); break;
|
||||
default:
|
||||
/* TODO octal */
|
||||
PUSH_CHAR(c[0]);
|
||||
PUSH_CHAR(c[1]);
|
||||
break;
|
||||
}
|
||||
c += 2;
|
||||
} else {
|
||||
if (*(unsigned char*)c > 127 && isBytes) {
|
||||
FREE_ARRAY(char,stringBytes,stringCapacity);
|
||||
error("bytes literal can only contain ASCII characters");
|
||||
return;
|
||||
}
|
||||
PUSH_CHAR(*c);
|
||||
c++;
|
||||
}
|
||||
}
|
||||
} while (match(TOKEN_STRING) || match(TOKEN_BIG_STRING));
|
||||
} while ((!isBytes || match(TOKEN_PREFIX_B)) && (match(TOKEN_STRING) || match(TOKEN_BIG_STRING)));
|
||||
if (isBytes && (match(TOKEN_STRING) || match(TOKEN_BIG_STRING))) {
|
||||
FREE_ARRAY(char,stringBytes,stringCapacity);
|
||||
error("can not mix bytes and string literals");
|
||||
return;
|
||||
}
|
||||
if (isBytes) {
|
||||
KrkBytes * bytes = krk_newBytes(0,NULL);
|
||||
bytes->bytes = (uint8_t*)stringBytes;
|
||||
bytes->length = stringLength;
|
||||
krk_bytesUpdateHash(bytes);
|
||||
emitConstant(OBJECT_VAL(bytes));
|
||||
return;
|
||||
}
|
||||
emitConstant(OBJECT_VAL(krk_copyString(stringBytes,stringLength)));
|
||||
FREE_ARRAY(char,stringBytes,stringCapacity);
|
||||
#undef PUSH_CHAR
|
||||
@ -1870,6 +1941,7 @@ ParseRule krk_parseRules[] = {
|
||||
RULE(TOKEN_IDENTIFIER, variable, NULL, PREC_NONE),
|
||||
RULE(TOKEN_STRING, string, NULL, PREC_NONE),
|
||||
RULE(TOKEN_BIG_STRING, string, NULL, PREC_NONE),
|
||||
RULE(TOKEN_PREFIX_B, string, NULL, PREC_NONE),
|
||||
RULE(TOKEN_NUMBER, number, NULL, PREC_NONE),
|
||||
RULE(TOKEN_AND, NULL, and_, PREC_AND),
|
||||
RULE(TOKEN_CLASS, NULL, NULL, PREC_NONE),
|
||||
|
8
memory.c
8
memory.c
@ -31,6 +31,7 @@ static void freeObject(KrkObj * object) {
|
||||
case OBJ_STRING: {
|
||||
KrkString * string = (KrkString*)object;
|
||||
FREE_ARRAY(char, string->chars, string->length + 1);
|
||||
if (string->codes && string->codes != string->chars) free(string->codes);
|
||||
FREE(KrkString, object);
|
||||
break;
|
||||
}
|
||||
@ -79,6 +80,12 @@ static void freeObject(KrkObj * object) {
|
||||
FREE(KrkTuple, object);
|
||||
break;
|
||||
}
|
||||
case OBJ_BYTES: {
|
||||
KrkBytes * bytes = (KrkBytes*)object;
|
||||
FREE_ARRAY(uint8_t, bytes->bytes, bytes->length);
|
||||
FREE(KrkBytes, bytes);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -171,6 +178,7 @@ static void blackenObject(KrkObj * object) {
|
||||
}
|
||||
case OBJ_NATIVE:
|
||||
case OBJ_STRING:
|
||||
case OBJ_BYTES:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
131
object.c
131
object.c
@ -20,11 +20,122 @@ static KrkObj * allocateObject(size_t size, ObjType type) {
|
||||
return object;
|
||||
}
|
||||
|
||||
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/**
 * Feed a single byte to an incremental UTF-8 decoder.
 *
 * @param state  Decoder state, updated in place. Start from UTF8_ACCEPT.
 *               UTF8_REJECT marks an invalid byte; a value N > 1 means that
 *               N - 1 continuation bytes are still expected.
 * @param codep  Codepoint accumulator, updated in place. It holds a complete
 *               codepoint only when the function returns UTF8_ACCEPT.
 * @param byte   Next input byte; only the low 8 bits are used.
 * @return The new value of *state.
 *
 * A byte that is not of the form 10xxxxxx while continuation bytes are
 * expected is rejected instead of being folded into the codepoint. The
 * decoder does not reset itself after UTF8_REJECT; the caller has to store
 * UTF8_ACCEPT back into *state to resume. Overlong encodings and surrogate
 * codepoints are not detected.
 */
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
	/* Indexed by the top five bits of a lead byte: state to enter next. */
	static const uint8_t state_table[32] = {
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xxxxxxx */
		1,1,1,1,1,1,1,1,                 /* 10xxxxxx */
		2,2,2,2,                         /* 110xxxxx */
		3,3,                             /* 1110xxxx */
		4,                               /* 11110xxx */
		1                                /* 11111xxx */
	};

	/* Indexed the same way: payload bits carried by the lead byte. */
	static const uint8_t mask_bytes[32] = {
		0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,
		0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,
		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
		0x1F,0x1F,0x1F,0x1F,
		0x0F,0x0F,
		0x07,
		0x00
	};

	/* State transition taken after consuming one continuation byte. */
	static const uint8_t next[5] = {
		0,
		1,
		0,
		2,
		3
	};

	/* Keep the table index within 0..31 even if a wider value is passed. */
	byte &= 0xFF;

	if (*state == UTF8_ACCEPT) {
		*codep = byte & mask_bytes[byte >> 3];
		*state = state_table[byte >> 3];
	} else if (*state != UTF8_REJECT && (byte & 0xC0) != 0x80) {
		/* Expected a continuation byte and got something else. */
		*state = UTF8_REJECT;
	} else {
		*codep = (byte & 0x3F) | (*codep << 6);
		*state = next[*state];
	}

	return *state;
}
|
||||
|
||||
static int checkString(const char * chars, size_t length, size_t *codepointCount) {
|
||||
uint32_t state = 0;
|
||||
uint32_t codepoint = 0;
|
||||
unsigned char * end = (unsigned char *)chars + length;
|
||||
uint32_t maxCodepoint = 0;
|
||||
for (unsigned char * c = (unsigned char *)chars; c < end; ++c) {
|
||||
if (!decode(&state, &codepoint, *c)) {
|
||||
if (codepoint > maxCodepoint) maxCodepoint = codepoint;
|
||||
(*codepointCount)++;
|
||||
} else if (state == UTF8_REJECT) {
|
||||
state = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (maxCodepoint > 0xFFFF) {
|
||||
return KRK_STRING_UCS4;
|
||||
} else if (maxCodepoint > 0xFF) {
|
||||
return KRK_STRING_UCS2;
|
||||
} else if (maxCodepoint > 0x7F) {
|
||||
return KRK_STRING_UCS1;
|
||||
} else {
|
||||
return KRK_STRING_ASCII;
|
||||
}
|
||||
}
|
||||
|
||||
#define GENREADY(size,type) \
|
||||
static void _readyUCS ## size (KrkString * string) { \
|
||||
uint32_t state = 0; \
|
||||
uint32_t codepoint = 0; \
|
||||
unsigned char * end = (unsigned char *)string->chars + string->length; \
|
||||
string->codes = malloc(sizeof(type) * string->codesLength); \
|
||||
type *outPtr = (type *)string->codes; \
|
||||
for (unsigned char * c = (unsigned char *)string->chars; c < end; ++c) { \
|
||||
if (!decode(&state, &codepoint, *c)) { \
|
||||
*(outPtr++) = (type)codepoint; \
|
||||
} else if (state == UTF8_REJECT) { \
|
||||
state = 0; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
GENREADY(1,uint8_t)
|
||||
GENREADY(2,uint16_t)
|
||||
GENREADY(4,uint32_t)
|
||||
#undef GENREADY
|
||||
|
||||
/**
 * Return the fixed-width codepoint array for a string, building it on first
 * use.
 *
 * If string->codes is already set it is returned unchanged. Otherwise the
 * array is generated according to string->type (UCS1, UCS2 or UCS4). Any
 * other type raises a ValueError and the still-NULL pointer is returned.
 */
void * krk_unicodeString(KrkString * string) {
	if (!string->codes) {
		switch (string->type) {
			case KRK_STRING_UCS1:
				_readyUCS1(string);
				break;
			case KRK_STRING_UCS2:
				_readyUCS2(string);
				break;
			case KRK_STRING_UCS4:
				_readyUCS4(string);
				break;
			default:
				krk_runtimeError(vm.exceptions.valueError, "Internal string error.");
				break;
		}
	}
	return string->codes;
}
|
||||
|
||||
/**
 * Fetch the codepoint at a given codepoint index.
 *
 * The codepoint array is built first if it does not exist yet. ASCII
 * strings are read straight from string->chars; the other types are read
 * from string->codes at the matching element width. No bounds check is
 * performed on `index`.
 *
 * @return The codepoint, or 0 after raising a ValueError when the string
 *         type is not one of the known values.
 */
uint32_t krk_unicodeCodepoint(KrkString * string, size_t index) {
	krk_unicodeString(string);

	if (string->type == KRK_STRING_ASCII) {
		return string->chars[index];
	} else if (string->type == KRK_STRING_UCS1) {
		return ((uint8_t*)string->codes)[index];
	} else if (string->type == KRK_STRING_UCS2) {
		return ((uint16_t*)string->codes)[index];
	} else if (string->type == KRK_STRING_UCS4) {
		return ((uint32_t*)string->codes)[index];
	}

	krk_runtimeError(vm.exceptions.valueError, "Invalid string.");
	return 0;
}
|
||||
|
||||
static KrkString * allocateString(char * chars, size_t length, uint32_t hash) {
|
||||
KrkString * string = ALLOCATE_OBJECT(KrkString, OBJ_STRING);
|
||||
string->length = length;
|
||||
string->chars = chars;
|
||||
string->hash = hash;
|
||||
string->codesLength = 0;
|
||||
string->type = checkString(chars,length,&string->codesLength);
|
||||
string->codes = NULL;
|
||||
if (string->type == KRK_STRING_ASCII) string->codes = string->chars;
|
||||
krk_push(OBJECT_VAL(string));
|
||||
krk_tableSet(&vm.strings, OBJECT_VAL(string), NONE_VAL());
|
||||
krk_pop();
|
||||
@ -156,3 +267,23 @@ KrkTuple * krk_newTuple(size_t length) {
|
||||
krk_pop();
|
||||
return tuple;
|
||||
}
|
||||
|
||||
/**
 * Recompute and store the hash of a bytes object from its current contents,
 * using the same hashString() routine that is applied to strings.
 */
void krk_bytesUpdateHash(KrkBytes * bytes) {
	char * raw = (char*)bytes->bytes;
	size_t count = bytes->length;
	bytes->hash = hashString(raw, count);
}
|
||||
|
||||
KrkBytes * krk_newBytes(size_t length, uint8_t * source) {
|
||||
KrkBytes * bytes = ALLOCATE_OBJECT(KrkBytes, OBJ_BYTES);
|
||||
bytes->length = length;
|
||||
bytes->bytes = NULL;
|
||||
krk_push(OBJECT_VAL(bytes));
|
||||
bytes->bytes = ALLOCATE(uint8_t, length);
|
||||
bytes->hash = -1;
|
||||
if (source) {
|
||||
memcpy(bytes->bytes, source, length);
|
||||
krk_bytesUpdateHash(bytes);
|
||||
}
|
||||
krk_pop();
|
||||
return bytes;
|
||||
}
|
||||
|
||||
|
37
object.h
37
object.h
@ -11,6 +11,8 @@
|
||||
#define IS_STRING(value) isObjType(value, OBJ_STRING)
|
||||
#define AS_STRING(value) ((KrkString *)AS_OBJECT(value))
|
||||
#define AS_CSTRING(value) (((KrkString *)AS_OBJECT(value))->chars)
|
||||
#define IS_BYTES(value) isObjType(value, OBJ_BYTES)
|
||||
#define AS_BYTES(value) ((KrkBytes*)AS_OBJECT(value))
|
||||
#define IS_FUNCTION(value) isObjType(value, OBJ_FUNCTION)
|
||||
#define AS_FUNCTION(value) ((KrkFunction *)AS_OBJECT(value))
|
||||
#define IS_NATIVE(value) isObjType(value, OBJ_NATIVE)
|
||||
@ -37,6 +39,7 @@ typedef enum {
|
||||
OBJ_INSTANCE,
|
||||
OBJ_BOUND_METHOD,
|
||||
OBJ_TUPLE,
|
||||
OBJ_BYTES,
|
||||
} ObjType;
|
||||
|
||||
struct Obj {
|
||||
@ -45,13 +48,30 @@ struct Obj {
|
||||
struct Obj * next;
|
||||
};
|
||||
|
||||
typedef enum {
|
||||
KRK_STRING_ASCII = 0,
|
||||
KRK_STRING_UCS1 = 1,
|
||||
KRK_STRING_UCS2 = 2,
|
||||
KRK_STRING_UCS4 = 4,
|
||||
} KrkStringType;
|
||||
|
||||
struct ObjString {
|
||||
KrkObj obj;
|
||||
size_t length;
|
||||
char * chars;
|
||||
KrkStringType type;
|
||||
uint32_t hash;
|
||||
size_t length;
|
||||
size_t codesLength;
|
||||
char * chars;
|
||||
void * codes;
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
KrkObj obj;
|
||||
uint32_t hash;
|
||||
size_t length;
|
||||
uint8_t * bytes;
|
||||
} KrkBytes;
|
||||
|
||||
typedef struct KrkUpvalue {
|
||||
KrkObj obj;
|
||||
int location;
|
||||
@ -165,3 +185,16 @@ extern KrkClass * krk_newClass(KrkString * name);
|
||||
extern KrkInstance * krk_newInstance(KrkClass * _class);
|
||||
extern KrkBoundMethod * krk_newBoundMethod(KrkValue receiver, KrkObj * method);
|
||||
extern KrkTuple * krk_newTuple(size_t length);
|
||||
|
||||
extern void * krk_unicodeString(KrkString * string);
|
||||
extern uint32_t krk_unicodeCodepoint(KrkString * string, size_t index);
|
||||
|
||||
/*
 * Read the codepoint at `offset` directly from a string's codes array.
 *
 * `string` is a pointer expression whose target has `type` and `codes`
 * members. A type of 1 reads uint8_t elements and 2 reads uint16_t
 * elements; every other type is read as uint32_t. The macro does not build
 * the codes array and does no bounds checking. Both arguments are fully
 * parenthesized, so expressions such as `&str` or `i + 1` are safe, but
 * `string` may be evaluated more than once and must have no side effects.
 */
#define KRK_STRING_FAST(string,offset) ((uint32_t)\
	((string)->type == 1 ? ((uint8_t*)(string)->codes)[(offset)] : \
	((string)->type == 2 ? ((uint16_t*)(string)->codes)[(offset)] : \
	((uint32_t*)(string)->codes)[(offset)])))
|
||||
|
||||
/*
 * Number of bytes (1 to 4) that codepoint `cp` occupies when encoded as
 * UTF-8. The argument is parenthesized so that any expression may be
 * passed; it can be evaluated up to three times, so it must not have side
 * effects.
 */
#define CODEPOINT_BYTES(cp) ((cp) < 0x80 ? 1 : ((cp) < 0x800 ? 2 : ((cp) < 0x10000 ? 3 : 4)))
|
||||
|
||||
extern KrkBytes * krk_newBytes(size_t length, uint8_t * source);
|
||||
extern void krk_bytesUpdateHash(KrkBytes * bytes);
|
||||
|
8
rline.c
8
rline.c
@ -23,6 +23,14 @@
|
||||
#include <sys/ioctl.h>
|
||||
#include "rline.h"
|
||||
|
||||
/*
 * isdigit() that is safe for arbitrary int input. Only values in the ASCII
 * range are forwarded to the C library; negative values and values of 128
 * or above yield 0 instead of reaching isdigit(), whose behaviour is
 * undefined for arguments that are neither EOF nor an unsigned char value.
 */
static __attribute__((used)) int _isdigit(int c) {
	if (c < 0 || c > 127) return 0;
	return isdigit(c);
}
|
||||
/*
 * isxdigit() that is safe for arbitrary int input. Only values in the ASCII
 * range are forwarded to the C library; negative values and values of 128
 * or above yield 0 instead of reaching isxdigit(), whose behaviour is
 * undefined for arguments that are neither EOF nor an unsigned char value.
 */
static __attribute__((used)) int _isxdigit(int c) {
	if (c < 0 || c > 127) return 0;
	return isxdigit(c);
}
|
||||
|
||||
#undef isdigit
|
||||
#undef isxdigit
|
||||
#define isdigit(c) _isdigit(c)
|
||||
#define isxdigit(c) _isxdigit(c)
|
||||
|
||||
char * rline_history[RLINE_HISTORY_ENTRIES];
|
||||
int rline_history_count = 0;
|
||||
int rline_history_offset = 0;
|
||||
|
@ -196,7 +196,9 @@ static KrkTokenType identifierType() {
|
||||
case 'n': return checkKeyword(2, "d", TOKEN_AND);
|
||||
case 's': return checkKeyword(2, "", TOKEN_AS);
|
||||
} break;
|
||||
case 'b': return checkKeyword(1, "reak", TOKEN_BREAK);
|
||||
case 'b': if (MORE(1)) return checkKeyword(1, "reak", TOKEN_BREAK);
|
||||
else if (scanner.start[1] == '\'' || scanner.start[1] == '"') return TOKEN_PREFIX_B;
|
||||
break;
|
||||
case 'c': if (MORE(1)) switch(scanner.start[1]) {
|
||||
case 'l': return checkKeyword(2, "ass", TOKEN_CLASS);
|
||||
case 'o': return checkKeyword(2, "ntinue", TOKEN_CONTINUE);
|
||||
@ -246,7 +248,7 @@ static KrkTokenType identifierType() {
|
||||
}
|
||||
|
||||
static KrkToken identifier() {
|
||||
while (isAlpha(peek()) || isDigit(peek())) advance();
|
||||
while (isAlpha(peek()) || isDigit(peek()) || (unsigned char)peek() > 0x7F) advance();
|
||||
|
||||
return makeToken(identifierType());
|
||||
}
|
||||
@ -309,7 +311,7 @@ KrkToken krk_scanToken() {
|
||||
/* Not indentation, not a linefeed on an empty line, so this must not be the start of a line any more */
|
||||
scanner.startOfLine = 0;
|
||||
|
||||
if (isAlpha(c)) return identifier();
|
||||
if (isAlpha(c) || (unsigned char)c > 0x7F) return identifier();
|
||||
if (isDigit(c)) return number(c);
|
||||
|
||||
switch (c) {
|
||||
@ -343,6 +345,5 @@ KrkToken krk_scanToken() {
|
||||
case '\'': return string('\'');
|
||||
}
|
||||
|
||||
|
||||
return errorToken("Unexpected character.");
|
||||
}
|
||||
|
@ -69,6 +69,8 @@ typedef enum {
|
||||
TOKEN_LAMBDA,
|
||||
TOKEN_WITH,
|
||||
|
||||
TOKEN_PREFIX_B,
|
||||
|
||||
TOKEN_INDENTATION,
|
||||
|
||||
TOKEN_EOL,
|
||||
|
1
table.c
1
table.c
@ -26,6 +26,7 @@ static uint32_t hashValue(KrkValue value) {
|
||||
if (IS_FLOATING(value)) return (uint32_t)(AS_FLOATING(value) * 1000); /* arbitrary; what's a good way to hash floats? */
|
||||
if (IS_BOOLEAN(value)) return (uint32_t)(AS_BOOLEAN(value));
|
||||
if (IS_NONE(value)) return 0;
|
||||
if (IS_BYTES(value)) return (AS_BYTES(value))->hash; /* Same as strings, but we don't have an interning table */
|
||||
return (((uint32_t)(intptr_t)AS_OBJECT(value)) >> 4)| (((uint32_t)(intptr_t)AS_OBJECT(value)) << 28);
|
||||
}
|
||||
|
||||
|
10
test/testUnicodeIdentifiers.krk
Normal file
10
test/testUnicodeIdentifiers.krk
Normal file
@ -0,0 +1,10 @@
|
||||
def テスト(引数="こんにちは"):
|
||||
print("ああ、", 引数)
|
||||
|
||||
テスト()
|
||||
テスト("こんばんは!")
|
||||
|
||||
let おはよう = "おはようございます!"
|
||||
テスト(おはよう)
|
||||
|
||||
print(テスト)
|
4
test/testUnicodeIdentifiers.krk.expect
Normal file
4
test/testUnicodeIdentifiers.krk.expect
Normal file
@ -0,0 +1,4 @@
|
||||
ああ、 こんにちは
|
||||
ああ、 こんばんは!
|
||||
ああ、 おはようございます!
|
||||
<function テスト>
|
21
test/testUnicodeStrings.krk
Normal file
21
test/testUnicodeStrings.krk
Normal file
@ -0,0 +1,21 @@
|
||||
let hello = "おはようございます"
|
||||
print(hello)
|
||||
print(len(hello))
|
||||
|
||||
print(hello[0])
|
||||
print(hello[1])
|
||||
print(hello[-1])
|
||||
|
||||
print(hello.encode())
|
||||
print(len(hello.encode()))
|
||||
|
||||
print(hello.encode().decode())
|
||||
|
||||
let bytes = b'\xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a'
|
||||
|
||||
print(bytes)
|
||||
print(bytes.decode())
|
||||
print(len(bytes.decode()))
|
||||
print(len(bytes))
|
||||
|
||||
print("テスト、テスト、日本語、おはよう".split("、"))
|
13
test/testUnicodeStrings.krk.expect
Normal file
13
test/testUnicodeStrings.krk.expect
Normal file
@ -0,0 +1,13 @@
|
||||
おはようございます
|
||||
9
|
||||
お
|
||||
は
|
||||
す
|
||||
b'\xe3\x81\x8a\xe3\x81\xaf\xe3\x82\x88\xe3\x81\x86\xe3\x81\x94\xe3\x81\x96\xe3\x81\x84\xe3\x81\xbe\xe3\x81\x99'
|
||||
27
|
||||
おはようございます
|
||||
b'\xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a'
|
||||
あいうえお
|
||||
5
|
||||
15
|
||||
['テスト', 'テスト', '日本語', 'おはよう']
|
1
value.c
1
value.c
@ -83,6 +83,7 @@ void krk_printValueSafe(FILE * f, KrkValue printable) {
|
||||
case OBJ_INSTANCE: fprintf(f, "<instance of %s>", AS_INSTANCE(printable)->_class->name->chars); break;
|
||||
case OBJ_NATIVE: fprintf(f, "<nativefn %s>", ((KrkNative*)AS_OBJECT(printable))->name); break;
|
||||
case OBJ_CLOSURE: fprintf(f, "<function %s>", AS_CLOSURE(printable)->function->name->chars); break;
|
||||
case OBJ_BYTES: fprintf(f, "<bytes of len %ld>", AS_BYTES(printable)->length); break;
|
||||
case OBJ_TUPLE: {
|
||||
fprintf(f, "<tuple (");
|
||||
for (size_t i = 0; i < AS_TUPLE(printable)->values.count; ++i) {
|
||||
|
403
vm.c
403
vm.c
@ -795,6 +795,8 @@ KrkValue krk_typeOf(int argc, KrkValue argv[]) {
|
||||
return OBJECT_VAL(vm.baseClasses.strClass);
|
||||
case OBJ_TUPLE:
|
||||
return OBJECT_VAL(vm.baseClasses.tupleClass);
|
||||
case OBJ_BYTES:
|
||||
return OBJECT_VAL(vm.baseClasses.bytesClass);
|
||||
case OBJ_INSTANCE:
|
||||
return OBJECT_VAL(AS_INSTANCE(argv[0])->_class);
|
||||
default:
|
||||
@ -884,7 +886,7 @@ static KrkValue krk_globals(int argc, KrkValue argv[]) {
|
||||
KrkValue dict = krk_dict_of(0, NULL);
|
||||
krk_push(dict);
|
||||
/* Get its internal table */
|
||||
KrkValue _dict_internal = OBJECT_VAL(AS_INSTANCE(argv[0])->_internal);
|
||||
KrkValue _dict_internal = OBJECT_VAL(AS_INSTANCE(krk_peek(0))->_internal);
|
||||
/* Copy the globals table into it */
|
||||
krk_tableAddAll(vm.frames[vm.frameCount-1].globals, &AS_CLASS(_dict_internal)->methods);
|
||||
krk_pop();
|
||||
@ -1457,20 +1459,38 @@ static KrkValue _int_to_floating(int argc, KrkValue argv[]) {
|
||||
|
||||
/* int.__chr__() */
|
||||
static KrkValue _int_to_char(int argc, KrkValue argv[]) {
|
||||
char tmp[2] = {AS_INTEGER(argv[0]), 0};
|
||||
return OBJECT_VAL(krk_copyString(tmp,1));
|
||||
long value = AS_INTEGER(argv[0]);
|
||||
unsigned char out[5] = {0};
|
||||
if (value > 0xFFFF) {
|
||||
out[0] = (0xF0 | (value >> 18));
|
||||
out[1] = (0x80 | ((value >> 12) & 0x3F));
|
||||
out[2] = (0x80 | ((value >> 6) & 0x3F));
|
||||
out[3] = (0x80 | ((value) & 0x3F));
|
||||
return OBJECT_VAL(krk_copyString((char*)out,4));
|
||||
} else if (value > 0x7FF) {
|
||||
out[0] = (0xE0 | (value >> 12));
|
||||
out[1] = (0x80 | ((value >> 6) & 0x3F));
|
||||
out[2] = (0x80 | (value & 0x3F));
|
||||
return OBJECT_VAL(krk_copyString((char*)out,3));
|
||||
} else if (value > 0x7F) {
|
||||
out[0] = (0xC0 | (value >> 6));
|
||||
out[1] = (0x80 | (value & 0x3F));
|
||||
return OBJECT_VAL(krk_copyString((char*)out,2));
|
||||
} else {
|
||||
out[0] = (unsigned char)value;
|
||||
return OBJECT_VAL(krk_copyString((char*)out,1));
|
||||
}
|
||||
}
|
||||
|
||||
/* str.__ord__() */
|
||||
static KrkValue _char_to_int(int argc, KrkValue argv[]) {
|
||||
if (AS_STRING(argv[0])->length != 1) {
|
||||
static KrkValue _string_ord(int argc, KrkValue argv[]) {
|
||||
if (AS_STRING(argv[0])->codesLength != 1) {
|
||||
krk_runtimeError(vm.exceptions.typeError, "ord() expected a character, but string of length %d found",
|
||||
AS_STRING(argv[0])->length);
|
||||
AS_STRING(argv[0])->codesLength);
|
||||
return NONE_VAL();
|
||||
}
|
||||
|
||||
/* TODO unicode strings? Interpret as UTF-8 and return codepoint? */
|
||||
return INTEGER_VAL(((unsigned char)AS_CSTRING(argv[0])[0]));
|
||||
return INTEGER_VAL(krk_unicodeCodepoint(AS_STRING(argv[0]),0));
|
||||
}
|
||||
|
||||
static KrkValue _print(int argc, KrkValue argv[], int hasKw) {
|
||||
@ -1516,7 +1536,7 @@ static KrkValue _print(int argc, KrkValue argv[], int hasKw) {
|
||||
}
|
||||
|
||||
/* str.__len__() */
|
||||
static KrkValue _string_length(int argc, KrkValue argv[]) {
|
||||
static KrkValue _string_len(int argc, KrkValue argv[]) {
|
||||
if (argc != 1) {
|
||||
krk_runtimeError(vm.exceptions.attributeError,"Unexpected arguments to str.__len__()");
|
||||
return NONE_VAL();
|
||||
@ -1524,7 +1544,7 @@ static KrkValue _string_length(int argc, KrkValue argv[]) {
|
||||
if (!IS_STRING(argv[0])) {
|
||||
return NONE_VAL();
|
||||
}
|
||||
return INTEGER_VAL(AS_STRING(argv[0])->length);
|
||||
return INTEGER_VAL(AS_STRING(argv[0])->codesLength);
|
||||
}
|
||||
|
||||
/* str.__set__(ind,val) - this is invalid, throw a nicer error than 'field does not exist'. */
|
||||
@ -1540,7 +1560,7 @@ static KrkValue _strings_are_immutable(int argc, KrkValue argv[]) {
|
||||
* somewhere else? I'm not even sure where Python does do it, but a quick
|
||||
* says not if you call __getslice__ directly...
|
||||
*/
|
||||
static KrkValue _string_get_slice(int argc, KrkValue argv[]) {
|
||||
static KrkValue _string_getslice(int argc, KrkValue argv[]) {
|
||||
if (argc < 3) { /* 3 because first is us */
|
||||
krk_runtimeError(vm.exceptions.argumentError, "slice: expected 2 arguments, got %d", argc-1);
|
||||
return NONE_VAL();
|
||||
@ -1553,20 +1573,36 @@ static KrkValue _string_get_slice(int argc, KrkValue argv[]) {
|
||||
}
|
||||
/* bounds check */
|
||||
KrkString * me = AS_STRING(argv[0]);
|
||||
int start = IS_NONE(argv[1]) ? 0 : AS_INTEGER(argv[1]);
|
||||
int end = IS_NONE(argv[2]) ? (int)me->length : AS_INTEGER(argv[2]);
|
||||
if (start < 0) start = me->length + start;
|
||||
long start = IS_NONE(argv[1]) ? 0 : AS_INTEGER(argv[1]);
|
||||
long end = IS_NONE(argv[2]) ? (long)me->codesLength : AS_INTEGER(argv[2]);
|
||||
if (start < 0) start = me->codesLength + start;
|
||||
if (start < 0) start = 0;
|
||||
if (end < 0) end = me->length + end;
|
||||
if (start > (int)me->length) start = me->length;
|
||||
if (end > (int)me->length) end = me->length;
|
||||
if (end < 0) end = me->codesLength + end;
|
||||
if (start > (long)me->codesLength) start = me->codesLength;
|
||||
if (end > (long)me->codesLength) end = me->codesLength;
|
||||
if (end < start) end = start;
|
||||
int len = end - start;
|
||||
return OBJECT_VAL(krk_copyString(me->chars + start, len));
|
||||
long len = end - start;
|
||||
if (me->type == KRK_STRING_ASCII) {
|
||||
return OBJECT_VAL(krk_copyString(me->chars + start, len));
|
||||
} else {
|
||||
size_t offset = 0;
|
||||
size_t length = 0;
|
||||
/* Figure out where the UTF8 for this string starts. */
|
||||
krk_unicodeString(me);
|
||||
for (long i = 0; i < start; ++i) {
|
||||
uint32_t cp = KRK_STRING_FAST(me,i);
|
||||
offset += CODEPOINT_BYTES(cp);
|
||||
}
|
||||
for (long i = start; i < end; ++i) {
|
||||
uint32_t cp = KRK_STRING_FAST(me,i);
|
||||
length += CODEPOINT_BYTES(cp);
|
||||
}
|
||||
return OBJECT_VAL(krk_copyString(me->chars + offset, length));
|
||||
}
|
||||
}
|
||||
|
||||
/* str.__int__(base=10) */
|
||||
static KrkValue _string_to_int(int argc, KrkValue argv[]) {
|
||||
static KrkValue _string_int(int argc, KrkValue argv[]) {
|
||||
if (argc < 1 || argc > 2 || !IS_STRING(argv[0])) return NONE_VAL();
|
||||
int base = (argc == 1) ? 10 : (int)AS_INTEGER(argv[1]);
|
||||
char * start = AS_CSTRING(argv[0]);
|
||||
@ -1587,7 +1623,7 @@ static KrkValue _string_to_int(int argc, KrkValue argv[]) {
|
||||
}
|
||||
|
||||
/* str.__float__() */
|
||||
static KrkValue _string_to_float(int argc, KrkValue argv[]) {
|
||||
static KrkValue _string_float(int argc, KrkValue argv[]) {
|
||||
if (argc != 1 || !IS_STRING(argv[0])) return NONE_VAL();
|
||||
return FLOATING_VAL(strtod(AS_CSTRING(argv[0]),NULL));
|
||||
}
|
||||
@ -1598,7 +1634,7 @@ static KrkValue _float_init(int argc, KrkValue argv[]) {
|
||||
krk_runtimeError(vm.exceptions.argumentError, "float() takes at most 1 argument");
|
||||
return NONE_VAL();
|
||||
}
|
||||
if (IS_STRING(argv[1])) return _string_to_float(1,&argv[1]);
|
||||
if (IS_STRING(argv[1])) return _string_float(1,&argv[1]);
|
||||
if (IS_FLOATING(argv[1])) return argv[1];
|
||||
if (IS_INTEGER(argv[1])) return FLOATING_VAL(AS_INTEGER(argv[1]));
|
||||
if (IS_BOOLEAN(argv[1])) return FLOATING_VAL(AS_BOOLEAN(argv[1]));
|
||||
@ -1622,12 +1658,26 @@ static KrkValue _string_get(int argc, KrkValue argv[]) {
|
||||
}
|
||||
KrkString * me = AS_STRING(argv[0]);
|
||||
int asInt = AS_INTEGER(argv[1]);
|
||||
if (asInt < 0) asInt += (int)AS_STRING(argv[0])->length;
|
||||
if (asInt < 0 || asInt >= (int)AS_STRING(argv[0])->length) {
|
||||
if (asInt < 0) asInt += (int)AS_STRING(argv[0])->codesLength;
|
||||
if (asInt < 0 || asInt >= (int)AS_STRING(argv[0])->codesLength) {
|
||||
krk_runtimeError(vm.exceptions.indexError, "String index out of range: %d", asInt);
|
||||
return NONE_VAL();
|
||||
}
|
||||
return OBJECT_VAL(krk_copyString((char[]){me->chars[asInt]},1));
|
||||
if (me->type == KRK_STRING_ASCII) {
|
||||
return OBJECT_VAL(krk_copyString(me->chars + asInt, 1));
|
||||
} else {
|
||||
size_t offset = 0;
|
||||
size_t length = 0;
|
||||
/* Figure out where the UTF8 for this string starts. */
|
||||
krk_unicodeString(me);
|
||||
for (long i = 0; i < asInt; ++i) {
|
||||
uint32_t cp = KRK_STRING_FAST(me,i);
|
||||
offset += CODEPOINT_BYTES(cp);
|
||||
}
|
||||
uint32_t cp = KRK_STRING_FAST(me,asInt);
|
||||
length = CODEPOINT_BYTES(cp);
|
||||
return OBJECT_VAL(krk_copyString(me->chars + offset, length));
|
||||
}
|
||||
}
|
||||
|
||||
#define PUSH_CHAR(c) do { if (stringCapacity < stringLength + 1) { \
|
||||
@ -1639,6 +1689,10 @@ static KrkValue _string_get(int argc, KrkValue argv[]) {
|
||||
/* str.format(**kwargs) */
|
||||
static KrkValue _string_format(int argc, KrkValue argv[], int hasKw) {
|
||||
if (!IS_STRING(argv[0])) return NONE_VAL();
|
||||
if (AS_STRING(argv[0])->type != KRK_STRING_ASCII) {
|
||||
krk_runtimeError(vm.exceptions.notImplementedError, "Unable to call .format() on non-ASCII string.");
|
||||
return NONE_VAL();
|
||||
}
|
||||
KrkString * self = AS_STRING(argv[0]);
|
||||
KrkValue kwargs = NONE_VAL();
|
||||
if (hasKw) {
|
||||
@ -1878,6 +1932,10 @@ static int charIn(char c, const char * str) {
|
||||
*/
|
||||
static KrkValue _string_strip_shared(int argc, KrkValue argv[], int which) {
|
||||
if (!IS_STRING(argv[0])) return NONE_VAL();
|
||||
if (AS_STRING(argv[0])->type != KRK_STRING_ASCII) {
|
||||
krk_runtimeError(vm.exceptions.notImplementedError, "str.strip() not implemented for Unicode strings");
|
||||
return NONE_VAL();
|
||||
}
|
||||
size_t start = 0;
|
||||
size_t end = AS_STRING(argv[0])->length;
|
||||
const char * subset = " \t\n\r";
|
||||
@ -2061,12 +2119,180 @@ static KrkValue _string_split(int argc, KrkValue argv[], int hasKw) {
|
||||
krk_pop();
|
||||
return myList;
|
||||
}
|
||||
|
||||
/**
|
||||
* str.__repr__()
|
||||
*
|
||||
* Strings are special because __str__ should do nothing but __repr__
|
||||
* should escape characters like quotes.
|
||||
*/
|
||||
static KrkValue _string_repr(int argc, KrkValue argv[]) {
|
||||
size_t stringCapacity = 0;
|
||||
size_t stringLength = 0;
|
||||
char * stringBytes = NULL;
|
||||
|
||||
PUSH_CHAR('\'');
|
||||
|
||||
char * end = AS_CSTRING(argv[0]) + AS_STRING(argv[0])->length;
|
||||
for (char * c = AS_CSTRING(argv[0]); c < end; ++c) {
|
||||
switch (*c) {
|
||||
/* XXX: Other non-printables should probably be escaped as well. */
|
||||
case '\\': PUSH_CHAR('\\'); PUSH_CHAR('\\'); break;
|
||||
case '\'': PUSH_CHAR('\\'); PUSH_CHAR('\''); break;
|
||||
case '\a': PUSH_CHAR('\\'); PUSH_CHAR('a'); break;
|
||||
case '\b': PUSH_CHAR('\\'); PUSH_CHAR('b'); break;
|
||||
case '\f': PUSH_CHAR('\\'); PUSH_CHAR('f'); break;
|
||||
case '\n': PUSH_CHAR('\\'); PUSH_CHAR('n'); break;
|
||||
case '\r': PUSH_CHAR('\\'); PUSH_CHAR('r'); break;
|
||||
case '\t': PUSH_CHAR('\\'); PUSH_CHAR('t'); break;
|
||||
case '\v': PUSH_CHAR('\\'); PUSH_CHAR('v'); break;
|
||||
case 27: PUSH_CHAR('\\'); PUSH_CHAR('['); break;
|
||||
default: {
|
||||
if ((unsigned char)*c < ' ' || (unsigned char)*c == 0x7F) {
|
||||
PUSH_CHAR('\\');
|
||||
PUSH_CHAR('x');
|
||||
char hex[3];
|
||||
sprintf(hex,"%02x", (unsigned char)*c);
|
||||
PUSH_CHAR(hex[0]);
|
||||
PUSH_CHAR(hex[1]);
|
||||
} else {
|
||||
PUSH_CHAR(*c);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PUSH_CHAR('\'');
|
||||
KrkValue tmp = OBJECT_VAL(krk_copyString(stringBytes, stringLength));
|
||||
if (stringBytes) FREE_ARRAY(char,stringBytes,stringCapacity);
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/**
 * str.encode()
 *
 * Hands the string's byte buffer and its byte length to krk_newBytes and
 * returns the resulting bytes object.
 */
static KrkValue _string_encode(int argc, KrkValue argv[]) {
	KrkString * self = AS_STRING(argv[0]);
	uint8_t * payload = (uint8_t*)AS_CSTRING(argv[0]);
	KrkBytes * encoded = krk_newBytes(self->length, payload);
	return OBJECT_VAL(encoded);
}
|
||||
|
||||
/**
 * bytes.__init__()
 *
 * With only the receiver (argc == 1) this returns an empty bytes object.
 * With a tuple as argv[1] it builds a bytes object with one entry per tuple
 * element; every element must be an integer, otherwise a TypeError is
 * raised. Any other argument type raises a TypeError as well.
 *
 * The new object is kept on the VM stack while it is being filled; every
 * exit path after the push removes it again so the stack stays balanced.
 */
static KrkValue _bytes_init(int argc, KrkValue argv[]) {
	if (argc == 1) {
		return OBJECT_VAL(krk_newBytes(0,NULL));
	}

	if (IS_TUPLE(argv[1])) {
		KrkTuple * source = AS_TUPLE(argv[1]);
		KrkBytes * out = krk_newBytes(source->values.count, NULL);
		krk_push(OBJECT_VAL(out));
		for (size_t i = 0; i < source->values.count; ++i) {
			if (!IS_INTEGER(source->values.values[i])) {
				/* Drop the half-built object before reporting the error;
				 * the original returned here with it still pushed. */
				krk_pop();
				krk_runtimeError(vm.exceptions.typeError, "bytes(): expected tuple of ints, not of '%s'", krk_typeName(source->values.values[i]));
				return NONE_VAL();
			}
			/* NOTE(review): the value is stored as-is; integers that do not
			 * fit a single byte are narrowed by the assignment, not rejected. */
			out->bytes[i] = AS_INTEGER(source->values.values[i]);
		}
		krk_bytesUpdateHash(out);
		return krk_pop();
	}

	krk_runtimeError(vm.exceptions.typeError, "Can not convert '%s' to bytes", krk_typeName(argv[1]));
	return NONE_VAL();
}
|
||||
|
||||
/* bytes objects are not interned; need to do this the old-fashioned way. */
|
||||
static KrkValue _bytes_eq(int argc, KrkValue argv[]) {
|
||||
if (!IS_BYTES(argv[1])) return BOOLEAN_VAL(0);
|
||||
KrkBytes * self = AS_BYTES(argv[0]);
|
||||
KrkBytes * them = AS_BYTES(argv[1]);
|
||||
if (self->length != them->length) return BOOLEAN_VAL(0);
|
||||
if (self->hash != them->hash) return BOOLEAN_VAL(0);
|
||||
for (size_t i = 0; i < self->length; ++i) {
|
||||
if (self->bytes[i] != them->bytes[i]) return BOOLEAN_VAL(0);
|
||||
}
|
||||
return BOOLEAN_VAL(1);
|
||||
}
|
||||
|
||||
/**
 * bytes.__repr__() (also registered as __str__)
 *
 * Produces b'...' for the bytes object in argv[0]. Backslash, the single
 * quote and the common control characters get two-character backslash
 * escapes; any other byte below space or at/above 0x7F is written as \xNN;
 * the remaining bytes are appended as-is.
 */
static KrkValue _bytes_repr(int argc, KrkValue argv[]) {
	static const char hexDigits[] = "0123456789abcdef";

	/* Accumulation state; PUSH_CHAR refers to these locals by name. */
	size_t stringCapacity = 0;
	size_t stringLength = 0;
	char * stringBytes = NULL;

	PUSH_CHAR('b');
	PUSH_CHAR('\'');

	for (size_t index = 0; index < AS_BYTES(argv[0])->length; ++index) {
		uint8_t ch = AS_BYTES(argv[0])->bytes[index];

		/* Letter that follows the backslash, or 0 when there is no short escape. */
		char shortEscape = 0;
		switch (ch) {
			case '\\': shortEscape = '\\'; break;
			case '\'': shortEscape = '\''; break;
			case '\a': shortEscape = 'a'; break;
			case '\b': shortEscape = 'b'; break;
			case '\f': shortEscape = 'f'; break;
			case '\n': shortEscape = 'n'; break;
			case '\r': shortEscape = 'r'; break;
			case '\t': shortEscape = 't'; break;
			case '\v': shortEscape = 'v'; break;
			default: break;
		}

		if (shortEscape) {
			PUSH_CHAR('\\');
			PUSH_CHAR(shortEscape);
		} else if (ch < ' ' || ch >= 0x7F) {
			/* Two lowercase hex digits, high nibble first. */
			PUSH_CHAR('\\');
			PUSH_CHAR('x');
			PUSH_CHAR(hexDigits[ch >> 4]);
			PUSH_CHAR(hexDigits[ch & 0x0F]);
		} else {
			PUSH_CHAR(ch);
		}
	}

	PUSH_CHAR('\'');

	KrkValue result = OBJECT_VAL(krk_copyString(stringBytes, stringLength));
	if (stringBytes) FREE_ARRAY(char,stringBytes,stringCapacity);
	return result;
}
|
||||
|
||||
/**
 * bytes.__get__()
 *
 * Returns the byte at the index given in argv[1] as an integer. Negative
 * indices count back from the end.
 *
 * Raises ArgumentError when no index is supplied, TypeError when the index
 * is not an integer, and IndexError when the (adjusted) index falls outside
 * the object.
 */
static KrkValue _bytes_get(int argc, KrkValue argv[]) {
	if (argc < 2) {
		krk_runtimeError(vm.exceptions.argumentError, "bytes.__get__(): expected one argument");
		return NONE_VAL();
	}
	/* AS_INTEGER must only be applied to integer values; reject everything
	 * else instead of misreading the value's payload as an index. */
	if (!IS_INTEGER(argv[1])) {
		krk_runtimeError(vm.exceptions.typeError, "bytes indices must be integers, not '%s'", krk_typeName(argv[1]));
		return NONE_VAL();
	}
	KrkBytes * self = AS_BYTES(argv[0]);
	long asInt = AS_INTEGER(argv[1]);

	/* Map a negative index onto its position from the end. */
	if (asInt < 0) asInt += (long)self->length;
	if (asInt < 0 || asInt >= (long)self->length) {
		krk_runtimeError(vm.exceptions.indexError, "bytes index out of range: %ld", asInt);
		return NONE_VAL();
	}

	return INTEGER_VAL(self->bytes[asInt]);
}
|
||||
|
||||
/**
 * bytes.__len__()
 *
 * Returns the length field of the bytes object in argv[0] as an integer.
 */
static KrkValue _bytes_len(int argc, KrkValue argv[]) {
	KrkBytes * self = AS_BYTES(argv[0]);
	return INTEGER_VAL(self->length);
}
|
||||
|
||||
/**
 * bytes.__contains__()
 *
 * Placeholder: membership testing is not implemented. A call without an
 * operand raises ArgumentError; any other call raises NotImplementedError.
 * None is returned in both cases.
 */
static KrkValue _bytes_contains(int argc, KrkValue argv[]) {
	if (argc >= 2) {
		krk_runtimeError(vm.exceptions.notImplementedError, "not implemented");
	} else {
		krk_runtimeError(vm.exceptions.argumentError, "bytes.__contains__(): expected one argument");
	}
	return NONE_VAL();
}
|
||||
|
||||
/**
 * bytes.decode()
 *
 * Creates a string from the raw contents of the bytes object in argv[0].
 * The bytes are passed to krk_copyString without any validation here.
 */
static KrkValue _bytes_decode(int argc, KrkValue argv[]) {
	/* TODO: Actually bother checking if this explodes, or support other encodings... */
	KrkBytes * self = AS_BYTES(argv[0]);
	KrkString * decoded = krk_copyString((char*)self->bytes, self->length);
	return OBJECT_VAL(decoded);
}
|
||||
|
||||
#undef PUSH_CHAR
|
||||
|
||||
static KrkValue _int_init(int argc, KrkValue argv[]) {
|
||||
if (argc < 2) return INTEGER_VAL(0);
|
||||
if (IS_INTEGER(argv[1])) return argv[1];
|
||||
if (IS_STRING(argv[1])) return _string_to_int(argc-1,&argv[1]);
|
||||
if (IS_STRING(argv[1])) return _string_int(argc-1,&argv[1]);
|
||||
if (IS_FLOATING(argv[1])) return INTEGER_VAL(AS_FLOATING(argv[1]));
|
||||
if (IS_BOOLEAN(argv[1])) return INTEGER_VAL(AS_BOOLEAN(argv[1]));
|
||||
krk_runtimeError(vm.exceptions.typeError, "int() argument must be a string or a number, not '%s'", krk_typeName(argv[1]));
|
||||
@ -2308,48 +2534,6 @@ static KrkValue _module_repr(int argc, KrkValue argv[]) {
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
|
||||
* str.__repr__()
|
||||
*
|
||||
* Strings are special because __str__ should do nothing but __repr__
|
||||
* should escape characters like quotes.
|
||||
*/
|
||||
static KrkValue _repr_str(int argc, KrkValue argv[]) {
|
||||
char * str = malloc(3 + AS_STRING(argv[0])->length * 4); /* x 4 because a string of all < 32s would be a lot of \xXX */
|
||||
char * tmp = str;
|
||||
*(tmp++) = '\'';
|
||||
char * end = AS_CSTRING(argv[0]) + AS_STRING(argv[0])->length;
|
||||
for (char * c = AS_CSTRING(argv[0]); c < end; ++c) {
|
||||
switch (*c) {
|
||||
/* XXX: Other non-printables should probably be escaped as well. */
|
||||
case '\n': *(tmp++) = '\\'; *(tmp++) = 'n'; break;
|
||||
case '\r': *(tmp++) = '\\'; *(tmp++) = 'r'; break;
|
||||
case '\t': *(tmp++) = '\\'; *(tmp++) = 't'; break;
|
||||
case '\'': *(tmp++) = '\\'; *(tmp++) = '\''; break;
|
||||
case '\\': *(tmp++) = '\\'; *(tmp++) = '\\'; break;
|
||||
case 27: *(tmp++) = '\\'; *(tmp++) = '['; break;
|
||||
default: {
|
||||
if ((unsigned char)*c < ' ') {
|
||||
*(tmp++) = '\\';
|
||||
*(tmp++) = 'x';
|
||||
char hex[3];
|
||||
sprintf(hex,"%02x", (unsigned char)*c);
|
||||
*(tmp++) = hex[0];
|
||||
*(tmp++) = hex[1];
|
||||
} else {
|
||||
*(tmp++) = *c;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
*(tmp++) = '\'';
|
||||
*(tmp++) = '\0';
|
||||
KrkString * out = krk_copyString(str, tmp-str-1);
|
||||
free(str);
|
||||
return OBJECT_VAL(out);
|
||||
}
|
||||
|
||||
/**
|
||||
* int.__str__()
|
||||
*
|
||||
@ -2398,7 +2582,7 @@ static int isFalsey(KrkValue value) {
|
||||
case VAL_FLOATING: return !AS_FLOATING(value);
|
||||
case VAL_OBJECT: {
|
||||
switch (AS_OBJECT(value)->type) {
|
||||
case OBJ_STRING: return !AS_STRING(value)->length;
|
||||
case OBJ_STRING: return !AS_STRING(value)->codesLength;
|
||||
case OBJ_TUPLE: return !AS_TUPLE(value)->values.count;
|
||||
default: break;
|
||||
}
|
||||
@ -2437,7 +2621,7 @@ static KrkValue _len(int argc, KrkValue argv[]) {
|
||||
return NONE_VAL();
|
||||
}
|
||||
/* Shortcuts */
|
||||
if (IS_STRING(argv[0])) return INTEGER_VAL(AS_STRING(argv[0])->length);
|
||||
if (IS_STRING(argv[0])) return INTEGER_VAL(AS_STRING(argv[0])->codesLength);
|
||||
if (IS_TUPLE(argv[0])) return INTEGER_VAL(AS_TUPLE(argv[0])->values.count);
|
||||
|
||||
KrkClass * type = AS_CLASS(krk_typeOf(1,&argv[0]));
|
||||
@ -2546,7 +2730,7 @@ _corrupt:
|
||||
return NONE_VAL();
|
||||
}
|
||||
|
||||
static KrkValue _str_iter(int argc, KrkValue argv[]) {
|
||||
static KrkValue _string_iter(int argc, KrkValue argv[]) {
|
||||
KrkInstance * output = krk_newInstance(vm.baseClasses.striteratorClass);
|
||||
|
||||
krk_push(OBJECT_VAL(output));
|
||||
@ -2721,6 +2905,57 @@ static KrkValue krk_collectGarbage_wrapper(int argc, KrkValue argv[]) {
|
||||
return INTEGER_VAL(krk_collectGarbage());
|
||||
}
|
||||
|
||||
/**
 * Size estimate for a value, in bytes.
 *
 * Returns 0 when called without arguments. Non-object values count as one
 * KrkValue. For the object kinds handled below, the size of the object
 * structure and of its directly attached storage is added on top of that;
 * object kinds not listed contribute only the KrkValue itself.
 */
static KrkValue krk_getsize(int argc, KrkValue argv[]) {
	if (argc < 1) return INTEGER_VAL(0);

	size_t total = sizeof(KrkValue);
	if (!IS_OBJECT(argv[0])) return INTEGER_VAL(total);

	switch (AS_OBJECT(argv[0])->type) {
		case OBJ_STRING: {
			KrkString * str = AS_STRING(argv[0]);
			/* Extra codepoint storage is only counted when it exists and is
			 * not the same buffer as the UTF-8 bytes.
			 * NOTE(review): presumably `type` is the per-codepoint width in
			 * bytes -- confirm against the KrkString definition. */
			size_t codepointStorage = 0;
			if (str->codes && (str->chars != str->codes)) {
				codepointStorage = str->type * str->codesLength;
			}
			total += sizeof(KrkString) + str->length /* For the UTF8 */
				+ codepointStorage;
			break;
		}
		case OBJ_BYTES: {
			KrkBytes * bytes = AS_BYTES(argv[0]);
			total += sizeof(KrkBytes) + bytes->length;
			break;
		}
		case OBJ_INSTANCE: {
			KrkInstance * instance = AS_INSTANCE(argv[0]);
			total += sizeof(KrkInstance) + sizeof(KrkTableEntry) * instance->fields.capacity;
			break;
		}
		case OBJ_CLASS: {
			KrkClass * klass = AS_CLASS(argv[0]);
			total += sizeof(KrkClass) + sizeof(KrkTableEntry) * klass->fields.capacity
				+ sizeof(KrkTableEntry) * klass->methods.capacity;
			break;
		}
		case OBJ_NATIVE: {
			KrkNative * native = (KrkNative*)AS_OBJECT(argv[0]);
			/* Name is counted together with its NUL terminator. */
			total += sizeof(KrkNative) + strlen(native->name) + 1;
			break;
		}
		case OBJ_TUPLE: {
			KrkTuple * tuple = AS_TUPLE(argv[0]);
			total += sizeof(KrkTuple) + sizeof(KrkValue) * tuple->values.capacity;
			break;
		}
		case OBJ_BOUND_METHOD: {
			total += sizeof(KrkBoundMethod);
			break;
		}
		case OBJ_CLOSURE: {
			KrkClosure * closure = AS_CLOSURE(argv[0]);
			total += sizeof(KrkClosure) + sizeof(KrkUpvalue*) * closure->function->upvalueCount;
			break;
		}
		default: break;
	}

	return INTEGER_VAL(total);
}
|
||||
|
||||
void krk_initVM(int flags) {
|
||||
vm.flags = flags;
|
||||
KRK_PAUSE_GC();
|
||||
@ -2807,6 +3042,7 @@ void krk_initVM(int flags) {
|
||||
(KrkObj*)S(KRK_VERSION_MAJOR "." KRK_VERSION_MINOR "." KRK_VERSION_PATCH KRK_VERSION_EXTRA));
|
||||
krk_attachNamedObject(&vm.system->fields, "buildenv", (KrkObj*)S(KRK_BUILD_COMPILER));
|
||||
krk_attachNamedObject(&vm.system->fields, "builddate", (KrkObj*)S(KRK_BUILD_DATE));
|
||||
krk_defineNative(&vm.system->fields, "getsizeof", krk_getsize);
|
||||
|
||||
KrkInstance * gcModule = krk_newInstance(vm.moduleClass);
|
||||
krk_attachNamedObject(&vm.modules, "gc", (KrkObj*)gcModule);
|
||||
@ -2869,16 +3105,16 @@ void krk_initVM(int flags) {
|
||||
ADD_BASE_CLASS(vm.baseClasses.strClass, "str", vm.objectClass);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__init__", _string_init);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__str__", _noop);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__repr__", _repr_str);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__len__", _string_length);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__repr__", _string_repr);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__len__", _string_len);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__get__", _string_get);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__set__", _strings_are_immutable);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__int__", _string_to_int);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__float__", _string_to_float);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__getslice__", _string_get_slice);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__ord__", _char_to_int);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__int__", _string_int);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__float__", _string_float);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__getslice__", _string_getslice);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__ord__", _string_ord);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__contains__", _string_contains);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__iter__", _str_iter);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__iter__", _string_iter);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".format", _string_format);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".join", _string_join);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".split", _string_split);
|
||||
@ -2889,6 +3125,7 @@ void krk_initVM(int flags) {
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__gt__", _string_gt);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__mod__", _string_mod);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".__add__", _string_add);
|
||||
krk_defineNative(&vm.baseClasses.strClass->methods, ".encode", _string_encode);
|
||||
krk_finalizeClass(vm.baseClasses.strClass);
|
||||
/* TODO: Don't attach */
|
||||
ADD_BASE_CLASS(vm.baseClasses.functionClass, "function", vm.objectClass);
|
||||
@ -2917,6 +3154,16 @@ void krk_initVM(int flags) {
|
||||
krk_defineNative(&vm.baseClasses.tupleClass->methods, ".__contains__", _tuple_contains);
|
||||
krk_defineNative(&vm.baseClasses.tupleClass->methods, ".__iter__", _tuple_iter);
|
||||
krk_finalizeClass(vm.baseClasses.tupleClass);
|
||||
ADD_BASE_CLASS(vm.baseClasses.bytesClass, "bytes", vm.objectClass);
|
||||
krk_defineNative(&vm.baseClasses.bytesClass->methods, ".__init__", _bytes_init);
|
||||
krk_defineNative(&vm.baseClasses.bytesClass->methods, ".__str__", _bytes_repr);
|
||||
krk_defineNative(&vm.baseClasses.bytesClass->methods, ".__repr__", _bytes_repr);
|
||||
krk_defineNative(&vm.baseClasses.bytesClass->methods, ".decode", _bytes_decode);
|
||||
krk_defineNative(&vm.baseClasses.bytesClass->methods, ".__len__", _bytes_len);
|
||||
krk_defineNative(&vm.baseClasses.bytesClass->methods, ".__contains__", _bytes_contains);
|
||||
krk_defineNative(&vm.baseClasses.bytesClass->methods, ".__get__", _bytes_get);
|
||||
krk_defineNative(&vm.baseClasses.bytesClass->methods, ".__eq__", _bytes_eq);
|
||||
krk_finalizeClass(vm.baseClasses.bytesClass);
|
||||
|
||||
/* Build global builtin functions. */
|
||||
BUILTIN_FUNCTION("listOf", krk_list_of); /* Equivalent to list() */
|
||||
|
Loading…
Reference in New Issue
Block a user