json: Nicer recovery from lexical errors
When the lexer chokes on an input character, it consumes the
character, emits a JSON error token, and enters its start state. This
can lead to suboptimal error recovery. For instance, input
0123 ,
produces the tokens
JSON_ERROR 01
JSON_INTEGER 23
JSON_COMMA ,
Make the lexer skip characters after a lexical error until it reaches a
structural character ('[', ']', '{', '}', ':', ','), an ASCII control
character other than '\t', or one of the bytes '\xFE', '\xFF'.
Note that we must not skip ASCII control characters, '\xFE', or '\xFF',
because docs/interop/qmp-spec.txt documents them as a way to force the
JSON parser into a known-good state.
For the example input above, the lexer now produces
JSON_ERROR 01
JSON_COMMA ,
Update qmp-test for the nicer error recovery: QMP now reports just one
error for input %p instead of two. Also drop the newline after %p; it
was needed to tease out the second error.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-5-armbru@redhat.com>
[Conflict with commit ebb4d82d88
resolved]
This commit is contained in:
parent
c0ee3afa7f
commit
0f07a5d5f1
@ -101,6 +101,7 @@
|
|||||||
|
|
||||||
enum json_lexer_state {
|
enum json_lexer_state {
|
||||||
IN_ERROR = 0, /* must really be 0, see json_lexer[] */
|
IN_ERROR = 0, /* must really be 0, see json_lexer[] */
|
||||||
|
IN_RECOVERY,
|
||||||
IN_DQ_STRING_ESCAPE,
|
IN_DQ_STRING_ESCAPE,
|
||||||
IN_DQ_STRING,
|
IN_DQ_STRING,
|
||||||
IN_SQ_STRING_ESCAPE,
|
IN_SQ_STRING_ESCAPE,
|
||||||
@ -130,6 +131,28 @@ QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);
|
|||||||
static const uint8_t json_lexer[][256] = {
|
static const uint8_t json_lexer[][256] = {
|
||||||
/* Relies on default initialization to IN_ERROR! */
|
/* Relies on default initialization to IN_ERROR! */
|
||||||
|
|
||||||
|
/* error recovery */
|
||||||
|
[IN_RECOVERY] = {
|
||||||
|
/*
|
||||||
|
* Skip characters until a structural character, an ASCII
|
||||||
|
* control character other than '\t', or impossible UTF-8
|
||||||
|
* bytes '\xFE', '\xFF'. Structural characters and line
|
||||||
|
* endings are promising resynchronization points. Clients
|
||||||
|
* may use the others to force the JSON parser into known-good
|
||||||
|
* state; see docs/interop/qmp-spec.txt.
|
||||||
|
*/
|
||||||
|
[0 ... 0x1F] = IN_START | LOOKAHEAD,
|
||||||
|
[0x20 ... 0xFD] = IN_RECOVERY,
|
||||||
|
[0xFE ... 0xFF] = IN_START | LOOKAHEAD,
|
||||||
|
['\t'] = IN_RECOVERY,
|
||||||
|
['['] = IN_START | LOOKAHEAD,
|
||||||
|
[']'] = IN_START | LOOKAHEAD,
|
||||||
|
['{'] = IN_START | LOOKAHEAD,
|
||||||
|
['}'] = IN_START | LOOKAHEAD,
|
||||||
|
[':'] = IN_START | LOOKAHEAD,
|
||||||
|
[','] = IN_START | LOOKAHEAD,
|
||||||
|
},
|
||||||
|
|
||||||
/* double quote string */
|
/* double quote string */
|
||||||
[IN_DQ_STRING_ESCAPE] = {
|
[IN_DQ_STRING_ESCAPE] = {
|
||||||
[0x20 ... 0xFD] = IN_DQ_STRING,
|
[0x20 ... 0xFD] = IN_DQ_STRING,
|
||||||
@ -301,26 +324,18 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
|
|||||||
/* fall through */
|
/* fall through */
|
||||||
case JSON_SKIP:
|
case JSON_SKIP:
|
||||||
g_string_truncate(lexer->token, 0);
|
g_string_truncate(lexer->token, 0);
|
||||||
|
/* fall through */
|
||||||
|
case IN_START:
|
||||||
new_state = lexer->start_state;
|
new_state = lexer->start_state;
|
||||||
break;
|
break;
|
||||||
case IN_ERROR:
|
case IN_ERROR:
|
||||||
/* XXX: To avoid having previous bad input leaving the parser in an
|
|
||||||
* unresponsive state where we consume unpredictable amounts of
|
|
||||||
* subsequent "good" input, percolate this error state up to the
|
|
||||||
* parser by emitting a JSON_ERROR token, then reset lexer state.
|
|
||||||
*
|
|
||||||
* Also note that this handling is required for reliable channel
|
|
||||||
* negotiation between QMP and the guest agent, since chr(0xFF)
|
|
||||||
* is placed at the beginning of certain events to ensure proper
|
|
||||||
* delivery when the channel is in an unknown state. chr(0xFF) is
|
|
||||||
* never a valid ASCII/UTF-8 sequence, so this should reliably
|
|
||||||
* induce an error/flush state.
|
|
||||||
*/
|
|
||||||
json_message_process_token(lexer, lexer->token, JSON_ERROR,
|
json_message_process_token(lexer, lexer->token, JSON_ERROR,
|
||||||
lexer->x, lexer->y);
|
lexer->x, lexer->y);
|
||||||
|
new_state = IN_RECOVERY;
|
||||||
|
/* fall through */
|
||||||
|
case IN_RECOVERY:
|
||||||
g_string_truncate(lexer->token, 0);
|
g_string_truncate(lexer->token, 0);
|
||||||
lexer->state = lexer->start_state;
|
break;
|
||||||
return;
|
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -76,10 +76,7 @@ static void test_malformed(QTestState *qts)
|
|||||||
assert_recovered(qts);
|
assert_recovered(qts);
|
||||||
|
|
||||||
/* lexical error: interpolation */
|
/* lexical error: interpolation */
|
||||||
qtest_qmp_send_raw(qts, "%%p\n");
|
qtest_qmp_send_raw(qts, "%%p");
|
||||||
/* two errors, one for "%", one for "p" */
|
|
||||||
resp = qtest_qmp_receive(qts);
|
|
||||||
qmp_assert_error_class(resp, "GenericError");
|
|
||||||
resp = qtest_qmp_receive(qts);
|
resp = qtest_qmp_receive(qts);
|
||||||
qmp_assert_error_class(resp, "GenericError");
|
qmp_assert_error_class(resp, "GenericError");
|
||||||
assert_recovered(qts);
|
assert_recovered(qts);
|
||||||
|
Loading…
Reference in New Issue
Block a user