mirror of https://github.com/postgres/postgres
Optimize JSON escaping using SIMD
Here we adjust escape_json_with_len() to make use of SIMD to allow processing of up to 16 bytes at a time rather than processing a single byte at a time. This has been shown to speed up escaping of JSON strings significantly.
Escaping is required for both JSON string properties and also the property names themselves, so this should also help improve the speed of the conversion from JSON into text for JSON objects that have property names 16 or more bytes long.
Escaping JSON strings was often a significant bottleneck for longer strings. With these changes, some benchmarking has shown a query performing nearly 4 times faster when escaping a JSON object with a 1MB text property. Tests with shorter text properties saw smaller but still significant performance improvements. For example, a test outputting 1024 JSON strings with a text property length ranging from 1 char to 1024 chars became around 2 times faster.
Author: David Rowley
Reviewed-by: Melih Mutlu
Discussion: https://postgr.es/m/CAApHDvpLXwMZvbCKcdGfU9XQjGCDm7tFpRdTXuB9PVgpNUYfEQ@mail.gmail.com
This commit is contained in:
parent
b5df24e520
commit
ca6fde9225
|
@ -19,6 +19,7 @@
|
|||
#include "funcapi.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "miscadmin.h"
|
||||
#include "port/simd.h"
|
||||
#include "utils/array.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/date.h"
|
||||
|
@ -1594,6 +1595,18 @@ escape_json(StringInfo buf, const char *str)
|
|||
appendStringInfoCharMacro(buf, '"');
|
||||
}
|
||||
|
||||
/*
 * Define the number of bytes that escape_json_with_len will look ahead in the
 * input string before flushing the bytes checked so far to the destination
 * buffer.  Looking ahead too far could result in cachelines being evicted
 * that will need to be reloaded in order to perform the
 * appendBinaryStringInfo call.  Smaller values will result in a larger number
 * of calls to appendBinaryStringInfo and introduce additional function call
 * overhead.  Values larger than the size of L1d cache will likely result in
 * worse performance.
 *
 * escape_json_with_len only compiles in the periodic flush when this macro is
 * defined (it is tested with #ifdef), so leaving it undefined disables the
 * intermediate flushes.
 */
#define ESCAPE_JSON_FLUSH_AFTER 512
|
||||
|
||||
/*
|
||||
* escape_json_with_len
|
||||
* Produce a JSON string literal, properly escaping the possibly not
|
||||
|
@ -1603,11 +1616,98 @@ escape_json(StringInfo buf, const char *str)
|
|||
void
|
||||
escape_json_with_len(StringInfo buf, const char *str, int len)
|
||||
{
|
||||
int vlen;
|
||||
|
||||
Assert(len >= 0);
|
||||
|
||||
/*
|
||||
* Since we know the minimum length we'll need to append, let's just
|
||||
* enlarge the buffer now rather than incrementally making more space when
|
||||
* we run out. Add two extra bytes for the enclosing quotes.
|
||||
*/
|
||||
enlargeStringInfo(buf, len + 2);
|
||||
|
||||
/*
|
||||
* Figure out how many bytes to process using SIMD. Round 'len' down to
|
||||
* the previous multiple of sizeof(Vector8), assuming that's a power-of-2.
|
||||
*/
|
||||
vlen = len & (int) (~(sizeof(Vector8) - 1));
|
||||
|
||||
appendStringInfoCharMacro(buf, '"');
|
||||
|
||||
for (int i = 0; i < len; i++)
|
||||
escape_json_char(buf, str[i]);
|
||||
for (int i = 0, copypos = 0;;)
|
||||
{
|
||||
/*
|
||||
* To speed this up, try searching sizeof(Vector8) bytes at once for
|
||||
* special characters that we need to escape. When we find one, we
|
||||
* fall out of the Vector8 loop and copy the portion we've vector
|
||||
* searched and then we process sizeof(Vector8) bytes one byte at a
|
||||
* time. Once done, come back and try doing vector searching again.
|
||||
* We'll also process any remaining bytes at the tail end of the
|
||||
* string byte-by-byte. This optimization assumes that most chunks of
|
||||
* sizeof(Vector8) bytes won't contain any special characters.
|
||||
*/
|
||||
for (; i < vlen; i += sizeof(Vector8))
|
||||
{
|
||||
Vector8 chunk;
|
||||
|
||||
vector8_load(&chunk, (const uint8 *) &str[i]);
|
||||
|
||||
/*
|
||||
* Break on anything less than ' ' or if we find a '"' or '\\'.
|
||||
* Those need special handling. That's done in the per-byte loop.
|
||||
*/
|
||||
if (vector8_has_le(chunk, (unsigned char) 0x1F) ||
|
||||
vector8_has(chunk, (unsigned char) '"') ||
|
||||
vector8_has(chunk, (unsigned char) '\\'))
|
||||
break;
|
||||
|
||||
#ifdef ESCAPE_JSON_FLUSH_AFTER
|
||||
|
||||
/*
|
||||
* Flush what's been checked so far out to the destination buffer
|
||||
* every so often to avoid having to re-read cachelines when
|
||||
* escaping large strings.
|
||||
*/
|
||||
if (i - copypos >= ESCAPE_JSON_FLUSH_AFTER)
|
||||
{
|
||||
appendBinaryStringInfo(buf, &str[copypos], i - copypos);
|
||||
copypos = i;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Write to the destination up to the point that we've vector searched
|
||||
* so far. Do this only when switching into per-byte mode rather than
|
||||
* once every sizeof(Vector8) bytes.
|
||||
*/
|
||||
if (copypos < i)
|
||||
{
|
||||
appendBinaryStringInfo(buf, &str[copypos], i - copypos);
|
||||
copypos = i;
|
||||
}
|
||||
|
||||
/*
|
||||
* Per-byte loop for Vector8s containing special chars and for
|
||||
* processing the tail of the string.
|
||||
*/
|
||||
for (int b = 0; b < sizeof(Vector8); b++)
|
||||
{
|
||||
/* check if we've finished */
|
||||
if (i == len)
|
||||
goto done;
|
||||
|
||||
Assert(i < len);
|
||||
|
||||
escape_json_char(buf, str[i++]);
|
||||
}
|
||||
|
||||
copypos = i;
|
||||
/* We're not done yet. Try the vector search again. */
|
||||
}
|
||||
|
||||
done:
|
||||
appendStringInfoCharMacro(buf, '"');
|
||||
}
|
||||
|
||||
|
|
|
@ -55,6 +55,54 @@ SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
|
|||
"............abc\n"
|
||||
(1 row)
|
||||
|
||||
-- Test various lengths of strings to validate SIMD processing to escape
|
||||
-- special chars in the JSON.
|
||||
SELECT row_to_json(j)::jsonb FROM (
|
||||
SELECT left(E'abcdefghijklmnopqrstuv"\twxyz012345678', a) AS a
|
||||
FROM generate_series(0,37) a
|
||||
) j;
|
||||
row_to_json
|
||||
--------------------------------------------------
|
||||
{"a": ""}
|
||||
{"a": "a"}
|
||||
{"a": "ab"}
|
||||
{"a": "abc"}
|
||||
{"a": "abcd"}
|
||||
{"a": "abcde"}
|
||||
{"a": "abcdef"}
|
||||
{"a": "abcdefg"}
|
||||
{"a": "abcdefgh"}
|
||||
{"a": "abcdefghi"}
|
||||
{"a": "abcdefghij"}
|
||||
{"a": "abcdefghijk"}
|
||||
{"a": "abcdefghijkl"}
|
||||
{"a": "abcdefghijklm"}
|
||||
{"a": "abcdefghijklmn"}
|
||||
{"a": "abcdefghijklmno"}
|
||||
{"a": "abcdefghijklmnop"}
|
||||
{"a": "abcdefghijklmnopq"}
|
||||
{"a": "abcdefghijklmnopqr"}
|
||||
{"a": "abcdefghijklmnopqrs"}
|
||||
{"a": "abcdefghijklmnopqrst"}
|
||||
{"a": "abcdefghijklmnopqrstu"}
|
||||
{"a": "abcdefghijklmnopqrstuv"}
|
||||
{"a": "abcdefghijklmnopqrstuv\""}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\t"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\tw"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twx"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxy"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxyz"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxyz0"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxyz01"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxyz012"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxyz0123"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxyz01234"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxyz012345"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxyz0123456"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxyz01234567"}
|
||||
{"a": "abcdefghijklmnopqrstuv\"\twxyz012345678"}
|
||||
(38 rows)
|
||||
|
||||
-- see json_encoding test for input with unicode escapes
|
||||
-- Numbers.
|
||||
SELECT '1'::json; -- OK
|
||||
|
|
|
@ -12,6 +12,13 @@ SELECT '"\v"'::json; -- ERROR, not a valid JSON escape
|
|||
SELECT ('"'||repeat('.', 12)||'abc"')::json; -- OK
|
||||
SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
|
||||
|
||||
-- Test various lengths of strings to validate SIMD processing to escape
|
||||
-- special chars in the JSON.
|
||||
SELECT row_to_json(j)::jsonb FROM (
|
||||
SELECT left(E'abcdefghijklmnopqrstuv"\twxyz012345678', a) AS a
|
||||
FROM generate_series(0,37) a
|
||||
) j;
|
||||
|
||||
-- see json_encoding test for input with unicode escapes
|
||||
|
||||
-- Numbers.
|
||||
|
|
Loading…
Reference in New Issue