Optimize JSON escaping using SIMD

Here we adjust escape_json_with_len() to make use of SIMD, allowing it to
process up to 16 bytes at a time rather than a single byte at a time.  This
has been shown to speed up escaping of JSON strings significantly.
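
For illustration only, here is a rough standalone sketch of the
chunk-at-a-time scan, written directly against SSE2 intrinsics and a
gcc/clang builtin; the patch itself uses the portable Vector8 helpers from
port/simd.h, and the function below is not part of the patch:

/*
 * Illustration only: return the offset of the first byte that JSON escaping
 * must handle specially (anything below 0x20, '"', or '\\'), checking 16
 * bytes per iteration.
 */
#include <emmintrin.h>
#include <stddef.h>

static size_t
first_byte_needing_escape(const unsigned char *s, size_t len)
{
    size_t      i = 0;

    for (; i + 16 <= len; i += 16)
    {
        __m128i     chunk = _mm_loadu_si128((const __m128i *) (s + i));

        /* bytes <= 0x1F: min(chunk, 0x1F) == chunk only for such bytes */
        __m128i     ctrl = _mm_cmpeq_epi8(_mm_min_epu8(chunk, _mm_set1_epi8(0x1F)), chunk);
        __m128i     quot = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('"'));
        __m128i     bsl = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\\'));
        int         hits = _mm_movemask_epi8(_mm_or_si128(ctrl, _mm_or_si128(quot, bsl)));

        if (hits != 0)
            return i + (size_t) __builtin_ctz(hits);    /* gcc/clang builtin */
    }

    /* tail: fall back to byte-at-a-time, as the patch also does */
    for (; i < len; i++)
        if (s[i] < 0x20 || s[i] == '"' || s[i] == '\\')
            break;

    return i;
}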

Escaping is required both for JSON string properties and for the property
names themselves, so this should also help improve the speed of converting
JSON objects to text when property names are 16 or more bytes long.
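
As a hypothetical illustration of that second case (not one of this commit's
regression tests), printing the jsonb value below back out as text has to
escape-check a property name that is well over 16 bytes, so per the paragraph
above that check can now proceed a chunk at a time:

SELECT '{"a_property_name_longer_than_16_bytes": 1}'::jsonb;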

Escaping JSON strings was often a significant bottleneck for longer strings.
With these changes, benchmarking has shown a query escaping a JSON object
with a 1MB text property running nearly 4 times faster.  Tests with shorter
text properties saw smaller but still significant improvements; for example,
a test outputting 1024 JSON strings with text property lengths ranging from
1 to 1024 characters became around 2 times faster.
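
The benchmark queries themselves are in the discussion thread linked below; a
hypothetical query along these lines exercises the 1MB text property case
described above:

-- a JSON object whose single text property is roughly 1MB long
SELECT row_to_json(t)
FROM (SELECT repeat('x', 1024 * 1024) AS a) t;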

Author: David Rowley
Reviewed-by: Melih Mutlu
Discussion: https://postgr.es/m/CAApHDvpLXwMZvbCKcdGfU9XQjGCDm7tFpRdTXuB9PVgpNUYfEQ@mail.gmail.com
David Rowley 2024-08-05 23:16:44 +12:00
parent b5df24e520
commit ca6fde9225
3 changed files with 157 additions and 2 deletions

@@ -19,6 +19,7 @@
#include "funcapi.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "port/simd.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/date.h"
@@ -1594,6 +1595,18 @@ escape_json(StringInfo buf, const char *str)
    appendStringInfoCharMacro(buf, '"');
}

/*
 * Define the number of bytes that escape_json_with_len will look ahead in the
 * input string before flushing the input string to the destination buffer.
 * Looking ahead too far could result in cachelines being evicted that will
 * need to be reloaded in order to perform the appendBinaryStringInfo call.
 * Smaller values will result in a larger number of calls to
 * appendBinaryStringInfo and introduce additional function call overhead.
 * Values larger than the size of L1d cache will likely result in worse
 * performance.
 */
#define ESCAPE_JSON_FLUSH_AFTER 512

/*
 * escape_json_with_len
 *      Produce a JSON string literal, properly escaping the possibly not
@@ -1603,11 +1616,98 @@ escape_json(StringInfo buf, const char *str)
void
escape_json_with_len(StringInfo buf, const char *str, int len)
{
    int         vlen;

    Assert(len >= 0);

    /*
     * Since we know the minimum length we'll need to append, let's just
     * enlarge the buffer now rather than incrementally making more space when
     * we run out.  Add two extra bytes for the enclosing quotes.
     */
    enlargeStringInfo(buf, len + 2);

    /*
     * Figure out how many bytes to process using SIMD.  Round 'len' down to
     * the previous multiple of sizeof(Vector8), assuming that's a power-of-2.
     */
    vlen = len & (int) (~(sizeof(Vector8) - 1));

    appendStringInfoCharMacro(buf, '"');

    for (int i = 0, copypos = 0;;)
    {
        /*
         * To speed this up, try searching sizeof(Vector8) bytes at once for
         * special characters that we need to escape.  When we find one, we
         * fall out of the Vector8 loop and copy the portion we've vector
         * searched and then we process sizeof(Vector8) bytes one byte at a
         * time.  Once done, come back and try doing vector searching again.
         * We'll also process any remaining bytes at the tail end of the
         * string byte-by-byte.  This optimization assumes that most chunks of
         * sizeof(Vector8) bytes won't contain any special characters.
         */
        for (; i < vlen; i += sizeof(Vector8))
        {
            Vector8     chunk;

            vector8_load(&chunk, (const uint8 *) &str[i]);

            /*
             * Break on anything less than ' ' or if we find a '"' or '\\'.
             * Those need special handling.  That's done in the per-byte loop.
             */
            if (vector8_has_le(chunk, (unsigned char) 0x1F) ||
                vector8_has(chunk, (unsigned char) '"') ||
                vector8_has(chunk, (unsigned char) '\\'))
                break;

#ifdef ESCAPE_JSON_FLUSH_AFTER

            /*
             * Flush what's been checked so far out to the destination buffer
             * every so often to avoid having to re-read cachelines when
             * escaping large strings.
             */
            if (i - copypos >= ESCAPE_JSON_FLUSH_AFTER)
            {
                appendBinaryStringInfo(buf, &str[copypos], i - copypos);
                copypos = i;
            }
#endif
        }

        /*
         * Write to the destination up to the point that we've vector searched
         * so far.  Do this only when switching into per-byte mode rather than
         * once every sizeof(Vector8) bytes.
         */
        if (copypos < i)
        {
            appendBinaryStringInfo(buf, &str[copypos], i - copypos);
            copypos = i;
        }

        /*
         * Per-byte loop for Vector8s containing special chars and for
         * processing the tail of the string.
         */
        for (int b = 0; b < sizeof(Vector8); b++)
        {
            /* check if we've finished */
            if (i == len)
                goto done;

            Assert(i < len);

            escape_json_char(buf, str[i++]);
        }

        copypos = i;
        /* We're not done yet.  Try the vector search again. */
    }

done:
    appendStringInfoCharMacro(buf, '"');
}

@@ -55,6 +55,54 @@ SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
"............abc\n"
(1 row)
-- Test various lengths of strings to validate SIMD processing to escape
-- special chars in the JSON.
SELECT row_to_json(j)::jsonb FROM (
SELECT left(E'abcdefghijklmnopqrstuv"\twxyz012345678', a) AS a
FROM generate_series(0,37) a
) j;
row_to_json
--------------------------------------------------
{"a": ""}
{"a": "a"}
{"a": "ab"}
{"a": "abc"}
{"a": "abcd"}
{"a": "abcde"}
{"a": "abcdef"}
{"a": "abcdefg"}
{"a": "abcdefgh"}
{"a": "abcdefghi"}
{"a": "abcdefghij"}
{"a": "abcdefghijk"}
{"a": "abcdefghijkl"}
{"a": "abcdefghijklm"}
{"a": "abcdefghijklmn"}
{"a": "abcdefghijklmno"}
{"a": "abcdefghijklmnop"}
{"a": "abcdefghijklmnopq"}
{"a": "abcdefghijklmnopqr"}
{"a": "abcdefghijklmnopqrs"}
{"a": "abcdefghijklmnopqrst"}
{"a": "abcdefghijklmnopqrstu"}
{"a": "abcdefghijklmnopqrstuv"}
{"a": "abcdefghijklmnopqrstuv\""}
{"a": "abcdefghijklmnopqrstuv\"\t"}
{"a": "abcdefghijklmnopqrstuv\"\tw"}
{"a": "abcdefghijklmnopqrstuv\"\twx"}
{"a": "abcdefghijklmnopqrstuv\"\twxy"}
{"a": "abcdefghijklmnopqrstuv\"\twxyz"}
{"a": "abcdefghijklmnopqrstuv\"\twxyz0"}
{"a": "abcdefghijklmnopqrstuv\"\twxyz01"}
{"a": "abcdefghijklmnopqrstuv\"\twxyz012"}
{"a": "abcdefghijklmnopqrstuv\"\twxyz0123"}
{"a": "abcdefghijklmnopqrstuv\"\twxyz01234"}
{"a": "abcdefghijklmnopqrstuv\"\twxyz012345"}
{"a": "abcdefghijklmnopqrstuv\"\twxyz0123456"}
{"a": "abcdefghijklmnopqrstuv\"\twxyz01234567"}
{"a": "abcdefghijklmnopqrstuv\"\twxyz012345678"}
(38 rows)
-- see json_encoding test for input with unicode escapes
-- Numbers.
SELECT '1'::json; -- OK

@@ -12,6 +12,13 @@ SELECT '"\v"'::json; -- ERROR, not a valid JSON escape
SELECT ('"'||repeat('.', 12)||'abc"')::json; -- OK
SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
-- Test various lengths of strings to validate SIMD processing to escape
-- special chars in the JSON.
SELECT row_to_json(j)::jsonb FROM (
SELECT left(E'abcdefghijklmnopqrstuv"\twxyz012345678', a) AS a
FROM generate_series(0,37) a
) j;
-- see json_encoding test for input with unicode escapes
-- Numbers.