From f37ac613a835c8ff28a2f23abe14c88fbac8b039 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 5 Sep 2024 12:42:33 -0400 Subject: [PATCH] Prevent mis-encoding of "trailing junk after numeric literal" errors. Since commit 2549f0661, we reject an identifier immediately following a numeric literal (without separating whitespace), because that risks ambiguity with hex/octal/binary integers. However, that patch used token patterns like "{integer}{ident_start}", which is problematic because {ident_start} matches only a single byte. If the first character after the integer is a multibyte character, this ends up with flex reporting an error message that includes a partial multibyte character. That can cause assorted bad-encoding problems downstream, both in the report to the client and in the postmaster log file. To fix, use {identifier} not {ident_start} in the "junk" token patterns, so that they will match complete multibyte characters. This seems like a generally better user experience, quite aside from the encoding problem: for "123abc" the error message will now say that the error appeared at or near "123abc" instead of "123a". While at it, add some commentary about why these patterns exist and how they work. Report and patch by Karina Litskevich; review by Pavel Borisov. Back-patch to v15 where the problem came in. Discussion: https://postgr.es/m/CACiT8iZ_diop=0zJ7zuY3BXegJpkKK1Av-PU7xh0EDYHsa5+=g@mail.gmail.com --- src/backend/parser/scan.l | 8 ++++---- src/fe_utils/psqlscan.l | 8 ++++---- src/interfaces/ecpg/preproc/pgc.l | 8 ++++---- src/test/regress/expected/numerology.out | 6 +++--- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index 882e081aae..1638ae834c 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -398,12 +398,12 @@ decimalfail {digit}+\.\. 
real ({integer}|{decimal})[Ee][-+]?{digit}+ realfail ({integer}|{decimal})[Ee][-+] -integer_junk {integer}{ident_start} -decimal_junk {decimal}{ident_start} -real_junk {real}{ident_start} +integer_junk {integer}{identifier} +decimal_junk {decimal}{identifier} +real_junk {real}{identifier} param \${integer} -param_junk \${integer}{ident_start} +param_junk \${integer}{identifier} other . diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l index ae531ec240..6c603f6e7a 100644 --- a/src/fe_utils/psqlscan.l +++ b/src/fe_utils/psqlscan.l @@ -336,12 +336,12 @@ decimalfail {digit}+\.\. real ({integer}|{decimal})[Ee][-+]?{digit}+ realfail ({integer}|{decimal})[Ee][-+] -integer_junk {integer}{ident_start} -decimal_junk {decimal}{ident_start} -real_junk {real}{ident_start} +integer_junk {integer}{identifier} +decimal_junk {decimal}{identifier} +real_junk {real}{identifier} param \${integer} -param_junk \${integer}{ident_start} +param_junk \${integer}{identifier} /* psql-specific: characters allowed in variable names */ variable_char [A-Za-z\200-\377_0-9] diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l index c06b2d6f51..3b16c70bce 100644 --- a/src/interfaces/ecpg/preproc/pgc.l +++ b/src/interfaces/ecpg/preproc/pgc.l @@ -369,12 +369,12 @@ decimalfail {digit}+\.\. 
real ({integer}|{decimal})[Ee][-+]?{digit}+ realfail ({integer}|{decimal})[Ee][-+] -integer_junk {integer}{ident_start} -decimal_junk {decimal}{ident_start} -real_junk {real}{ident_start} +integer_junk {integer}{identifier} +decimal_junk {decimal}{identifier} +real_junk {real}{identifier} param \${integer} -param_junk \${integer}{ident_start} +param_junk \${integer}{identifier} /* special characters for other dbms */ /* we have to react differently in compat mode */ diff --git a/src/test/regress/expected/numerology.out b/src/test/regress/expected/numerology.out index 77d4843417..c369e5ff73 100644 --- a/src/test/regress/expected/numerology.out +++ b/src/test/regress/expected/numerology.out @@ -6,15 +6,15 @@ -- Trailing junk in numeric literals -- SELECT 123abc; -ERROR: trailing junk after numeric literal at or near "123a" +ERROR: trailing junk after numeric literal at or near "123abc" LINE 1: SELECT 123abc; ^ SELECT 0x0o; -ERROR: trailing junk after numeric literal at or near "0x" +ERROR: trailing junk after numeric literal at or near "0x0o" LINE 1: SELECT 0x0o; ^ SELECT 1_2_3; -ERROR: trailing junk after numeric literal at or near "1_" +ERROR: trailing junk after numeric literal at or near "1_2_3" LINE 1: SELECT 1_2_3; ^ SELECT 0.a;