PR/54424: Martijn Dekker: awk: broken character classes in UTF-8 locale: only the first matches.

Pick up some of the fixes from upstream:
- POSIX paren matching
- print \v \a
- some more fatal handling
- init all the character range
2019-08-01 06:22:52 +00:00 · 2019-08-01 06:22:52 +00:00 · ca889033bb
commit ca889033bb
parent a533950c00
1 changed file with 28 additions and 3 deletions
--- a/external/historical/nawk/dist/b.c
+++ b/external/historical/nawk/dist/b.c
@@ -31,6 +31,7 @@ THIS SOFTWARE.
 #define	DEBUG

 #include <ctype.h>
+#include <limits.h>
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
@@ -333,6 +334,10 @@ int quoted(const uschar **pp)	/* pick up next thing after a \\ */
 		c = '\r';
 	else if (c == 'b')
 		c = '\b';
+	else if (c == 'v')
+		c = '\v';
+	else if (c == 'a')
+		c = '\a';
 	else if (c == '\\')
 		c = '\\';
 	else if (c == 'x') {	/* hexadecimal goo follows */
@@ -978,6 +983,7 @@ static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
 		if (firstnum < 2) {
 			/* 0 or 1: should be handled before you get here */
+			FATAL("internal error");
 		} else {
 			return replace_repeat(reptok, reptoklen, atom, atomlen,
 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
@@ -998,6 +1004,7 @@ static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
 		return replace_repeat(reptok, reptoklen, atom, atomlen,
 					firstnum, secondnum, REPEAT_WITH_Q);
 	} else {	/* Error - shouldn't be here (n>m) */
+		FATAL("internal error");
 	}
 	return 0;
 }
@@ -1013,6 +1020,7 @@ int relex(void)		/* lexical analyzer for reparse */
 	int i;
 	int num, m, commafound, digitfound;
 	const uschar *startreptok;
+	static int parens = 0;

 rescan:
 	starttok = prestr;
@@ -1026,9 +1034,18 @@ rescan:
 	case '\0': prestr--; return '\0';
 	case '^':
 	case '$':
-	case '(':
-	case ')':
 		return c;
+	case '(':
+		parens++;
+ 		return c;
+	case ')':
+		if (parens) {
+			parens--;
+			return c;
+		}
+		/* unmatched close parenthesis; per POSIX, treat as literal */
+		rlxval = c;
+		return CHAR;
 	case '\\':
 		rlxval = quoted(&prestr);
 		return CHAR;
@@ -1064,7 +1081,15 @@ rescan:
 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
 				    prestr[2 + cc->cc_namelen] == ']') {
 					prestr += cc->cc_namelen + 3;
-					for (i = 1; i < NCHARS; i++) {
+					/*
+					 * BUG: We begin at 1, instead of 0, since we
+					 * would otherwise prematurely terminate the
+					 * string for classes like [[:cntrl:]]. This
+					 * means that we can't match the NUL character,
+					 * not without first adapting the entire
+					 * program to track each string's length.
+					 */
+					for (i = 1; i <= UCHAR_MAX; i++) {
 						if (!adjbuf(&buf, &bufsz, bp-buf+1, 100, &bp, "relex2"))
 						    FATAL("out of space for reg expr %.10s...", lastre);
 						if (cc->cc_func(i)) {