Fix parsing of double parentheses in compound assignments

Problem 1: $ ksh -c 'typeset -Ca arr=((a=ah b=beh c=si))' ksh: syntax error at line 1: `((' unexpected $ ksh -c 'got=$(typeset -Ca arr=((a=ah b=beh c=si)))' ksh: syntax error at line 1: `((' unexpected Problem 2: $ ksh -c 'typeset -a arr=((a b c) 1)' (no output; OK) $ ksh -c 'got=$(typeset -a arr=((a b c) 1)' ksh: syntax error at line 1: `(' unmatched (sh_lex() and comsub() go into a mutual recursion loop here, eventually breaking it because lexd.dolparen wraps around) Root cause for problem 1: sh_lex() fails to take compound assignments into account so that it throws a syntax error on finding ((...)), misinterpreting it as EXPRSYM, which is used for arithmetic commands and expansions. Root cause for problem 2: same as problem 1, plus: at parse time, command substitutions and arithmetic expansions are read without parsing, using lexical analysis only. This is okay for arithmetic expansions, but not for command substitutions: the lexer really needs to know when we're entering a compound assignment. The parser flags this up via comp_assign, but not if we're not using it. We can cleanly fix problem 1 because the parser (parse.c) sets the comp_assign flag to tell the lexer when we're processing a compound assignment. But problem 2 is much harder. The fundamental issue is that the shell language's basic design is so messy that a clean separation between lexical analysis and parsing is not possible; the parser must influence the lexer for both to function properly. But command substitutions currently avoid the parser at parse time, so this bug can only be masked with a workaround. Correctness will not be possible until we completely redesign the way in which the lexer and parser handle command substitutions. src/cmd/ksh93/sh/lex.c: - comsub(): * This misleadingly named function is also used for reading arithmetic expansions. Set a new lexd.dolparen_arithexp flag when it is doing so (save/restore it to make recursion work). * Add a workaround that tries to detect compound assignments without parsing: whenever we encounter =( ... ), assume it is one. This is not correct, but will hopefully do for now... . Upon finding LPAREN=='(', check if it is preceded by '=' and set a new dolparen_eqparen flag to the current level of parentheses if so. . Upon finding LPAREN==')', reset that flag if we're moving below that level. - sh_lex(): * When detecting a double opening parenthesis, avoid misdetecting EXPRSYM (the symbol used for arithmetic commands and expansions) by returning early if the parser has set comp_assign. * For the case when we're lexing from comsub() and the parser is not in use, use the dolparen_eqparen flag instead. For the outer instance of =( ... ) that flag will not be set yet, so to deal correctly with nested compound assignments and ((...)) within compound assignments, we must check both for that flag and direclty for a preceding '='. * Minor refactoring of << (IODOCSYM) detection for legibility. Thanks to @hyenias for the bug report and for all the testing, and to @atheik for help in putting us on the trail to a fix. Resolves: #269
ksh93 · Apr 5, 2023 · 1731f66 · 1731f66
1 parent 0ea7d1a
commit 1731f66
Show file tree

Hide file tree

Showing 6 changed files with 91 additions and 12 deletions.
diff --git a/NEWS b/NEWS
@@ -2,6 +2,13 @@ This documents significant changes in the 1.0 branch of ksh 93u+m.
 For full details, see the git log at: https://github.com/ksh93/ksh/tree/1.0
 Uppercase BUG_* IDs are shell bug IDs as used by the Modernish shell library.
 
+2023-04-05:
+
+- Fixed a spurious syntax error in compound assignments upon encountering a
+  pair of repeated opening parentheses '(('. This bug behaved differently
+  depending on whether the compound assignment was outside or within a command
+  substitution of the form $(...) or ${ ...; }. Both cases are now fixed.
+
 2023-04-03:
 
 - Fixed multiple crashing bugs in discipline functions invoked from non-forked

diff --git a/src/cmd/ksh93/include/shlex.h b/src/cmd/ksh93/include/shlex.h
@@ -45,7 +45,9 @@ struct _shlex_pvt_lexdata_
 {
 	char		nocopy;
 	char		paren;
-	char		dolparen;
+	char		dolparen;	/* set during the comsub() lexical analysis hack */
+	unsigned short	dolparen_eqparen;	/* flags up =( ... ) within a comsub */
+	char		dolparen_arithexp;	/* set while comsub() is lexing an arithmetic expansion */
 	char		nest;
 	char		docword;
 	char		nested_tilde;

diff --git a/src/cmd/ksh93/include/version.h b/src/cmd/ksh93/include/version.h
@@ -18,7 +18,7 @@
 
 #define SH_RELEASE_FORK	"93u+m"		/* only change if you develop a new ksh93 fork */
 #define SH_RELEASE_SVER	"1.0.5-beta"	/* semantic version number: https://semver.org */
-#define SH_RELEASE_DATE	"2023-04-03"	/* must be in this format for $((.sh.version)) */
+#define SH_RELEASE_DATE	"2023-04-05"	/* must be in this format for $((.sh.version)) */
 #define SH_RELEASE_CPYR	"(c) 2020-2023 Contributors to ksh " SH_RELEASE_FORK
 
 /* Scripts sometimes field-split ${.sh.version}, so don't change amount of whitespace. */

diff --git a/src/cmd/ksh93/sh/lex.c b/src/cmd/ksh93/sh/lex.c
@@ -476,20 +476,29 @@ int sh_lex(Lex_t* lp)
 				{
 					if(n==c)
 					{
-						if(c=='<')
-							lp->lexd.docword=1;
-						else if(n==LPAREN)
+						if(c==LPAREN)
 						{
-							if(lp->lex.intest)
-								return c;
-							/* '((' arithmetic command */
+							/* Avoid misdetecting EXPRSYM in [[ ... ]] or compound assignments */
+							if(lp->lex.intest || lp->comp_assign)
+								return lp->token=c;
+							/* The comsub() reading hack avoids the parser, so comp_assign is never
+							 * set; try to detect compound assignments with this workaround instead */
+							if(lp->lexd.dolparen && !lp->lexd.dolparen_arithexp
+							&& (fcpeek(-2)=='=' || lp->lexd.dolparen_eqparen))
+								return lp->token=c;
+							/* OK, maybe this is EXPRSYM (arith '((', possibly following '$').
+							 * But this cannot be concluded until a final '))' is detected.
+							 * Use a recursive lexer invocation for that. */
 							lp->lexd.nest=1;
 							lp->lastline = sh.inlineno;
 							lp->lexd.lex_state = ST_NESTED;
 							fcseek(1);
 							return sh_lex(lp);
 						}
-						c  |= SYMREP;
+						c |= SYMREP;
+						/* Here document redirection operator '<<' */
+						if(c==IODOCSYM)
+							lp->lexd.docword = 1;
 					}
 					else if(c=='(' || c==')')
 						return lp->token=c;
@@ -667,7 +676,7 @@ int sh_lex(Lex_t* lp)
 				if(mode==ST_BEGIN)
 				{
 				do_reg:
-					/* skip new-line joining if not called from comsub() */
+					/* skip new-line joining if called from comsub() */
 					if(c=='\\' && fcpeek(0)=='\n' && !lp->lexd.dolparen)
 					{
 						sh.inlineno++;
@@ -1509,19 +1518,43 @@ int sh_lex(Lex_t* lp)
 
 /*
  * read to end of command substitution
- * of the form $(...)
+ * of the form $(...) or ${ ...;}
+ * or arithmetic expansion $((...))
+ *
+ * Ugly hack alert: At parse time, command substitutions and arithmetic expansions are read
+ * without parsing, using lexical analysis only. This is only to determine their length, so
+ * that their literal source text can be stored in the parse tree. They are then actually
+ * parsed at runtime (!) each time they are executed (!) via comsubst() in macro.c.
+ *
+ * This approach is okay for arithmetic expansions, but for command substitutions it is an
+ * unreliable hack. The lexer does not have real shell grammar knowledge; that's what the
+ * parser is for. However, a clean separation between lexical analysis and parsing is not
+ * possible, because the design of the shell language is fundamentally messy. So we need the
+ * parser to set the some flags in the lexer at the appropriate times to avoid spurious
+ * syntax errors (these are the non-private Lex_t struct members). But the parser obviously
+ * cannot do this if we're not using it.
+ *
+ * The comsub() hack below, along with all the dolparen checks in the lexer, tries to work
+ * around this fundamental problem as best we can to make it work in all but corner cases.
+ * It sets the lexd.dolparen, lexd.dolparen_eqparen and lexd.dolparen_arithexp flags for the
+ * rest of the lexer code to execute lots of workarounds.
+ *
+ * TODO: to achieve correctness, actually parse command substitutions at parse time.
  */
 static int comsub(Lex_t *lp, int endtok)
 {
-	int n,c,count=1;
+	int n,c;
+	unsigned short count=1;
 	int line=sh.inlineno;
 	struct ionod *inheredoc = lp->heredoc;
+	char save_arithexp = lp->lexd.dolparen_arithexp;
 	char *first,*cp=fcseek(0),word[5];
 	int off, messages=0, assignok=lp->assignok, csub;
 	struct _shlex_pvt_lexstate_ save = lp->lex;
 	csub = lp->comsub;
 	sh_lexopen(lp,1);
 	lp->lexd.dolparen++;
+	lp->lexd.dolparen_arithexp = endtok==LPAREN && fcpeek(1)==LPAREN;  /* $(( */
 	lp->lex.incase=0;
 	pushlevel(lp,0,0);
 	lp->comsub = (endtok==LBRACE);
@@ -1601,10 +1634,16 @@ static int comsub(Lex_t *lp, int endtok)
 				break;
 			    case IPROCSYM:	case OPROCSYM:
 			    case LPAREN:
+				/* lexd.dolparen_eqparen flags up "=(": we presume it's a compound assignment.
+				 * This is a workaround for <https://github.com/ksh93/ksh/issues/269>. */
+				if(!lp->lexd.dolparen_eqparen && fcpeek(-2)=='=')
+					lp->lexd.dolparen_eqparen = count;
 				if(endtok==LPAREN && !lp->lex.incase)
 					count++;
 				break;
 			    case RPAREN:
+				if(lp->lexd.dolparen_eqparen >= count)
+					lp->lexd.dolparen_eqparen = 0;
 				if(lp->lex.incase)
 					lp->lex.incase=0;
 				else if(endtok==LPAREN && --count<=0)
@@ -1646,6 +1685,7 @@ static int comsub(Lex_t *lp, int endtok)
 	lp->comsub = csub;
 	lp->lastline = line;
 	lp->lexd.dolparen--;
+	lp->lexd.dolparen_arithexp = save_arithexp;
 	lp->lex = save;
 	lp->assignok = (endchar(lp)==RBRACT?assignok:0);
 	if(lp->heredoc && !inheredoc)

diff --git a/src/cmd/ksh93/sh/macro.c b/src/cmd/ksh93/sh/macro.c
@@ -2110,6 +2110,7 @@ static int varsub(Mac_t *mp)
 
 /*
  * This routine handles command substitution
+ * and arithmetic expansion.
  * <type> is 0 for older `...` version
  * 1 for $(...) or 2 for ${ subshare; }
  */

diff --git a/src/cmd/ksh93/tests/basic.sh b/src/cmd/ksh93/tests/basic.sh
@@ -990,5 +990,34 @@ do
 	|| err_exit "last command in script exec-optimized in spite of $sig trap ($pid1 == $pid2)"
 done
 
+# ======
+# Nested compound assignment misparsed in $(...) or ${ ...; } command substitution
+# https://github.com/ksh93/ksh/issues/269
+# TODO: a few tests below crash when actually executed; test lexing only by using noexec. https://github.com/ksh93/ksh/issues/621
+for testcode in \
+	': $( typeset -a arr=((a b c) 1) )' \
+	': ${ typeset -a arr=((a b c) 1); }' \
+	': $( typeset -a arr=( ( ((a b c)1))) )' \
+	': ${ typeset -a arr=( ( ((a b c)1))); }' \
+	': $(( 1 << 2 ))' \
+	': $(: $(( 1 << 2 )) )' \
+	': $( (( 1 << 2 )) )' \
+	': $( : $( (( 1 << 2 )) ) )' \
+	': $( (( $( (( 1 << 2 )); echo 1 ) << 2 )) )' \
+	': $( typeset -a arr=((a $(( 1 << 2 )) c) 1) )' \
+	'typeset -Ca arr=((a=ah b=beh c=si))' \
+	': $( typeset -Ca arr=((a=ah b=beh c=si)) )' \
+	'r=${ typeset -Ca arr=((a=ah b=beh c=si)); }' \
+	'set --noexec; : $( typeset -a arr=((a $(( $( typeset -a barr=((a $(( 1 << 2 )) c) 1); echo 1 ) << $( typeset -a bazz=((a $(( 1 << 2 )) c) 1); echo 2 ) )) c) 1) )' \
+	'set --noexec; r=$(typeset -C arr=( (a=ah b=beh c=si) 1 (e f g)));'
+do
+	# fork comsub with 'ulimit' on old ksh to avoid a fixed lexer bug crashing the entire test script
+	got=$(let ".sh.version >= 20211209" || ulimit -c 0
+		eval "set +x; $testcode" 2>&1) \
+	|| err_exit "comsub/arithexp lexing test $(printf %q "$testcode"):" \
+		"got status $? and $(printf %q "$got")"
+done
+unset testcode
+
 # ======
 exit $((Errors<125?Errors:125))