recursive parsing for double quotes, subshells, and variable expansions

2025-04-15 20:17:15 +02:00 · 2022-11-16 00:37:22 -08:00 · 2022-11-16 00:37:22 -08:00 · f1958eaac7
commit f1958eaac7
parent d44242fe71
3 changed files with 274 additions and 38 deletions
--- a/pkg/shparse/shparse.go
+++ b/pkg/shparse/shparse.go
@@ -81,6 +81,8 @@ const (
 	WordTypeOp        = "op"   // single: & ; | ( ) < > \n  multi(2): && || ;; << >> <& >& <> >| ((  multi(3): <<-    ('((' requires special processing)
 	WordTypeKey       = "key"  // if then else elif fi do done case esac while until for in { } ! (( [[
 	WordTypeSimpleVar = "svar" // simplevar $
+	WordTypeGroup     = "grp"  // contains other words e.g. "hello"foo'bar'$x
+	WordTypeArith     = "ath"

 	// each of these can also be used as an entry in quoteContext
 	WordTypeDQ       = "dq"   // "
@@ -129,11 +131,10 @@ type wordType struct {
 }

 func (c *parseContext) clone(pos int, newQuote string) *parseContext {
-	rtn := *c
+	rtn := parseContext{Input: c.Input[pos:], QC: c.QC}
 	if newQuote != "" {
-		rtn.QC = append(rtn.QC, newQuote)
+		rtn.QC = rtn.QC.push(newQuote)
 	}
-	rtn.Input = rtn.Input[pos:]
 	return &rtn
 }

@@ -264,12 +265,15 @@ func (c *parseContext) parseStrDQ() *wordType {
 	if !c.match('"') {
 		return nil
 	}
-	newOffset, complete := c.skipToChar(1, '"', false)
+	newContext := c.clone(c.Pos+1, WordTypeDQ)
+	subWords, eofExit := newContext.tokenizeDQ()
+	newOffset := newContext.Pos + 1
 	w := &wordType{
 		Type:     WordTypeDQ,
 		Offset:   c.Pos,
 		Raw:      c.Input[c.Pos : c.Pos+newOffset],
-		Complete: complete,
+		Complete: !eofExit,
+		Subs:     subWords,
 	}
 	c.Pos = c.Pos + newOffset
 	return w
@@ -320,6 +324,19 @@ func (c *parseContext) parseStrDDQ() *wordType {
 	return w
 }

+func (c *parseContext) parseArith(mustComplete bool) *wordType {
+	if !c.match2('(', '(') {
+		return nil
+	}
+	newOffset, complete := c.skipToChar2(2, ')', ')', false)
+	if mustComplete && !complete {
+		return nil
+	}
+	w := &wordType{Type: WordTypeArith, Offset: c.Pos, Raw: c.Input[c.Pos : c.Pos+newOffset], Complete: complete}
+	c.Pos = c.Pos + newOffset
+	return w
+}
+
 func (c *parseContext) parseExpansion() *wordType {
 	if !c.match('$') {
 		return nil
@@ -332,8 +349,12 @@ func (c *parseContext) parseExpansion() *wordType {
 	}
 	if c.match2('$', '(') {
 		// subshell
-		newOffset, complete := c.skipToChar(2, ')', false)
-		w := &wordType{Type: WordTypeDP, Offset: c.Pos, Raw: c.Input[c.Pos : c.Pos+newOffset], Complete: complete}
+		newContext := c.clone(c.Pos+2, WordTypeDP)
+		subWords, eofExit := newContext.tokenizeRaw()
+		newOffset := newContext.Pos + 2
+		// newOffset, complete := c.skipToChar(2, ')', false)
+		w := &wordType{Type: WordTypeDP, Offset: c.Pos, Raw: c.Input[c.Pos : c.Pos+newOffset], Complete: !eofExit}
+		w.Subs = subWords
 		c.Pos = c.Pos + newOffset
 		return w
 	}
@@ -346,8 +367,10 @@ func (c *parseContext) parseExpansion() *wordType {
 	}
 	if c.match2('$', '{') {
 		// variable expansion
-		newOffset, complete := c.skipToChar(2, '}', false)
-		w := &wordType{Type: WordTypeVarBrace, Offset: c.Pos, Raw: c.Input[c.Pos : c.Pos+newOffset], Complete: complete}
+		newContext := c.clone(c.Pos+2, WordTypeVarBrace)
+		_, eofExit := newContext.tokenizeVarBrace()
+		newOffset := newContext.Pos + 2
+		w := &wordType{Type: WordTypeVarBrace, Offset: c.Pos, Raw: c.Input[c.Pos : c.Pos+newOffset], Complete: !eofExit}
 		c.Pos = c.Pos + newOffset
 		return w
 	}
@@ -520,7 +543,7 @@ func (w *wordType) String() string {
 	if !w.Complete {
 		notCompleteFlag = "*"
 	}
-	return fmt.Sprintf("%4s[%3d]%s %s%q", w.Type, w.Offset, notCompleteFlag, makeSpaceStr(len(w.Prefix)), string(w.Raw))
+	return fmt.Sprintf("%4s[%3d]%s %s%q", w.Type, w.Offset, notCompleteFlag, makeSpaceStr(len(w.Prefix)), string(w.FullRawString()))
 }

 func dumpWords(words []*wordType, indentStr string) {
--- a/pkg/shparse/shparse_test.go
+++ b/pkg/shparse/shparse_test.go
@@ -35,4 +35,12 @@ func Test1(t *testing.T) {
 	testParse(t, `ls ${x:"hello"} $[2+2] $((5 * 10)) $(ls; ls&)`)
 	testParse(t, `ls;ls&./foo > out 2> "out2"`)
 	testParse(t, `(( x = 5)); ls& cd ~/work/"hello again"`)
+	testParse(t, `echo "hello"abc$(ls)$x${y:foo}`)
+	testParse(t, `echo $(ls; ./x "foo")`)
+	testParse(t, `echo $(ls; (cd foo; ls); (cd bar; ls))xyz`)
+	testParse(t, `echo "$x ${y:-foo}"`)
+	testParse(t, `command="$(echo "$input" | sed -e "s/^[ \t]*\([^ \t]*\)[ \t]*.*$/\1/g")"`)
+	testParse(t, `echo $(ls $)`)
+	testParse(t, `echo ${x:-hello\}"}"} 2nd`)
+	testParse(t, `echo "$(ls "foo") more $x"`)
 }
--- a/pkg/shparse/tokenize.go
+++ b/pkg/shparse/tokenize.go
@@ -2,6 +2,7 @@ package shparse

 import (
 	"bytes"
+	"fmt"
 	"unicode"
 )

@@ -16,7 +17,8 @@ type tokenizeOutputState struct {
 	SavedPrefix []rune
 }

-func (state *tokenizeOutputState) appendWord(word *wordType) {
+// does not set CurWord
+func (state *tokenizeOutputState) appendStandaloneWord(word *wordType) {
 	state.delimitCurWord()
 	if len(state.SavedPrefix) > 0 {
 		word.Prefix = state.SavedPrefix
@@ -25,12 +27,72 @@ func (state *tokenizeOutputState) appendWord(word *wordType) {
 	state.Rtn = append(state.Rtn, word)
 }

-func (state *tokenizeOutputState) ensureCurWord(pc *parseContext) {
-	if state.CurWord != nil {
+func (state *tokenizeOutputState) appendWord(word *wordType) {
+	if len(state.SavedPrefix) > 0 {
+		word.Prefix = state.SavedPrefix
+		state.SavedPrefix = nil
+	}
+	if state.CurWord == nil {
+		state.CurWord = word
 		return
 	}
-	state.CurWord = &wordType{Type: WordTypeLit, Offset: pc.Pos, Complete: true, Prefix: state.SavedPrefix}
-	state.SavedPrefix = nil
+	state.ensureGroupWord()
+	state.CurWord.Subs = append(state.CurWord.Subs, word)
+}
+
+func (state *tokenizeOutputState) ensureGroupWord() {
+	if state.CurWord == nil {
+		panic("invalid state, cannot make group word when CurWord is nil")
+	}
+	if state.CurWord.Type == WordTypeGroup {
+		return
+	}
+	// moves the prefix from CurWord to the new group word
+	groupWord := &wordType{
+		Type:     WordTypeGroup,
+		Offset:   state.CurWord.Offset,
+		Complete: true,
+		Prefix:   state.CurWord.Prefix,
+	}
+	state.CurWord.Prefix = nil
+	groupWord.Subs = []*wordType{state.CurWord}
+	state.CurWord = groupWord
+}
+
+func ungroupWord(w *wordType) []*wordType {
+	if w.Type != WordTypeGroup {
+		return []*wordType{w}
+	}
+	rtn := w.Subs
+	if len(w.Prefix) > 0 && len(rtn) > 0 {
+		newPrefix := append([]rune{}, w.Prefix...)
+		newPrefix = append(newPrefix, rtn[0].Prefix...)
+		rtn[0].Prefix = newPrefix
+	}
+	return rtn
+}
+
+func (state *tokenizeOutputState) ensureLitCurWord(pc *parseContext) {
+	if state.CurWord == nil {
+		state.CurWord = &wordType{Type: WordTypeLit, Offset: pc.Pos, Complete: true, Prefix: state.SavedPrefix}
+		state.SavedPrefix = nil
+		return
+	}
+	if state.CurWord.Type == WordTypeLit {
+		return
+	}
+	state.ensureGroupWord()
+	lastWord := state.CurWord.Subs[len(state.CurWord.Subs)-1]
+	if lastWord.Type != WordTypeLit {
+		if len(state.SavedPrefix) > 0 {
+			dumpWords(state.Rtn, "**")
+			dumpWords([]*wordType{state.CurWord}, ">>")
+			fmt.Printf("sp: %q\n", state.SavedPrefix)
+			panic("invalid state, there can be no saved prefix")
+		}
+		litWord := &wordType{Type: WordTypeLit, Offset: pc.Pos, Complete: true}
+		state.CurWord.Subs = append(state.CurWord.Subs, litWord)
+	}
 }

 func (state *tokenizeOutputState) delimitCurWord() {
@@ -45,40 +107,51 @@ func (state *tokenizeOutputState) delimitWithSpace(spaceCh rune) {
 	state.SavedPrefix = append(state.SavedPrefix, spaceCh)
 }

+func (state *tokenizeOutputState) appendLiteral(pc *parseContext, ch rune) {
+	state.ensureLitCurWord(pc)
+	if state.CurWord.Type == WordTypeLit {
+		state.CurWord.Raw = append(state.CurWord.Raw, ch)
+	} else if state.CurWord.Type == WordTypeGroup {
+		lastWord := state.CurWord.Subs[len(state.CurWord.Subs)-1]
+		if lastWord.Type != WordTypeLit {
+			panic(fmt.Sprintf("invalid curword type (group) %q", state.CurWord.Type))
+		}
+		lastWord.Raw = append(lastWord.Raw, ch)
+	} else {
+		panic(fmt.Sprintf("invalid curword type %q", state.CurWord.Type))
+	}
+}
+
 func (state *tokenizeOutputState) finish(pc *parseContext) {
 	state.delimitCurWord()
 	if len(state.SavedPrefix) > 0 {
-		state.ensureCurWord(pc)
+		state.ensureLitCurWord(pc)
 		state.delimitCurWord()
 	}
 }

-func Tokenize(cmd string) []*wordType {
-	c := &parseContext{Input: []rune(cmd)}
+func (c *parseContext) tokenizeVarBrace() ([]*wordType, bool) {
 	state := &tokenizeOutputState{}
+	eofExit := false
 	for {
 		ch := c.cur()
 		if ch == 0 {
+			eofExit = true
 			break
 		}
-		// fmt.Printf("ch %d %q\n", c.Pos, string([]rune{ch}))
-		foundOp, newOffset := c.parseOp(0)
-		if foundOp {
-			opWord := &wordType{Type: WordTypeOp, Offset: c.Pos, Raw: c.Input[c.Pos : c.Pos+newOffset], Complete: true}
-			opWord.Val = string(opWord.Raw)
-			c.Pos = c.Pos + newOffset
-			state.appendWord(opWord)
-			continue
+		if ch == '}' {
+			c.Pos++
+			break
 		}
 		var quoteWord *wordType
-		switch ch {
-		case '\'':
+		if ch == '\'' {
 			quoteWord = c.parseStrSQ()
-
-		case '"':
+		}
+		if quoteWord == nil && ch == '"' {
 			quoteWord = c.parseStrDQ()
-
-		case '$':
+		}
+		isNextBrace := c.at(1) == '}'
+		if quoteWord == nil && ch == '$' && !isNextBrace {
 			quoteWord = c.parseStrANSI()
 			if quoteWord == nil {
 				quoteWord = c.parseStrDDQ()
@@ -92,8 +165,124 @@ func Tokenize(cmd string) []*wordType {
 			continue
 		}
 		if ch == '\\' && c.at(1) != 0 {
-			state.ensureCurWord(c)
-			state.CurWord.Raw = append(state.CurWord.Raw, ch, c.at(1))
+			state.appendLiteral(c, ch)
+			state.appendLiteral(c, c.at(1))
+			c.Pos += 2
+			continue
+		}
+		state.appendLiteral(c, ch)
+		c.Pos++
+	}
+	return state.Rtn, eofExit
+}
+
+func (c *parseContext) tokenizeDQ() ([]*wordType, bool) {
+	state := &tokenizeOutputState{}
+	eofExit := false
+	for {
+		ch := c.cur()
+		if ch == 0 {
+			eofExit = true
+			break
+		}
+		if ch == '"' {
+			c.Pos++
+			break
+		}
+		if ch == '$' && c.at(1) != 0 {
+			quoteWord := c.parseStrANSI()
+			if quoteWord == nil {
+				quoteWord = c.parseStrDDQ()
+			}
+			if quoteWord == nil {
+				quoteWord = c.parseExpansion()
+			}
+			if quoteWord != nil {
+				state.appendWord(quoteWord)
+				continue
+			}
+		}
+		if ch == '\\' && c.at(1) != 0 {
+			state.appendLiteral(c, ch)
+			state.appendLiteral(c, c.at(1))
+			c.Pos += 2
+			continue
+		}
+		state.appendLiteral(c, ch)
+		c.Pos++
+	}
+	state.finish(c)
+	if len(state.Rtn) == 0 {
+		return nil, eofExit
+	}
+	if len(state.Rtn) == 1 && state.Rtn[0].Type == WordTypeGroup {
+		return ungroupWord(state.Rtn[0]), eofExit
+	}
+	return state.Rtn, eofExit
+}
+
+// returns (words, eofexit)
+func (c *parseContext) tokenizeRaw() ([]*wordType, bool) {
+	state := &tokenizeOutputState{}
+	isExpSubShell := c.QC.cur() == WordTypeDP
+	parenLevel := 0
+	eofExit := false
+	for {
+		ch := c.cur()
+		if ch == 0 {
+			eofExit = true
+			break
+		}
+		if isExpSubShell && ch == ')' && parenLevel == 0 {
+			c.Pos++
+			break
+		}
+		// fmt.Printf("ch %d %q\n", c.Pos, string([]rune{ch}))
+		foundOp, newOffset := c.parseOp(0)
+		if foundOp {
+			rawOp := c.Input[c.Pos : c.Pos+newOffset]
+			opVal := string(rawOp)
+			opWord := &wordType{Type: WordTypeOp, Offset: c.Pos, Raw: rawOp, Val: opVal, Complete: true}
+			if opWord.Val == "(" {
+				arithWord := c.parseArith(true)
+				if arithWord != nil {
+					state.appendStandaloneWord(arithWord)
+					continue
+				} else {
+					parenLevel++
+				}
+			}
+			if opWord.Val == ")" {
+				parenLevel--
+			}
+			c.Pos = c.Pos + newOffset
+			state.appendStandaloneWord(opWord)
+			continue
+		}
+		var quoteWord *wordType
+		if ch == '\'' {
+			quoteWord = c.parseStrSQ()
+		}
+		if quoteWord == nil && ch == '"' {
+			quoteWord = c.parseStrDQ()
+		}
+		isNextParen := isExpSubShell && c.at(1) == ')'
+		if quoteWord == nil && ch == '$' && !isNextParen {
+			quoteWord = c.parseStrANSI()
+			if quoteWord == nil {
+				quoteWord = c.parseStrDDQ()
+			}
+			if quoteWord == nil {
+				quoteWord = c.parseExpansion()
+			}
+		}
+		if quoteWord != nil {
+			state.appendWord(quoteWord)
+			continue
+		}
+		if ch == '\\' && c.at(1) != 0 {
+			state.appendLiteral(c, ch)
+			state.appendLiteral(c, c.at(1))
 			c.Pos += 2
 			continue
 		}
@@ -102,12 +291,28 @@ func Tokenize(cmd string) []*wordType {
 			c.Pos++
 			continue
 		}
-		state.ensureCurWord(c)
-		state.CurWord.Raw = append(state.CurWord.Raw, ch)
+		state.appendLiteral(c, ch)
 		c.Pos++
 	}
 	state.finish(c)
-	return state.Rtn
+	return state.Rtn, eofExit
+}
+
+func Tokenize(cmd string) []*wordType {
+	c := &parseContext{Input: []rune(cmd)}
+	rtn, _ := c.tokenizeRaw()
+	return rtn
+}
+
+func (w *wordType) FullRawString() []rune {
+	if w.Type == WordTypeGroup {
+		var rtn []rune
+		for _, sw := range w.Subs {
+			rtn = append(rtn, sw.FullRawString()...)
+		}
+		return rtn
+	}
+	return w.Raw
 }

 func wordsToStr(words []*wordType) string {
@@ -116,7 +321,7 @@ func wordsToStr(words []*wordType) string {
 		if len(word.Prefix) > 0 {
 			buf.WriteString(string(word.Prefix))
 		}
-		buf.WriteString(string(word.Raw))
+		buf.WriteString(string(word.FullRawString()))
 	}
 	return buf.String()
 }