waveterm/pkg/shparse/shparse.go

package shparse

import (
	"fmt"
	"unicode"
)

//
// cmds := cmd (sep cmd)*
// sep := ';' | '&' | '&&' | '||' | '|' | '\n'
// cmd := simple-command | compound-command redirect-list?
// compound-command := brace-group | subshell | for-clause | case-clause | if-clause | while-clause | until-clause
// brace-group := '{' cmds '}'
// subshell := '(' cmds ')'
// simple-command := cmd-prefix cmd-word (io-redirect)*
// cmd-prefix := (io-redirect | assignment)*
// cmd-suffix := (io-redirect | word)*
// cmd-name := word
// cmd-word := word
// io-redirect := (io-number? io-file) | (io-number? io-here)
// io-file := ('<' | '<&' | '>' | '>&' | '>>' | '>|' ) filename
// io-here := ('<<' | '<<-') here-end
// here-end := word
// if-clause := 'if' compound-list 'then' compound-list else-part 'fi'
// else-part :=   'elif' compound-list 'then' compound-list
//              | 'elif' compound-list 'then' compound-list else-part
//              | 'else' compound-list
// compound-list := linebreak term sep?
//
//
//
// $var
// ${var}
// ${var op word?}
// op := '-' | '=' | '?' | '+' | ':-' | ':=' | ':?' | ':+' | '%' | '%%' | '#' | '##'
// ${ '#' var }
//
// $(command)
// `command`
// $(( arith ))
//
// " ... "
// ' ... '
// $' ... '
// $" ... "

// "  => $, ", `, \
// '  => '
// (process quotes)
// mark as escaped
// split into commands (use ';' as separator)
// parse special operators
// perform expansions (vars, globs, commands)
// split command into name and arguments

// A correctly-formed brace expansion must contain unquoted opening and closing braces, and at least one unquoted comma or a valid sequence expression
// Any incorrectly formed brace expansion is left unchanged.

// word: char *word; flags
// bash aliases are lexical

// [[, ((, $(( <- DQ

// $ -> expansion
// $(...)
// (...)
// $((...))
// ((...))
// ${...}
// {...}
// X=(...)

// ambiguity between $((...)) and $((ls); ls)
// ambiguity between foo=([0]=hell) and foo=([abc)

// tokenization https://pubs.opengroup.org/onlinepubs/7908799/xcu/chap2.html#tag_001_003

// Word type tags. A tag is stored in wordType.Type when a word is created,
// and the tags marked "(quote-context)" are also pushed onto the quoteContext
// when the parser descends into that construct.
const (
	WordTypeRaw       = "raw"
	WordTypeLit       = "lit"
	WordTypeOp        = "op"   // single: & ; | ( ) < > \n  multi(2): && || ;; << >> <& >& <> >| ((  multi(3): <<- <<<    ('((' requires special processing)
	WordTypeKey       = "key"  // if then else elif fi do done case esac while until for in { } ! (( [[
	WordTypeSimpleVar = "svar" // simplevar $
	WordTypeGroup     = "grp"  // contains other words e.g. "hello"foo'bar'$x

	WordTypeDQ       = "dq"   // "    (quote-context)
	WordTypeDDQ      = "ddq"  // $"   (quote-context)
	WordTypeVarBrace = "varb" // ${   (quote-context)
	WordTypeDP       = "dp"   // $(   (quote-context)
	WordTypeBQ       = "bq"   // `    (quote-context)

	WordTypeSQ  = "sq"  // '
	WordTypeDSQ = "dsq" // $'
	WordTypeDPP = "dpp" // $((   (internals not parsed)
	WordTypePP  = "pp"  // ((    (internals not parsed)
	WordTypeDB  = "db"  // $[    (internals not parsed)
)

// quoteContext is a stack of quote-context tags; the innermost (most
// recently entered) context is the last element.
type quoteContext []string

// push returns a new quoteContext consisting of qc followed by q. The result
// is always backed by a freshly allocated array, so contexts derived from the
// same parent never overwrite each other's entries.
func (qc quoteContext) push(q string) quoteContext {
	next := make(quoteContext, len(qc), len(qc)+1)
	copy(next, qc)
	return append(next, q)
}

// cur returns the innermost quote context, or "" when the stack is empty.
func (qc quoteContext) cur() string {
	if n := len(qc); n > 0 {
		return qc[n-1]
	}
	return ""
}

// parseContext is a cursor over the rune input being tokenized, together
// with the stack of quote contexts the cursor is currently nested in.
type parseContext struct {
	// Input is the rune slice being scanned.
	Input []rune
	// Pos is the current index into Input; the lookahead helpers take
	// offsets relative to it.
	Pos   int
	// QC is the quote-context stack in effect for this context.
	QC    quoteContext
}

// wordType is a single token ("word") produced by the parser.
type wordType struct {
	// Type is the word's tag (one of the WordType* constants).
	Type     string
	// Offset is the parseContext position at which the word started.
	Offset   int
	// QC is the quote-context stack that was active when the word was made.
	QC       quoteContext
	// Raw is the slice of the context's input covered by the word.
	Raw      []rune
	// Complete is false when the word's terminator was not found before
	// the end of input.
	Complete bool
	Val      string // only for Op and Key (does *not* store string values of quoted expressions or expansions)
	// Prefix is not populated in the code visible here; String() indents by
	// its length. NOTE(review): presumably leading text (e.g. whitespace)
	// preceding the word -- confirm against the tokenizer.
	Prefix   []rune
	// Subs holds the nested words of a container word (e.g. the contents of
	// a double-quoted string or a $(...) expansion).
	Subs     []*wordType
}

// clone builds a child context over c.Input[pos:] (pos is an absolute index
// into c.Input). The child's Pos starts at 0. It inherits c's quote-context
// stack, with newQuote pushed on top when newQuote is non-empty.
func (c *parseContext) clone(pos int, newQuote string) *parseContext {
	qc := c.QC
	if newQuote != "" {
		qc = qc.push(newQuote)
	}
	return &parseContext{Input: c.Input[pos:], QC: qc}
}

// at returns the rune located offset positions away from the cursor, or 0
// when that position falls outside the input.
func (c *parseContext) at(offset int) rune {
	idx := c.Pos + offset
	if idx >= 0 && idx < len(c.Input) {
		return c.Input[idx]
	}
	return 0
}

// eof reports whether the cursor has reached (or passed) the end of the input.
func (c *parseContext) eof() bool {
	return !(c.Pos < len(c.Input))
}

// cur returns the rune under the cursor, or 0 when the cursor is outside
// the input.
func (c *parseContext) cur() rune {
	if c.Pos < 0 || c.Pos >= len(c.Input) {
		return 0
	}
	return c.Input[c.Pos]
}

// match reports whether the rune under the cursor equals ch.
func (c *parseContext) match(ch rune) bool {
	return ch == c.cur()
}

// match2 reports whether the two runes starting at the cursor are ch, ch2.
func (c *parseContext) match2(ch rune, ch2 rune) bool {
	if c.at(0) != ch {
		return false
	}
	return c.at(1) == ch2
}

// match3 reports whether the three runes starting at the cursor are
// ch, ch2, ch3 in that order.
func (c *parseContext) match3(ch rune, ch2 rune, ch3 rune) bool {
	want := [3]rune{ch, ch2, ch3}
	for i, r := range want {
		if c.at(i) != r {
			return false
		}
	}
	return true
}

// makeWord creates a word of type t covering the next length runes of the
// input, records the current position and quote context on it, and advances
// the cursor past those runes.
func (c *parseContext) makeWord(t string, length int, complete bool) *wordType {
	start := c.Pos
	end := start + length
	word := &wordType{
		Type:     t,
		Offset:   start,
		QC:       c.QC,
		Raw:      c.Input[start:end],
		Complete: complete,
	}
	c.Pos = end
	return word
}

// parseOp checks whether a shell operator starts at offset (relative to the
// cursor). It returns (found, newOffset), where newOffset is the offset just
// past the operator; when no operator is found it returns (false, 0).
//
// shell_meta_chars "()<>;&|"
//
// Recognized operators:
//
//	1 rune:  ( ) < > ; & |
//	2 runes: >> && || ;; << <& >& <> >|
//	3 runes: <<- <<<
//
// "((" is deliberately returned as a single "(" here (it requires special
// processing).
// possible to maybe add ;;& &>> &> |& ;&
func (c *parseContext) parseOp(offset int) (bool, int) {
	ch := c.at(offset)
	switch ch {
	case '(', ')', '<', '>', ';', '&', '|':
	default:
		return false, 0
	}
	// ch2 is 0 at end of input, which matches none of the cases below and
	// therefore falls through to the single-rune result.
	ch2 := c.at(offset + 1)
	switch {
	case ch == '<' && ch2 == '<':
		if ch3 := c.at(offset + 2); ch3 == '-' || ch3 == '<' {
			return true, offset + 3 // "<<-" or "<<<"
		}
		return true, offset + 2 // "<<"
	case ch == '>' && (ch2 == '>' || ch2 == '&' || ch2 == '|'), // >> >& >|
		ch == '<' && (ch2 == '&' || ch2 == '>'), // <& <>
		ch == '&' && ch2 == '&',
		ch == '|' && ch2 == '|',
		ch == ';' && ch2 == ';':
		return true, offset + 2
	}
	return true, offset + 1
}

// skipToChar scans forward from offset (relative to the cursor) looking for
// endCh. It returns (new-offset, complete): on success new-offset is just
// past endCh and complete is true; if the input ends first, complete is
// false and new-offset is the end of input. When allowEsc is set, a
// backslash causes the following rune to be skipped without being compared.
func (c *parseContext) skipToChar(offset int, endCh rune, allowEsc bool) (int, bool) {
	for pos := offset; ; pos++ {
		switch ch := c.at(pos); {
		case ch == 0:
			return pos, false
		case allowEsc && ch == '\\':
			if c.at(pos+1) == 0 {
				// trailing backslash with nothing to escape
				return pos + 1, false
			}
			pos++ // together with the loop increment this skips both runes
		case ch == endCh:
			return pos + 1, true
		}
	}
}

// skipToChar2 scans forward from offset (relative to the cursor) looking for
// the two-rune terminator endCh endCh2. It returns (new-offset, complete):
// on success new-offset is just past the terminator and complete is true;
// if the input ends first, complete is false and new-offset is the end of
// input. When allowEsc is set, a backslash and the rune after it are skipped.
func (c *parseContext) skipToChar2(offset int, endCh rune, endCh2 rune, allowEsc bool) (int, bool) {
	for pos := offset; ; pos++ {
		first, second := c.at(pos), c.at(pos+1)
		switch {
		case first == 0:
			return pos, false
		case second == 0:
			// only one rune left: a two-rune terminator cannot fit
			return pos + 1, false
		case allowEsc && first == '\\':
			pos++ // together with the loop increment this skips both runes
		case first == endCh && second == endCh2:
			return pos + 2, true
		}
	}
}

// parseStrSQ parses a single-quoted string ('...') at the cursor. It returns
// nil when the cursor is not on a single quote. Backslash escapes are not
// honored inside the quotes; the word is marked incomplete when the closing
// quote is missing.
func (c *parseContext) parseStrSQ() *wordType {
	if c.cur() != '\'' {
		return nil
	}
	end, closed := c.skipToChar(1, '\'', false)
	return c.makeWord(WordTypeSQ, end, closed)
}

// parseStrDQ parses a double-quoted string ("...") at the cursor. It returns
// nil when the cursor is not on a double quote. The contents are tokenized by
// tokenizeDQ (defined elsewhere) on a child context that starts just after
// the opening quote; the resulting words become the word's Subs. The word's
// length is the child's final position plus the opening quote, and the word
// is marked incomplete when tokenizeDQ reports that it stopped at end of input.
func (c *parseContext) parseStrDQ() *wordType {
	const openLen = 1 // the opening '"'
	if !c.match('"') {
		return nil
	}
	inner := c.clone(c.Pos+openLen, WordTypeDQ)
	subs, hitEOF := inner.tokenizeDQ()
	word := c.makeWord(WordTypeDQ, inner.Pos+openLen, !hitEOF)
	word.Subs = subs
	return word
}

// parseStrDDQ parses a $"..." string at the cursor. It returns nil when the
// cursor is not on the two runes $". The contents are tokenized by
// tokenizeDQ (defined elsewhere) on a child context that starts just after
// the opener; the resulting words become the word's Subs. The word is marked
// incomplete when tokenizeDQ reports that it stopped at end of input.
func (c *parseContext) parseStrDDQ() *wordType {
	const openLen = 2 // the opening `$"`
	if !c.match2('$', '"') {
		return nil
	}
	inner := c.clone(c.Pos+openLen, WordTypeDDQ)
	subs, hitEOF := inner.tokenizeDQ()
	word := c.makeWord(WordTypeDDQ, inner.Pos+openLen, !hitEOF)
	word.Subs = subs
	return word
}

// parseStrBQ parses a backquoted command substitution (`...`) at the cursor.
// It returns nil when the cursor is not on a backquote. The contents are
// tokenized by tokenizeRaw (defined elsewhere) on a child context that starts
// just after the opening backquote; the resulting words become the word's
// Subs. The word is marked incomplete when tokenizeRaw reports that it
// stopped at end of input.
func (c *parseContext) parseStrBQ() *wordType {
	const openLen = 1 // the opening backquote
	if !c.match('`') {
		return nil
	}
	inner := c.clone(c.Pos+openLen, WordTypeBQ)
	subs, hitEOF := inner.tokenizeRaw()
	word := c.makeWord(WordTypeBQ, inner.Pos+openLen, !hitEOF)
	word.Subs = subs
	return word
}

// parseStrANSI parses a $'...' string at the cursor. It returns nil when the
// cursor is not on the two runes $'. Backslash escapes are honored while
// searching for the closing quote; the word is marked incomplete when the
// closing quote is missing.
func (c *parseContext) parseStrANSI() *wordType {
	if !c.match2('$', '\'') {
		return nil
	}
	end, closed := c.skipToChar(2, '\'', true)
	return c.makeWord(WordTypeDSQ, end, closed)
}

// parseArith parses a ((...)) construct at the cursor without looking at its
// internals: it simply scans for the first "))". It returns nil when the
// cursor is not on "((", and also when mustComplete is set and no closing
// "))" is found. Otherwise the word's Complete flag reflects whether the
// terminator was present.
func (c *parseContext) parseArith(mustComplete bool) *wordType {
	if !c.match2('(', '(') {
		return nil
	}
	end, closed := c.skipToChar2(2, ')', ')', false)
	if !closed && mustComplete {
		return nil
	}
	return c.makeWord(WordTypePP, end, closed)
}

// parseExpansion parses a '$' expansion at the cursor and returns the word
// for it, or nil when the cursor is not on '$' or the '$' does not start an
// expansion. The forms are tried in this order:
//
//	$((...))  arithmetic, internals not parsed (scans to the first "))")
//	$(...)    command substitution, contents tokenized by tokenizeRaw
//	$[...]    deprecated arithmetic, internals not parsed
//	${...}    braced variable, consumed by tokenizeVarBrace (sub-words discarded)
//	$name     simple variable name
//	$c        single-character parameter: * @ # ? - $ ! or a digit
//
// tokenizeRaw and tokenizeVarBrace are defined elsewhere in the package.
func (c *parseContext) parseExpansion() *wordType {
	if !c.match('$') {
		return nil
	}
	next := c.at(1)
	switch {
	case next == '(' && c.at(2) == '(':
		end, closed := c.skipToChar2(3, ')', ')', false)
		return c.makeWord(WordTypeDPP, end, closed)
	case next == '(':
		// subshell
		inner := c.clone(c.Pos+2, WordTypeDP)
		subs, hitEOF := inner.tokenizeRaw()
		word := c.makeWord(WordTypeDP, inner.Pos+2, !hitEOF)
		word.Subs = subs
		return word
	case next == '[':
		// deprecated arith expansion
		end, closed := c.skipToChar(2, ']', false)
		return c.makeWord(WordTypeDB, end, closed)
	case next == '{':
		// variable expansion
		inner := c.clone(c.Pos+2, WordTypeVarBrace)
		_, hitEOF := inner.tokenizeVarBrace()
		return c.makeWord(WordTypeVarBrace, inner.Pos+2, !hitEOF)
	case next == 0 || unicode.IsSpace(next):
		// no expansion
		return nil
	}
	if end := c.parseSimpleVarName(1); end > 1 {
		// simple variable name (this also covers $_)
		return c.makeWord(WordTypeSimpleVar, end, true)
	}
	switch next {
	case '*', '@', '#', '?', '-', '$', '!',
		'0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		// single character variable name, e.g. $@, $?, $1, etc.
		return c.makeWord(WordTypeSimpleVar, 2, true)
	}
	return nil
}

// parseSimpleVarName scans a variable name starting at offset (relative to
// the cursor) and returns the offset just past it. A name starts with an
// ASCII letter or underscore and continues with ASCII letters, digits or
// underscores. If no name starts at offset, offset is returned unchanged.
func (c *parseContext) parseSimpleVarName(offset int) int {
	pos := offset
	for {
		ch := c.at(pos)
		isNameStart := ch == '_' || ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')
		isDigit := '0' <= ch && ch <= '9'
		// digits are only allowed after the first character
		if !isNameStart && !(isDigit && pos > offset) {
			return pos
		}
		pos++
	}
}

// makeSpaceStr returns a string of slen ASCII spaces. A zero or negative
// slen yields the empty string (a negative length would otherwise panic
// when allocating the buffer).
func makeSpaceStr(slen int) string {
	if slen <= 0 {
		return ""
	}
	buf := make([]byte, slen)
	for i := range buf {
		buf[i] = ' '
	}
	return string(buf)
}

// String renders the word on one line for debugging: the type right-aligned
// in 4 columns, the offset in brackets, a "*" marker when the word is
// incomplete (a space otherwise), one space of indent per rune of Prefix,
// and the quoted result of FullRawString (defined elsewhere).
func (w *wordType) String() string {
	marker := "*"
	if w.Complete {
		marker = " "
	}
	indent := makeSpaceStr(len(w.Prefix))
	raw := string(w.FullRawString())
	return fmt.Sprintf("%4s[%3d]%s %s%q", w.Type, w.Offset, marker, indent, raw)
}

// dumpWords prints each word to stdout on its own line, prefixed by
// indentStr, and recursively prints any sub-words indented by two further
// spaces per nesting level.
func dumpWords(words []*wordType, indentStr string) {
	childIndent := indentStr + "  "
	for i := range words {
		w := words[i]
		fmt.Printf("%s%s\n", indentStr, w.String())
		if len(w.Subs) == 0 {
			continue
		}
		dumpWords(w.Subs, childIndent)
	}
}