waveterm/pkg/shparse/tokenize.go

package shparse

import (
	"fmt"
	"unicode"
)

// from bash source
//
// shell_meta_chars "()<>;&|"
//

// tokenizeOutputState accumulates the words produced by one tokenize loop.
type tokenizeOutputState struct {
	// words that have been delimited (finished), in input order
	Rtn         []*WordType
	// word currently being built; nil when no word is in progress
	CurWord     *WordType
	// whitespace runes collected by delimitWithSpace; moved into the Prefix of the next word that is started
	SavedPrefix []rune
}

// copyRunes returns a copy of rarr that does not share its backing array.
// A nil or empty input yields nil.
func copyRunes(rarr []rune) []rune {
	n := len(rarr)
	if n == 0 {
		return nil
	}
	dup := make([]rune, n)
	copy(dup, rarr)
	return dup
}

// appendStandaloneWord closes any word in progress and then emits word as a
// finished word of its own. Pending whitespace (SavedPrefix) becomes the
// word's Prefix.
// does not set CurWord
func (state *tokenizeOutputState) appendStandaloneWord(word *WordType) {
	state.delimitCurWord()
	if pending := state.SavedPrefix; len(pending) != 0 {
		word.Prefix, state.SavedPrefix = pending, nil
	}
	state.Rtn = append(state.Rtn, word)
}

// appendWord adds word to the word in progress. If no word is in progress,
// word becomes the current word. Otherwise the current word is turned into a
// group (if it is not one already) and word is appended to it as a sub-word,
// with its Offset rewritten to be relative to the group.
func (state *tokenizeOutputState) appendWord(word *WordType) {
	if pending := state.SavedPrefix; len(pending) != 0 {
		word.Prefix, state.SavedPrefix = pending, nil
	}
	if state.CurWord == nil {
		state.CurWord = word
		return
	}
	state.ensureGroupWord()
	group := state.CurWord
	word.Offset -= group.Offset
	group.Subs = append(group.Subs, word)
	group.Raw = append(group.Raw, word.Raw...)
}

// ensureGroupWord guarantees that CurWord is a WordTypeGroup word. A non-group
// CurWord is wrapped in a new group that takes over its Offset, QC and Prefix
// and starts with a copy of its Raw; the wrapped word becomes the first
// sub-word with Offset 0 and no Prefix. Panics if CurWord is nil.
func (state *tokenizeOutputState) ensureGroupWord() {
	inner := state.CurWord
	if inner == nil {
		panic("invalid state, cannot make group word when CurWord is nil")
	}
	if inner.Type == WordTypeGroup {
		return
	}
	group := &WordType{Type: WordTypeGroup, Complete: true}
	group.Offset = inner.Offset
	group.QC = inner.QC
	group.Raw = copyRunes(inner.Raw)
	group.Prefix = inner.Prefix
	group.Subs = []*WordType{inner}
	// the prefix and absolute offset now live on the group
	inner.Prefix = nil
	inner.Offset = 0
	state.CurWord = group
}

// ungroupWord flattens a group word into its sub-words. A non-group word is
// returned as a one-element slice. For a group, the group's Prefix is put in
// front of the first sub-word's Prefix and the group's Offset is added to
// every sub-word's Offset. The sub-words are modified in place.
func ungroupWord(groupWord *WordType) []*WordType {
	if groupWord.Type != WordTypeGroup {
		return []*WordType{groupWord}
	}
	subs := groupWord.Subs
	if len(subs) > 0 && len(groupWord.Prefix) > 0 {
		first := subs[0]
		merged := make([]rune, 0, len(groupWord.Prefix)+len(first.Prefix))
		merged = append(merged, groupWord.Prefix...)
		merged = append(merged, first.Prefix...)
		first.Prefix = merged
	}
	for i := range subs {
		subs[i].Offset += groupWord.Offset
	}
	return subs
}

// ensureLitCurWord makes sure there is a literal word ready to receive runes:
// afterwards CurWord is either a WordTypeLit word, or a group whose last
// sub-word is a WordTypeLit word. New literal words are created empty at the
// current position of pc.
func (state *tokenizeOutputState) ensureLitCurWord(pc *parseContext) {
	cur := state.CurWord
	if cur == nil {
		// starting a fresh word: it takes any pending whitespace as its prefix
		lit := pc.makeWord(WordTypeLit, 0, true)
		lit.Prefix = state.SavedPrefix
		state.SavedPrefix = nil
		state.CurWord = lit
		return
	}
	if cur.Type == WordTypeLit {
		return
	}
	state.ensureGroupWord()
	group := state.CurWord
	if tail := group.Subs[len(group.Subs)-1]; tail.Type == WordTypeLit {
		return
	}
	if len(state.SavedPrefix) > 0 {
		panic("invalid state, there can be no saved prefix")
	}
	lit := pc.makeWord(WordTypeLit, 0, true)
	lit.Offset -= group.Offset // sub-word offsets are relative to the group
	group.Subs = append(group.Subs, lit)
}

// delimitCurWord finishes the word in progress (if any) by moving it to Rtn.
func (state *tokenizeOutputState) delimitCurWord() {
	if state.CurWord == nil {
		return
	}
	state.Rtn = append(state.Rtn, state.CurWord)
	state.CurWord = nil
}

// delimitWithSpace finishes the word in progress and records spaceCh as
// pending whitespace, to be used as (part of) the next word's Prefix.
func (state *tokenizeOutputState) delimitWithSpace(spaceCh rune) {
	state.delimitCurWord()
	pending := append(state.SavedPrefix, spaceCh)
	state.SavedPrefix = pending
}

// appendLiteral appends the literal rune ch to the word in progress, creating
// a literal word (or literal sub-word of a group) first if necessary. For a
// group, ch is appended both to the trailing literal sub-word and to the
// group's own Raw.
//
// Panics if ensureLitCurWord left the state in an unexpected shape; the panic
// message names the type of the word that should have been a literal.
func (state *tokenizeOutputState) appendLiteral(pc *parseContext, ch rune) {
	state.ensureLitCurWord(pc)
	cur := state.CurWord
	switch cur.Type {
	case WordTypeLit:
		cur.Raw = append(cur.Raw, ch)
	case WordTypeGroup:
		lastWord := cur.Subs[len(cur.Subs)-1]
		if lastWord.Type != WordTypeLit {
			// report the offending sub-word's type (cur.Type is always the group type here)
			panic(fmt.Sprintf("invalid curword type (group) %q", lastWord.Type))
		}
		lastWord.Raw = append(lastWord.Raw, ch)
		cur.Raw = append(cur.Raw, ch)
	default:
		panic(fmt.Sprintf("invalid curword type %q", cur.Type))
	}
}

// finish flushes the word in progress into Rtn. If whitespace is still
// pending afterwards, an empty literal word is emitted to carry it as its
// Prefix.
func (state *tokenizeOutputState) finish(pc *parseContext) {
	state.delimitCurWord()
	if len(state.SavedPrefix) == 0 {
		return
	}
	state.ensureLitCurWord(pc)
	state.delimitCurWord()
}

// tokenizeVarBrace tokenizes the body of a ${...} expansion (parseExpansion
// calls it on a context that starts just after the "${"). It consumes runes
// up to and including the closing '}' or to end of input.
//
// returns (words, eofexit); eofexit is true when the input ended before a
// closing '}' was found.
//
// Quoted strings and $-expansions become sub-words; everything else,
// including backslash-escaped pairs, is literal text. Whitespace does not
// split words here, so words holds at most one (possibly group) word.
func (c *parseContext) tokenizeVarBrace() ([]*WordType, bool) {
	state := &tokenizeOutputState{}
	eofExit := false
	for {
		ch := c.cur()
		if ch == 0 {
			eofExit = true
			break
		}
		if ch == '}' {
			c.Pos++
			break
		}
		var quoteWord *WordType
		switch ch {
		case '\'':
			quoteWord = c.parseStrSQ()
		case '"':
			quoteWord = c.parseStrDQ()
		case '$':
			// "$}" is a literal '$' followed by the closing brace
			if c.at(1) != '}' {
				quoteWord = c.parseStrANSI()
				if quoteWord == nil {
					quoteWord = c.parseStrDDQ()
				}
				if quoteWord == nil {
					quoteWord = c.parseExpansion()
				}
			}
		}
		if quoteWord != nil {
			state.appendWord(quoteWord)
			continue
		}
		if ch == '\\' && c.at(1) != 0 {
			// keep the backslash and the escaped rune together as literal text
			state.appendLiteral(c, ch)
			state.appendLiteral(c, c.at(1))
			c.Pos += 2
			continue
		}
		state.appendLiteral(c, ch)
		c.Pos++
	}
	// flush the word in progress; without this nothing ever reaches state.Rtn
	// and the function would always return nil words
	state.finish(c)
	return state.Rtn, eofExit
}

// tokenizeDQ tokenizes the body of a double-quoted string (the callers hand
// it a context that starts just after the opening quote). It consumes runes
// up to and including the closing '"' or to end of input.
//
// returns (words, eofexit); eofexit is true when the input ended before a
// closing '"' was found. If the body is a single group word, its sub-words
// are returned instead of the group.
//
// Inside double quotes only $-expansions are recognized. `$'` and `$"` are
// NOT quoting forms here (bash treats them as a literal '$' followed by the
// quote character), so in `"cost$"` the second '"' closes the string rather
// than opening a nested $"..." string.
func (c *parseContext) tokenizeDQ() ([]*WordType, bool) {
	state := &tokenizeOutputState{}
	eofExit := false
	for {
		ch := c.cur()
		if ch == 0 {
			eofExit = true
			break
		}
		if ch == '"' {
			c.Pos++
			break
		}
		if ch == '$' && c.at(1) != 0 {
			if expWord := c.parseExpansion(); expWord != nil {
				state.appendWord(expWord)
				continue
			}
			// not an expansion: fall through and treat '$' as literal text
		}
		if ch == '\\' && c.at(1) != 0 {
			// keep the backslash and the escaped rune together as literal text
			state.appendLiteral(c, ch)
			state.appendLiteral(c, c.at(1))
			c.Pos += 2
			continue
		}
		state.appendLiteral(c, ch)
		c.Pos++
	}
	state.finish(c)
	if len(state.Rtn) == 0 {
		return nil, eofExit
	}
	if len(state.Rtn) == 1 && state.Rtn[0].Type == WordTypeGroup {
		return ungroupWord(state.Rtn[0]), eofExit
	}
	return state.Rtn, eofExit
}

// tokenizeRaw tokenizes unquoted shell text starting at the current position.
//
// returns (words, eofexit)
//
// When the innermost quote context is WordTypeDP the loop stops at (and
// consumes) a ')' seen while no '(' operator from this loop is still open;
// when it is WordTypeBQ it stops at (and consumes) the next '`'. In both
// cases eofexit is false. If the input runs out first, eofexit is true.
//
// backticks (WordTypeBQ) handle backslash in a special way, but that seems to mainly affect execution (not completion)
//     de_backslash => removes initial backslash in \`, \\, and \$ before execution
func (c *parseContext) tokenizeRaw() ([]*WordType, bool) {
	state := &tokenizeOutputState{}
	quoteCtx := c.QC.cur()
	isExpSubShell := quoteCtx == WordTypeDP
	isInBQ := quoteCtx == WordTypeBQ
	parenLevel := 0
	eofExit := false
	for {
		ch := c.cur()
		if ch == 0 {
			eofExit = true
			break
		}
		closesSubShell := isExpSubShell && ch == ')' && parenLevel == 0
		closesBQ := isInBQ && ch == '`'
		if closesSubShell || closesBQ {
			c.Pos++
			break
		}

		// operators always stand alone as their own word
		if isOp, opLen := c.parseOp(0); isOp {
			switch string(c.Input[c.Pos : c.Pos+opLen]) {
			case "(":
				// a complete "(( ... ))" is a single arithmetic word
				if arith := c.parseArith(true); arith != nil {
					state.appendStandaloneWord(arith)
					continue
				}
				parenLevel++
			case ")":
				parenLevel--
			}
			state.appendStandaloneWord(c.makeWord(WordTypeOp, opLen, true))
			continue
		}

		// quoted strings and $-expansions are appended to the word in progress
		var quoted *WordType
		switch ch {
		case '\'':
			quoted = c.parseStrSQ()
		case '"':
			quoted = c.parseStrDQ()
		case '`':
			quoted = c.parseStrBQ()
		case '$':
			// in a $( ... ) subshell, "$)" is a literal '$' followed by the closing paren
			if !(isExpSubShell && c.at(1) == ')') {
				quoted = c.parseStrANSI()
				if quoted == nil {
					quoted = c.parseStrDDQ()
				}
				if quoted == nil {
					quoted = c.parseExpansion()
				}
			}
		}

		switch {
		case quoted != nil:
			state.appendWord(quoted)
		case ch == '\\' && c.at(1) != 0:
			// keep the backslash and the escaped rune together as literal text
			state.appendLiteral(c, ch)
			state.appendLiteral(c, c.at(1))
			c.Pos += 2
		case ch == '\n':
			// a newline is emitted as a standalone operator word
			state.appendStandaloneWord(c.makeWord(WordTypeOp, 1, true))
		case unicode.IsSpace(ch):
			state.delimitWithSpace(ch)
			c.Pos++
		default:
			state.appendLiteral(c, ch)
			c.Pos++
		}
	}
	state.finish(c)
	return state.Rtn, eofExit
}

// parseContext is a read cursor over the runes being tokenized.
type parseContext struct {
	// runes being parsed; clone() gives a nested context a sub-slice of its parent's Input
	Input []rune
	// index into Input of the current rune; reads outside Input yield 0 (see at)
	Pos   int
	// quote context; clone() can push a new quote type onto it and makeWord copies it onto every word
	QC    QuoteContext
}

// clone returns a new context whose Input is c.Input[pos:] and whose Pos is 0.
// The quote context is carried over; when newQuote is non-empty it is pushed
// onto the new context's quote context.
func (c *parseContext) clone(pos int, newQuote string) *parseContext {
	child := &parseContext{Input: c.Input[pos:], QC: c.QC}
	if newQuote == "" {
		return child
	}
	child.QC = child.QC.push(newQuote)
	return child
}

// at returns the rune offset positions away from Pos, or 0 when that index
// lies outside Input.
func (c *parseContext) at(offset int) rune {
	if idx := c.Pos + offset; idx >= 0 && idx < len(c.Input) {
		return c.Input[idx]
	}
	return 0
}

// eof reports whether Pos is at or past the end of Input.
func (c *parseContext) eof() bool {
	remaining := len(c.Input) - c.Pos
	return remaining <= 0
}

// cur returns the rune at Pos, or 0 when Pos is outside Input.
func (c *parseContext) cur() rune {
	ch := c.at(0)
	return ch
}

// match reports whether the rune at Pos equals ch.
func (c *parseContext) match(ch rune) bool {
	return ch == c.at(0)
}

// match2 reports whether the two runes starting at Pos are ch followed by ch2.
func (c *parseContext) match2(ch rune, ch2 rune) bool {
	if c.at(0) != ch {
		return false
	}
	return c.at(1) == ch2
}

// match3 reports whether the three runes starting at Pos are ch, ch2, ch3.
func (c *parseContext) match3(ch rune, ch2 rune, ch3 rune) bool {
	want := [3]rune{ch, ch2, ch3}
	for i, r := range want {
		if c.at(i) != r {
			return false
		}
	}
	return true
}

// makeWord builds a word of type t from the next length runes of Input and
// advances Pos past them. The word records the starting Pos as its Offset, the
// context's QC, a copy of the consumed runes as Raw, and the complete flag.
// The caller must ensure that length runes are available.
func (c *parseContext) makeWord(t string, length int, complete bool) *WordType {
	start := c.Pos
	end := start + length
	word := &WordType{
		Type:     t,
		Offset:   start,
		QC:       c.QC,
		Raw:      copyRunes(c.Input[start:end]),
		Complete: complete,
	}
	c.Pos = end
	return word
}

// parseOp checks whether a shell operator starts at the given offset from Pos.
// It does not advance Pos.
//
// returns (found, newOffset); newOffset is the offset just past the operator
// (offset + operator length), or 0 when no operator was found.
//
// shell_meta_chars "()<>;&|"
// recognized multi-rune operators: <<- <<< << >> && || ;; <& >& <> >|
// possible to maybe add ;;& &>> &> |& ;&
func (c *parseContext) parseOp(offset int) (bool, int) {
	ch := c.at(offset)
	switch ch {
	case '(', ')', '<', '>', ';', '&', '|':
	default:
		return false, 0
	}
	ch2 := c.at(offset + 1)
	switch {
	case ch == '<' && ch2 == '<':
		if ch3 := c.at(offset + 2); ch3 == '-' || ch3 == '<' {
			return true, offset + 3 // "<<-" or "<<<"
		}
		return true, offset + 2 // "<<"
	case ch == ch2 && (ch == '>' || ch == '&' || ch == '|' || ch == ';'):
		// ">>", "&&", "||", ";;"
		// we don't return '((' here (requires special processing)
		return true, offset + 2
	case ch == '<' && (ch2 == '&' || ch2 == '>'):
		return true, offset + 2 // "<&" or "<>"
	case ch == '>' && (ch2 == '&' || ch2 == '|'):
		return true, offset + 2 // ">&" or ">|"
	}
	return true, offset + 1
}

// skipToChar scans forward from offset (relative to Pos) looking for endCh.
// With allowEsc set, a backslash skips the rune that follows it.
//
// returns (new-offset, complete); when endCh is found, new-offset is the
// offset just past it and complete is true. When the input runs out first,
// new-offset is the offset of the end of input and complete is false.
func (c *parseContext) skipToChar(offset int, endCh rune, allowEsc bool) (int, bool) {
	for ; ; offset++ {
		ch := c.at(offset)
		switch {
		case ch == 0:
			return offset, false
		case allowEsc && ch == '\\':
			if c.at(offset+1) == 0 {
				// dangling backslash at end of input
				return offset + 1, false
			}
			offset++ // together with the loop step this skips the escaped rune too
		case ch == endCh:
			return offset + 1, true
		}
	}
}

// skipToChar2 scans forward from offset (relative to Pos) looking for the
// two-rune sequence endCh endCh2. With allowEsc set, a backslash skips the
// rune that follows it.
//
// returns (new-offset, complete); when the sequence is found, new-offset is
// the offset just past it and complete is true. When the input runs out
// first, new-offset is the offset of the end of input and complete is false.
func (c *parseContext) skipToChar2(offset int, endCh rune, endCh2 rune, allowEsc bool) (int, bool) {
	for {
		first, second := c.at(offset), c.at(offset+1)
		switch {
		case first == 0:
			return offset, false
		case second == 0:
			// only one rune left, so the two-rune terminator cannot fit
			return offset + 1, false
		case allowEsc && first == '\\':
			offset += 2
		case first == endCh && second == endCh2:
			return offset + 2, true
		default:
			offset++
		}
	}
}

// parseStrSQ parses a single-quoted string starting at Pos. It returns nil
// (without consuming anything) when the current rune is not a single quote.
// Backslash is not an escape here; the word is marked incomplete when the
// closing quote is missing.
func (c *parseContext) parseStrSQ() *WordType {
	if c.cur() != '\'' {
		return nil
	}
	end, closed := c.skipToChar(1, '\'', false)
	return c.makeWord(WordTypeSQ, end, closed)
}

// parseStrDQ parses a double-quoted string starting at Pos. It returns nil
// (without consuming anything) when the current rune is not a double quote.
// The body is tokenized in a nested WordTypeDQ context and stored in Subs;
// the word is marked incomplete when the closing quote is missing.
func (c *parseContext) parseStrDQ() *WordType {
	if !c.match('"') {
		return nil
	}
	const openLen = 1 // the opening quote
	inner := c.clone(c.Pos+openLen, WordTypeDQ)
	subs, hitEOF := inner.tokenizeDQ()
	word := c.makeWord(WordTypeDQ, openLen+inner.Pos, !hitEOF)
	word.Subs = subs
	return word
}

// parseStrDDQ parses a $"..." string starting at Pos. It returns nil (without
// consuming anything) when the input does not start with `$"`. The body is
// tokenized like a double-quoted string and stored in Subs; the word is
// marked incomplete when the closing quote is missing.
func (c *parseContext) parseStrDDQ() *WordType {
	if !c.match2('$', '"') {
		return nil
	}
	const openLen = 2 // the `$"` opener
	// use WordTypeDQ (not DDQ) for the nested quote context
	inner := c.clone(c.Pos+openLen, WordTypeDQ)
	subs, hitEOF := inner.tokenizeDQ()
	word := c.makeWord(WordTypeDDQ, openLen+inner.Pos, !hitEOF)
	word.Subs = subs
	return word
}

// parseStrBQ parses a backtick command substitution starting at Pos. It
// returns nil (without consuming anything) when the current rune is not a
// backtick. The body is tokenized in a nested WordTypeBQ context and stored
// in Subs; the word is marked incomplete when the closing backtick is missing.
func (c *parseContext) parseStrBQ() *WordType {
	if !c.match('`') {
		return nil
	}
	const openLen = 1 // the opening backtick
	inner := c.clone(c.Pos+openLen, WordTypeBQ)
	subs, hitEOF := inner.tokenizeRaw()
	word := c.makeWord(WordTypeBQ, openLen+inner.Pos, !hitEOF)
	word.Subs = subs
	return word
}

// parseStrANSI parses a $'...' string starting at Pos. It returns nil
// (without consuming anything) when the input does not start with `$'`.
// Backslash escapes the following rune; the word is marked incomplete when
// the closing quote is missing.
func (c *parseContext) parseStrANSI() *WordType {
	if !c.match2('$', '\'') {
		return nil
	}
	end, closed := c.skipToChar(2, '\'', true)
	return c.makeWord(WordTypeDSQ, end, closed)
}

// parseArith parses a "(( ... ))" word starting at Pos. It returns nil
// (without consuming anything) when the input does not start with "((", or
// when mustComplete is set and no closing "))" is found.
func (c *parseContext) parseArith(mustComplete bool) *WordType {
	if !c.match2('(', '(') {
		return nil
	}
	end, closed := c.skipToChar2(2, ')', ')', false)
	if !closed && mustComplete {
		return nil
	}
	return c.makeWord(WordTypePP, end, closed)
}

// parseExpansion parses a $-expansion starting at Pos and returns the word,
// or nil (without consuming anything) when the '$' does not start one.
// Forms are tried in this order: $(( )), $( ), $[ ], ${ }, $name, then the
// single-character variables $* $@ $# $? $- $$ $! and $0..$9.
func (c *parseContext) parseExpansion() *WordType {
	if !c.match('$') {
		return nil
	}
	next := c.at(1)
	switch {
	case next == '(' && c.at(2) == '(':
		// arithmetic expansion
		end, closed := c.skipToChar2(3, ')', ')', false)
		return c.makeWord(WordTypeDPP, end, closed)
	case next == '(':
		// subshell
		inner := c.clone(c.Pos+2, WordTypeDP)
		subs, hitEOF := inner.tokenizeRaw()
		word := c.makeWord(WordTypeDP, inner.Pos+2, !hitEOF)
		word.Subs = subs
		return word
	case next == '[':
		// deprecated arith expansion
		end, closed := c.skipToChar(2, ']', false)
		return c.makeWord(WordTypeDB, end, closed)
	case next == '{':
		// variable expansion; the nested tokenizer is only used to find the end
		inner := c.clone(c.Pos+2, WordTypeVarBrace)
		_, hitEOF := inner.tokenizeVarBrace()
		return c.makeWord(WordTypeVarBrace, inner.Pos+2, !hitEOF)
	case next == 0 || unicode.IsSpace(next):
		// no expansion
		return nil
	}
	if end := c.parseSimpleVarName(1); end > 1 {
		// simple variable name
		return c.makeWord(WordTypeSimpleVar, end, true)
	}
	isSingleCharVar := next >= '0' && next <= '9'
	switch next {
	case '*', '@', '#', '?', '-', '$', '!':
		isSingleCharVar = true
	}
	if isSingleCharVar {
		// single character variable name, e.g. $@, $?, $1, etc.
		return c.makeWord(WordTypeSimpleVar, 2, true)
	}
	return nil
}

// parseSimpleVarName scans a variable name ([A-Za-z_][A-Za-z0-9_]*) starting
// at offset (relative to Pos). It does not advance Pos.
//
// returns newOffset, the offset just past the name; it equals offset when no
// name starts there.
func (c *parseContext) parseSimpleVarName(offset int) int {
	for start := offset; ; offset++ {
		ch := c.at(offset)
		isNameStart := ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
		isDigit := ch >= '0' && ch <= '9'
		// digits are allowed anywhere except the first position
		if !isNameStart && !(isDigit && offset > start) {
			return offset
		}
	}
}

// isSimpleVarName reports whether rstr is a non-empty variable name made of
// ASCII letters, digits and underscores that does not start with a digit.
func isSimpleVarName(rstr []rune) bool {
	if len(rstr) == 0 {
		return false
	}
	for idx, ch := range rstr {
		switch {
		case ch == '_', ch >= 'a' && ch <= 'z', ch >= 'A' && ch <= 'Z':
			// always allowed
		case idx > 0 && ch >= '0' && ch <= '9':
			// digits are allowed after the first rune
		default:
			return false
		}
	}
	return true
}

// Tokenize splits the shell command line cmd into words using tokenizeRaw
// with an empty quote context and returns them in input order.
func Tokenize(cmd string) []*WordType {
	ctx := parseContext{Input: []rune(cmd)}
	words, _ := ctx.tokenizeRaw() // the eof flag carries no information at the top level
	return words
}