655 lines
16 KiB
Go
655 lines
16 KiB
Go
package syntax
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"math"
|
|
"strconv"
|
|
)
|
|
|
|
// RegexTree is the parsed form of a regular expression: the root of the node
// tree plus capture-group bookkeeping and the options it was parsed with.
//
// NOTE(review): apart from root (used by Dump), none of these fields are read
// or written in this file; the notes on the capture fields are inferred from
// their names and types and should be confirmed against the parser.
type RegexTree struct {
	root       *regexNode     // top of the node tree; Dump starts here
	caps       map[int]int    // presumably capture number -> slot — TODO confirm
	capnumlist []int          // presumably the capture numbers in use — TODO confirm
	captop     int            // presumably one past the highest capture slot — TODO confirm
	Capnames   map[string]int // presumably capture name -> capture number — TODO confirm
	Caplist    []string       // presumably capture names in order — TODO confirm
	options    RegexOptions   // options associated with the tree
}
|
|
|
|
// regexNode is one node of the tree that a regular expression is parsed into.
//
// Implementation notes:
//
// Since the node tree is a temporary data structure only used
// during compilation of the regexp to integer codes, it's
// designed for clarity and convenience rather than
// space efficiency.
//
// RegexNodes are built into a tree, linked by the n.children list.
// addChild also points each child's n.next at its parent, and dump
// follows n.next to climb back up the tree.
//
// RegexNodes come in as many types as there are constructs in
// a regular expression, for example, "concatenate", "alternate",
// "one", "rept", "group". There are also node types for basic
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
//
// Because perl 5 allows "lookback" groups that scan backwards,
// each node also gets a "direction". In this file that is the
// RightToLeft bit of n.options (see reverseLeft and
// reduceConcatenation).
//
// During parsing, top-level nodes are also stacked onto a parse
// stack (a stack of trees).
// NOTE(review): these notes originally said n.next serves that stack
// and also described n.parent, n.ichild, n.role, n.backward,
// n.operands and n.data. This struct declares none of those six, and
// the code in this file only ever sets n.next to the parent node;
// confirm how the parser uses n.next.
//
// Finally, some of the different kinds of nodes have data.
// Two integers (for the looping constructs) are stored in
// n.m and n.n, a string in n.str, a set in n.set and a single
// character in n.ch.
type regexNode struct {
	t        nodeType     // node kind, one of the nt* constants
	children []*regexNode // child nodes, in order
	str      []rune       // literal text of an ntMulti node
	set      *CharSet     // character class of ntSet / ntSetloop / ntSetlazy nodes
	ch       rune         // character of ntOne / ntNotone nodes and their loop and lazy forms
	m        int          // loop minimum; description prints it as "index" for ntCapture, ntRef, ntTestref
	n        int          // loop maximum (math.MaxInt32 prints as "inf"); "unindex" for ntCapture
	options  RegexOptions // options in effect for this node
	next     *regexNode   // set to the parent by addChild; dump uses it to walk upwards
}
|
|
|
|
// nodeType identifies the kind of a regexNode.
type nodeType int32

// Node kinds.
//
// Only ntOnerep is declared with the nodeType type. Every other name has its
// own explicit value and no type, so it is an untyped integer constant that
// takes the type of whatever it is compared with or assigned to.
//
// makeRep and reduceSet rely on the numeric layout: the One/Notone/Set
// triples (9-11), their loop forms (3-5) and their lazy forms (6-8) are in
// the same order, so a fixed offset converts between them.
const (
	// The following are leaves, and correspond to primitive operations

	ntOnerep      nodeType = 0  // lef,back char,min,max    a {n}
	ntNotonerep            = 1  // lef,back char,min,max    .{n}
	ntSetrep               = 2  // lef,back set,min,max     [\d]{n}
	ntOneloop              = 3  // lef,back char,min,max    a {,n}
	ntNotoneloop           = 4  // lef,back char,min,max    .{,n}
	ntSetloop              = 5  // lef,back set,min,max     [\d]{,n}
	ntOnelazy              = 6  // lef,back char,min,max    a {,n}?
	ntNotonelazy           = 7  // lef,back char,min,max    .{,n}?
	ntSetlazy              = 8  // lef,back set,min,max     [\d]{,n}?
	ntOne                  = 9  // lef      char            a
	ntNotone               = 10 // lef      char            [^a]
	ntSet                  = 11 // lef      set             [a-z\s]  \w \s \d
	ntMulti                = 12 // lef      string          abcd
	ntRef                  = 13 // lef      group           \#
	ntBol                  = 14 //                          ^
	ntEol                  = 15 //                          $
	ntBoundary             = 16 //                          \b
	ntNonboundary          = 17 //                          \B
	ntBeginning            = 18 //                          \A
	ntStart                = 19 //                          \G
	ntEndZ                 = 20 //                          \Z
	ntEnd                  = 21 //                          \z (was written \Z, same as ntEndZ; presumably a typo — confirm)

	// Interior nodes do not correspond to primitive operations, but
	// control structures compositing other operations

	// Concat and alternate take n children, and can run forward or backwards

	ntNothing     = 22 //          []
	ntEmpty       = 23 //          ()
	ntAlternate   = 24 //          a|b
	ntConcatenate = 25 //          ab
	ntLoop        = 26 // m,x      * + ? {,}
	ntLazyloop    = 27 // m,x      *? +? ?? {,}?
	ntCapture     = 28 // n        ()
	ntGroup       = 29 //          (?:)
	ntRequire     = 30 //          (?=) (?<=)
	ntPrevent     = 31 //          (?!) (?<!)
	ntGreedy      = 32 //          (?>) (?<)
	ntTestref     = 33 //          (?(n) | )
	ntTestgroup   = 34 //          (?(...) | )

	// Values 35-40 are not assigned in this file; typeStr labels them "Unknown".

	ntECMABoundary    = 41 //                          \b
	ntNonECMABoundary = 42 //                          \B
)
|
|
|
|
func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
}
|
|
}
|
|
|
|
func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
ch: ch,
|
|
}
|
|
}
|
|
|
|
func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
str: str,
|
|
}
|
|
}
|
|
|
|
func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
set: set,
|
|
}
|
|
}
|
|
|
|
func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
m: m,
|
|
}
|
|
}
|
|
func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
|
|
return ®exNode{
|
|
t: t,
|
|
options: opt,
|
|
m: m,
|
|
n: n,
|
|
}
|
|
}
|
|
|
|
func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
|
|
for i := 0; i < len(n.str); i++ {
|
|
buf.WriteRune(n.str[i])
|
|
}
|
|
}
|
|
|
|
func (n *regexNode) addChild(child *regexNode) {
|
|
reduced := child.reduce()
|
|
n.children = append(n.children, reduced)
|
|
reduced.next = n
|
|
}
|
|
|
|
func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
|
|
newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
|
|
n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
|
|
}
|
|
|
|
// removes children including the start but not the end index
|
|
func (n *regexNode) removeChildren(startIndex, endIndex int) {
|
|
n.children = append(n.children[:startIndex], n.children[endIndex:]...)
|
|
}
|
|
|
|
// Pass type as OneLazy or OneLoop
|
|
func (n *regexNode) makeRep(t nodeType, min, max int) {
|
|
n.t += (t - ntOne)
|
|
n.m = min
|
|
n.n = max
|
|
}
|
|
|
|
func (n *regexNode) reduce() *regexNode {
|
|
switch n.t {
|
|
case ntAlternate:
|
|
return n.reduceAlternation()
|
|
|
|
case ntConcatenate:
|
|
return n.reduceConcatenation()
|
|
|
|
case ntLoop, ntLazyloop:
|
|
return n.reduceRep()
|
|
|
|
case ntGroup:
|
|
return n.reduceGroup()
|
|
|
|
case ntSet, ntSetloop:
|
|
return n.reduceSet()
|
|
|
|
default:
|
|
return n
|
|
}
|
|
}
|
|
|
|
// Basic optimization. Single-letter alternations can be replaced
|
|
// by faster set specifications, and nested alternations with no
|
|
// intervening operators can be flattened:
|
|
//
|
|
// a|b|c|def|g|h -> [a-c]|def|[gh]
|
|
// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
|
|
func (n *regexNode) reduceAlternation() *regexNode {
|
|
if len(n.children) == 0 {
|
|
return newRegexNode(ntNothing, n.options)
|
|
}
|
|
|
|
wasLastSet := false
|
|
lastNodeCannotMerge := false
|
|
var optionsLast RegexOptions
|
|
var i, j int
|
|
|
|
for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
|
|
at := n.children[i]
|
|
|
|
if j < i {
|
|
n.children[j] = at
|
|
}
|
|
|
|
for {
|
|
if at.t == ntAlternate {
|
|
for k := 0; k < len(at.children); k++ {
|
|
at.children[k].next = n
|
|
}
|
|
n.insertChildren(i+1, at.children)
|
|
|
|
j--
|
|
} else if at.t == ntSet || at.t == ntOne {
|
|
// Cannot merge sets if L or I options differ, or if either are negated.
|
|
optionsAt := at.options & (RightToLeft | IgnoreCase)
|
|
|
|
if at.t == ntSet {
|
|
if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
|
|
wasLastSet = true
|
|
lastNodeCannotMerge = !at.set.IsMergeable()
|
|
optionsLast = optionsAt
|
|
break
|
|
}
|
|
} else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
|
|
wasLastSet = true
|
|
lastNodeCannotMerge = false
|
|
optionsLast = optionsAt
|
|
break
|
|
}
|
|
|
|
// The last node was a Set or a One, we're a Set or One and our options are the same.
|
|
// Merge the two nodes.
|
|
j--
|
|
prev := n.children[j]
|
|
|
|
var prevCharClass *CharSet
|
|
if prev.t == ntOne {
|
|
prevCharClass = &CharSet{}
|
|
prevCharClass.addChar(prev.ch)
|
|
} else {
|
|
prevCharClass = prev.set
|
|
}
|
|
|
|
if at.t == ntOne {
|
|
prevCharClass.addChar(at.ch)
|
|
} else {
|
|
prevCharClass.addSet(*at.set)
|
|
}
|
|
|
|
prev.t = ntSet
|
|
prev.set = prevCharClass
|
|
} else if at.t == ntNothing {
|
|
j--
|
|
} else {
|
|
wasLastSet = false
|
|
lastNodeCannotMerge = false
|
|
}
|
|
break
|
|
}
|
|
}
|
|
|
|
if j < i {
|
|
n.removeChildren(j, i)
|
|
}
|
|
|
|
return n.stripEnation(ntNothing)
|
|
}
|
|
|
|
// Basic optimization. Adjacent strings can be concatenated.
|
|
//
|
|
// (?:abc)(?:def) -> abcdef
|
|
func (n *regexNode) reduceConcatenation() *regexNode {
|
|
// Eliminate empties and concat adjacent strings/chars
|
|
|
|
var optionsLast RegexOptions
|
|
var optionsAt RegexOptions
|
|
var i, j int
|
|
|
|
if len(n.children) == 0 {
|
|
return newRegexNode(ntEmpty, n.options)
|
|
}
|
|
|
|
wasLastString := false
|
|
|
|
for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
|
|
var at, prev *regexNode
|
|
|
|
at = n.children[i]
|
|
|
|
if j < i {
|
|
n.children[j] = at
|
|
}
|
|
|
|
if at.t == ntConcatenate &&
|
|
((at.options & RightToLeft) == (n.options & RightToLeft)) {
|
|
for k := 0; k < len(at.children); k++ {
|
|
at.children[k].next = n
|
|
}
|
|
|
|
//insert at.children at i+1 index in n.children
|
|
n.insertChildren(i+1, at.children)
|
|
|
|
j--
|
|
} else if at.t == ntMulti || at.t == ntOne {
|
|
// Cannot merge strings if L or I options differ
|
|
optionsAt = at.options & (RightToLeft | IgnoreCase)
|
|
|
|
if !wasLastString || optionsLast != optionsAt {
|
|
wasLastString = true
|
|
optionsLast = optionsAt
|
|
continue
|
|
}
|
|
|
|
j--
|
|
prev = n.children[j]
|
|
|
|
if prev.t == ntOne {
|
|
prev.t = ntMulti
|
|
prev.str = []rune{prev.ch}
|
|
}
|
|
|
|
if (optionsAt & RightToLeft) == 0 {
|
|
if at.t == ntOne {
|
|
prev.str = append(prev.str, at.ch)
|
|
} else {
|
|
prev.str = append(prev.str, at.str...)
|
|
}
|
|
} else {
|
|
if at.t == ntOne {
|
|
// insert at the front by expanding our slice, copying the data over, and then setting the value
|
|
prev.str = append(prev.str, 0)
|
|
copy(prev.str[1:], prev.str)
|
|
prev.str[0] = at.ch
|
|
} else {
|
|
//insert at the front...this one we'll make a new slice and copy both into it
|
|
merge := make([]rune, len(prev.str)+len(at.str))
|
|
copy(merge, at.str)
|
|
copy(merge[len(at.str):], prev.str)
|
|
prev.str = merge
|
|
}
|
|
}
|
|
} else if at.t == ntEmpty {
|
|
j--
|
|
} else {
|
|
wasLastString = false
|
|
}
|
|
}
|
|
|
|
if j < i {
|
|
// remove indices j through i from the children
|
|
n.removeChildren(j, i)
|
|
}
|
|
|
|
return n.stripEnation(ntEmpty)
|
|
}
|
|
|
|
// Nested repeaters just get multiplied with each other if they're not
|
|
// too lumpy
|
|
func (n *regexNode) reduceRep() *regexNode {
|
|
|
|
u := n
|
|
t := n.t
|
|
min := n.m
|
|
max := n.n
|
|
|
|
for {
|
|
if len(u.children) == 0 {
|
|
break
|
|
}
|
|
|
|
child := u.children[0]
|
|
|
|
// multiply reps of the same type only
|
|
if child.t != t {
|
|
childType := child.t
|
|
|
|
if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
|
|
childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
|
|
break
|
|
}
|
|
}
|
|
|
|
// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
|
|
// [but things like (a {2,})+ are not too lumpy...]
|
|
if u.m == 0 && child.m > 1 || child.n < child.m*2 {
|
|
break
|
|
}
|
|
|
|
u = child
|
|
if u.m > 0 {
|
|
if (math.MaxInt32-1)/u.m < min {
|
|
u.m = math.MaxInt32
|
|
} else {
|
|
u.m = u.m * min
|
|
}
|
|
}
|
|
if u.n > 0 {
|
|
if (math.MaxInt32-1)/u.n < max {
|
|
u.n = math.MaxInt32
|
|
} else {
|
|
u.n = u.n * max
|
|
}
|
|
}
|
|
}
|
|
|
|
if math.MaxInt32 == min {
|
|
return newRegexNode(ntNothing, n.options)
|
|
}
|
|
return u
|
|
|
|
}
|
|
|
|
// Simple optimization. If a concatenation or alternation has only
|
|
// one child strip out the intermediate node. If it has zero children,
|
|
// turn it into an empty.
|
|
func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
|
|
switch len(n.children) {
|
|
case 0:
|
|
return newRegexNode(emptyType, n.options)
|
|
case 1:
|
|
return n.children[0]
|
|
default:
|
|
return n
|
|
}
|
|
}
|
|
|
|
func (n *regexNode) reduceGroup() *regexNode {
|
|
u := n
|
|
|
|
for u.t == ntGroup {
|
|
u = u.children[0]
|
|
}
|
|
|
|
return u
|
|
}
|
|
|
|
// Simple optimization. If a set is a singleton, an inverse singleton,
|
|
// or empty, it's transformed accordingly.
|
|
func (n *regexNode) reduceSet() *regexNode {
|
|
// Extract empty-set, one and not-one case as special
|
|
|
|
if n.set == nil {
|
|
n.t = ntNothing
|
|
} else if n.set.IsSingleton() {
|
|
n.ch = n.set.SingletonChar()
|
|
n.set = nil
|
|
n.t += (ntOne - ntSet)
|
|
} else if n.set.IsSingletonInverse() {
|
|
n.ch = n.set.SingletonChar()
|
|
n.set = nil
|
|
n.t += (ntNotone - ntSet)
|
|
}
|
|
|
|
return n
|
|
}
|
|
|
|
func (n *regexNode) reverseLeft() *regexNode {
|
|
if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
|
|
//reverse children order
|
|
for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
|
|
n.children[left], n.children[right] = n.children[right], n.children[left]
|
|
}
|
|
}
|
|
|
|
return n
|
|
}
|
|
|
|
func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
|
|
if min == 0 && max == 0 {
|
|
return newRegexNode(ntEmpty, n.options)
|
|
}
|
|
|
|
if min == 1 && max == 1 {
|
|
return n
|
|
}
|
|
|
|
switch n.t {
|
|
case ntOne, ntNotone, ntSet:
|
|
if lazy {
|
|
n.makeRep(Onelazy, min, max)
|
|
} else {
|
|
n.makeRep(Oneloop, min, max)
|
|
}
|
|
return n
|
|
|
|
default:
|
|
var t nodeType
|
|
if lazy {
|
|
t = ntLazyloop
|
|
} else {
|
|
t = ntLoop
|
|
}
|
|
result := newRegexNodeMN(t, n.options, min, max)
|
|
result.addChild(n)
|
|
return result
|
|
}
|
|
}
|
|
|
|
// debug functions
|
|
|
|
// typeStr maps a node kind's numeric value to the label used by description.
// Values with no node kind assigned (35-40) are labelled "Unknown".
var typeStr = []string{
	0:  "Onerep",
	1:  "Notonerep",
	2:  "Setrep",
	3:  "Oneloop",
	4:  "Notoneloop",
	5:  "Setloop",
	6:  "Onelazy",
	7:  "Notonelazy",
	8:  "Setlazy",
	9:  "One",
	10: "Notone",
	11: "Set",
	12: "Multi",
	13: "Ref",
	14: "Bol",
	15: "Eol",
	16: "Boundary",
	17: "Nonboundary",
	18: "Beginning",
	19: "Start",
	20: "EndZ",
	21: "End",
	22: "Nothing",
	23: "Empty",
	24: "Alternate",
	25: "Concatenate",
	26: "Loop",
	27: "Lazyloop",
	28: "Capture",
	29: "Group",
	30: "Require",
	31: "Prevent",
	32: "Greedy",
	33: "Testref",
	34: "Testgroup",
	35: "Unknown",
	36: "Unknown",
	37: "Unknown",
	38: "Unknown",
	39: "Unknown",
	40: "Unknown",
	41: "ECMABoundary",
	42: "NonECMABoundary",
}
|
|
|
|
func (n *regexNode) description() string {
|
|
buf := &bytes.Buffer{}
|
|
|
|
buf.WriteString(typeStr[n.t])
|
|
|
|
if (n.options & ExplicitCapture) != 0 {
|
|
buf.WriteString("-C")
|
|
}
|
|
if (n.options & IgnoreCase) != 0 {
|
|
buf.WriteString("-I")
|
|
}
|
|
if (n.options & RightToLeft) != 0 {
|
|
buf.WriteString("-L")
|
|
}
|
|
if (n.options & Multiline) != 0 {
|
|
buf.WriteString("-M")
|
|
}
|
|
if (n.options & Singleline) != 0 {
|
|
buf.WriteString("-S")
|
|
}
|
|
if (n.options & IgnorePatternWhitespace) != 0 {
|
|
buf.WriteString("-X")
|
|
}
|
|
if (n.options & ECMAScript) != 0 {
|
|
buf.WriteString("-E")
|
|
}
|
|
|
|
switch n.t {
|
|
case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
|
|
buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
|
|
break
|
|
case ntCapture:
|
|
buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
|
|
break
|
|
case ntRef, ntTestref:
|
|
buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
|
|
break
|
|
case ntMulti:
|
|
fmt.Fprintf(buf, "(String = %s)", string(n.str))
|
|
break
|
|
case ntSet, ntSetloop, ntSetlazy:
|
|
buf.WriteString("(Set = " + n.set.String() + ")")
|
|
break
|
|
}
|
|
|
|
switch n.t {
|
|
case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
|
|
buf.WriteString("(Min = ")
|
|
buf.WriteString(strconv.Itoa(n.m))
|
|
buf.WriteString(", Max = ")
|
|
if n.n == math.MaxInt32 {
|
|
buf.WriteString("inf")
|
|
} else {
|
|
buf.WriteString(strconv.Itoa(n.n))
|
|
}
|
|
buf.WriteString(")")
|
|
|
|
break
|
|
}
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
// padSpace is the run of spaces dump slices its per-line indentation from.
// dump indents by at most 32 columns, so the run must be at least 32 bytes
// long; a shorter run makes dump's slice expression panic.
var padSpace = bytes.Repeat([]byte{' '}, 32)
|
|
|
|
// Dump returns a multi-line debug rendering of the tree, one node per line,
// produced by walking the nodes from t.root.
// t.root must be non-nil: describing a node reads its fields.
func (t *RegexTree) Dump() string {
	return t.root.dump()
}
|
|
|
|
func (n *regexNode) dump() string {
|
|
var stack []int
|
|
CurNode := n
|
|
CurChild := 0
|
|
|
|
buf := bytes.NewBufferString(CurNode.description())
|
|
buf.WriteRune('\n')
|
|
|
|
for {
|
|
if CurNode.children != nil && CurChild < len(CurNode.children) {
|
|
stack = append(stack, CurChild+1)
|
|
CurNode = CurNode.children[CurChild]
|
|
CurChild = 0
|
|
|
|
Depth := len(stack)
|
|
if Depth > 32 {
|
|
Depth = 32
|
|
}
|
|
buf.Write(padSpace[:Depth])
|
|
buf.WriteString(CurNode.description())
|
|
buf.WriteRune('\n')
|
|
} else {
|
|
if len(stack) == 0 {
|
|
break
|
|
}
|
|
|
|
CurChild = stack[len(stack)-1]
|
|
stack = stack[:len(stack)-1]
|
|
CurNode = CurNode.next
|
|
}
|
|
}
|
|
return buf.String()
|
|
}
|