使用PEG实现一个高性能的过滤器

说实话大家如果开发过一些研发工具的话或多或少会遇到定制过滤器（Filter）的能力，主要场景比如动态的匹配一些流量，动态匹配一些规则等，为此通常平台方会简单提供一些过滤能力比如仅支持and 操作，不支持复杂的逻辑判断，或者引入lua 、 python、JavaScript 这种动态语言来解决此问题，但是这些性能都很差，为此可以自己实现一个过滤表达式，性能也可以做到最优解！

介绍

这里我们要实现一个逻辑表达式，大概大家懂了我的想法了吧，我这里实现了一个抓取流量的工具，但是需要实现一个过滤的表达式去实现请求过滤！

1	host='www.douyin.com' and http_code in (400, 500)

或者

1	host='www.douyin.com' and (http_code = 400 or http_code = 500)

这里我实现的比较复杂，支持类似于

1	a='1' and b='2' or (c=3 and (d='4' or f='5') and (x='6' or y='7' and z in (1,2)))

定义数据结构（AST）

其实实现parser的第一步就是需要定义数据结构，需要想到怎么描述这个AST，简单的规则说实话直接搞个数组就行了，但是对于()分组规则呢？对于and 和 or 混合使用呢？所以它是一个单链表！

type Conditions struct {
	Group    *Conditions `json:"group,omitempty"`  // 当前节点可能是一个group
	Value    *Condition  `json:"value,omitempty"`
	Logical  string      `json:"logical,omitempty"` // 逻辑符号 and / or
	Next     *Conditions `json:"next,omitempty"` // 下一个节点
}

type Condition struct {
	Key      string   `json:"key,omitempty"`
	Operator string   `json:"operator,omitempty"` // 运算符号 > / < / = / in / !=
	Value    string   `json:"value,omitempty"`
	Values   []string `json:"values,omitempty"` // operator=in 使用此字段
}

定义 PEG

这里我使用的PEG库是 https://github.com/mna/pigeon 这个

{
package parser

type Conditions struct {
	Group    *Conditions `json:"group,omitempty"`
	Value    *Condition  `json:"value,omitempty"`
	Logical  string      `json:"logical,omitempty"` // 逻辑符号 and / or
	Next     *Conditions `json:"next,omitempty"`
}

type Condition struct {
	Key      string   `json:"key,omitempty"`
	Operator string   `json:"operator,omitempty"` // 运算符号 > / < / = / in / !=
	Value    string   `json:"value,omitempty"`
	Values   []string `json:"values,omitempty"`
}

}

Grammar <- values:Conditions+ EOF {
	return values.([]interface{})[0].(*Conditions), nil
} / SyntaxError


SyntaxError <- . {
	return nil, errors.New("parser: syntax error")
}


ConditionGroup <- '(' __ values: Conditions* __ ')' {
    if len(values.([]interface{})) == 0 {
        return nil, nil
    }
    return values.([]interface{})[0].(*Conditions), nil
}

Conditions <- values:( ( Condition / ConditionGroup ) __ LogicalOperator? __ )+ {
	head := &Conditions{}
	cur := head
	lastIndex := len(values.([]interface{})) - 1
	for index, value := range values.([]interface{}) {
		args := value.([]interface{})
		switch arg0 := args[0].(type) {
		case *Condition:
			cur.Value = arg0
		case *Conditions:
			cur.Group = arg0
		}
		cur.Logical, _ = args[2].(string)
		if index == lastIndex {
			break
		}
		cur.Next = &Conditions{}
		cur = cur.Next
	}
	return head, nil
}

LogicalOperator <- ( "and" / "or" ) {
    return string(c.text), nil
}

Condition <- key:Identifier __ op:Operator __ value:( Double / Integer / String / List) {
	ret := &Condition{
		Key:      key.(string),
		Operator: op.(string),
	}
	if vs, isOK := value.([]string); isOK {
		ret.Values = vs
	}
	if v, isOK := value.(string); isOK {
		ret.Value = v
	}
	return ret, nil
}

Integer <- '-'? Digit+ {
	if _, err := strconv.ParseInt(string(c.text), 10, 64); err != nil {
		return nil, err
	}
	return string(c.text), nil
}

Double ← [+-]? Digit+ '.' Digit+ {
	if _, err := strconv.ParseFloat(string(c.text), 64); err != nil {
		return nil, err
	}
	return string(c.text), nil
}

String <- Literal

List <- '(' __ values:(Literal __ ListSeparator? __)* __ ')' {
	result := make([]string, 0)
	for _, value := range values.([]interface{}) {
		args, _ := value.([]interface{})
		result = append(result, args[0].(string))
	}
	return result, nil
}

Literal <- (('"' (`\"` / [^"])* '"') / ('\'' (`\'` / [^'])* '\'')) {
	if len(c.text) == 0 {
		return "", nil
	}
    switch c.text[0] {
    case '\'':
        return strings.Replace(string(c.text[1:len(c.text)-1]), `\'`, `'`, -1), nil
    case '"':
        return strings.Replace(string(c.text[1:len(c.text)-1]), `\"`, `"`, -1), nil
    }
    return string(c.text) ,nil
}


Operator <- ( "in" / ">=" / "<=" / "!=" / "=" / ">" / "<" ) {
    return string(c.text), nil
}

Identifier <- (Letter / '_')+ (Letter / Digit / '.' / '_' / '[' / ']' )* {
	return string(c.text), nil
}

ListSeparator <- [,]
Letter <- [A-Za-z]
Digit <- [0-9]

__ <- (_)*
_ <- [ \n\t\r]+

EOF <- !.

编译

1	pigeon -o condition.peg.go ./condition.peg

注意

很多时候匹配失败可能是规则顺序的问题！例如 Double / Integer 如果改成 Integer / Double 就会报错对于 a=1.1 这种case！因为啥呢？

Integer <- '-'? Digit+ {
	if _, err := strconv.ParseInt(string(c.text), 10, 64); err != nil {
		return nil, err
	}
	return string(c.text), nil
}

Double ← [+-]? Digit+ '.' Digit+ {
	if _, err := strconv.ParseFloat(string(c.text), 64); err != nil {
		return nil, err
	}
	return string(c.text), nil
}

我们发现这俩规则基本上一样，如果是Integer/Double 那么匹配是惰性匹配，它不会贪心的去搜索，直接找到1 就去处理了，然后成功，但是剩余了.1 导致解析失败了, 他不会走 Double 这个分支解析，这也就是为啥会失败了，我们应该注意优先顺序！

获取AST

package parser

import (
	"encoding/json"
	"testing"
)

func TestParse(t *testing.T) {
	input := `a='1' and b='2' or (c=3 and (d='4' or f='5') and (x='6' or y='7' and z in (1,2)))`
	parse, err := Parse("", []byte(input))
	if err != nil {
		t.Fatal(err)
	}
	marshal, _ := json.MarshalIndent(parse, "", "\t")
	t.Log(string(marshal))
}

输出

{
    "value": {
        "key": "a",
        "operator": "=",
        "value": "1"
    },
    "logical": "and",
    "next": {
        "value": {
            "key": "b",
            "operator": "=",
            "value": "2"
        },
        "logical": "or",
        "next": {
            "group": {
                "value": {
                    "key": "c",
                    "operator": "=",
                    "value": "3"
                },
                "logical": "and",
                "next": {
                    "group": {
                        "value": {
                            "key": "d",
                            "operator": "=",
                            "value": "4"
                        },
                        "logical": "or",
                        "next": {
                            "value": {
                                "key": "f",
                                "operator": "=",
                                "value": "5"
                            }
                        }
                    },
                    "logical": "and",
                    "next": {
                        "group": {
                            "value": {
                                "key": "x",
                                "operator": "=",
                                "value": "6"
                            },
                            "logical": "or",
                            "next": {
                                "value": {
                                    "key": "y",
                                    "operator": "=",
                                    "value": "7"
                                },
                                "logical": "and",
                                "next": {
                                    "value": {
                                        "key": "z",
                                        "operator": "in",
                                        "values": [
                                            "1",
                                            "2"
                                        ]
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}

封装过滤器

封装逻辑

type MapInput map[string]string

func (m MapInput) Get(key string) string {
	return m[key]
}

// 抽象一个Input接口
type Input interface {
	Get(key string) string
}

// 逻辑判断
func (c *Conditions) Result(intput Input) bool {
	var result bool
	if c.Value != nil {
		result = c.Value.Result(intput)
	} else if c.Group != nil {
		result = c.Group.Result(intput)
	} else {
    result = true // 什么也没定义，类似于 () 这种
	}
	if c.Next == nil {
		return result
	}
	if c.Logical == "and" {
		return result && c.Next.Result(intput)
	}
	return result || c.Next.Result(intput)
}

// 逻辑判断
func (c *Condition) Result(input Input) bool {
	value := input.Get(c.Key)
	switch c.Operator {
	case "in":
		return utils.Contains(c.Values, value)
	case "=":
		return value == c.Value
	case "!=":
		return value != c.Value
	case ">":
		return value > c.Value
	case "<":
		return value < c.Value
	case ">=":
		return value >= c.Value
	case "<=":
		return value <= c.Value
	default:
		panic(fmt.Sprintf(`invalid operator (%s)`, c.Operator))
	}
}

测试

func TestRule(t *testing.T) {
	intput := MapInput{
		"a": "1",
		"b": "2",
		"c": "3",
		"d": "4",
		"e": "5.5",
		"f": "xiaoming",
	}
	rule, err := ParseRule(`a = 1 and b = 2 and ( c = 4 or ( d = 4 and e in ( 5.5 , 6.6 ) ) ) and f = "xiaoming"`)
	if err != nil {
		t.Fatal(err)
	}
	result := rule.Result(intput)
	t.Log(result)
}

func BenchmarkRule(b *testing.B) {
	b.StopTimer()
	intput := MapInput{
		"a": "1",
		"b": "2",
		"c": "3",
		"d": "4",
		"e": "5.5",
		"f": "xiaoming",
	}
	parseRule, err := ParseRule(`a = 1 and b = 2 and ( c = 4 or ( d = 4 and e in ( 5.5 , 6.6 ) ) ) and f = "xiaoming"`)
	if err != nil {
		b.Fatal(err)
	}
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		if !parseRule.Result(intput) {
			b.Fatal("must ture")
		}
	}
}

benchmark

~/go/src/github.com/anthony-dong/golang go test  -v -run=none -bench=BenchmarkRule -count=5 -benchmem ./pkg/filter/...
goos: linux
goarch: amd64
pkg: github.com/anthony-dong/golang/pkg/filter
cpu: Intel(R) Xeon(R) Platinum 8260 CPU @ 2.40GHz
BenchmarkRule
BenchmarkRule-8   	 5362532	       224.3 ns/op	       0 B/op	       0 allocs/op
BenchmarkRule-8   	 5361560	       219.5 ns/op	       0 B/op	       0 allocs/op
BenchmarkRule-8   	 5391529	       224.4 ns/op	       0 B/op	       0 allocs/op
BenchmarkRule-8   	 5425616	       221.1 ns/op	       0 B/op	       0 allocs/op
BenchmarkRule-8   	 5365962	       222.1 ns/op	       0 B/op	       0 allocs/op
PASS
ok  	github.com/anthony-dong/golang/pkg/filter	7.138s

可以看到性能是非常的高，也基本没啥开销！

总结

说实话我实现的这个过滤表达式其实并不难，但是难的是实际上是PEG 、BNF 、ANTLR4、Lex/YACC 这种词法生成器！有兴趣的同学可以尝试实现一个词法生成器，可以深入理解编译原理！

相关代码实现可以看这里：https://github.com/Anthony-Dong/golang/tree/master/pkg/filter