dnsfilter -- avoid using regexps when simple suffix match is enough.

This covers 96.98% of all adguard dns rules.
This commit is contained in:
Eugene Bujak 2018-10-04 13:19:43 +03:00
parent 9e939e5754
commit cb97a254a5
3 changed files with 94 additions and 5 deletions

View File

@ -67,6 +67,10 @@ type rule struct {
// user-supplied data
listID uint32
// suffix matching
isSuffix bool
suffix string
// compiled regexp
compiled *regexp.Regexp
@ -387,12 +391,21 @@ func (rule *rule) extractShortcut() {
func (rule *rule) compile() error {
rule.RLock()
isCompiled := rule.compiled != nil
isCompiled := rule.isSuffix || rule.compiled != nil
rule.RUnlock()
if isCompiled {
return nil
}
isSuffix, suffix := getSuffix(rule.text)
if isSuffix {
rule.Lock()
rule.isSuffix = isSuffix
rule.suffix = suffix
rule.Unlock()
return nil
}
expr, err := ruleToRegexp(rule.text)
if err != nil {
return err
@ -417,7 +430,16 @@ func (rule *rule) match(host string) (Result, error) {
return res, err
}
rule.RLock()
matched := rule.compiled.MatchString(host)
matched := false
if rule.isSuffix {
if host == rule.suffix {
matched = true
} else if strings.HasSuffix(host, "."+rule.suffix) {
matched = true
}
} else {
matched = rule.compiled.MatchString(host)
}
rule.RUnlock()
if matched {
res.Reason = FilteredBlackList

View File

@ -195,7 +195,7 @@ func TestRuleToRegexp(t *testing.T) {
{"/doubleclick/", "doubleclick", nil},
{"/", "", ErrInvalidSyntax},
{`|double*?.+[]|(){}#$\|`, `^double.*\?\.\+\[\]\|\(\)\{\}\#\$\\$`, nil},
{`||doubleclick.net^`, `^([a-z0-9-_.]+\.)?doubleclick\.net([^ a-zA-Z0-9.%]|$)`, nil},
{`||doubleclick.net^`, `(?:^|\.)doubleclick\.net$`, nil},
}
for _, testcase := range tests {
converted, err := ruleToRegexp(testcase.rule)
@ -208,6 +208,38 @@ func TestRuleToRegexp(t *testing.T) {
}
}
func TestSuffixRule(t *testing.T) {
for _, testcase := range []struct {
rule string
isSuffix bool
suffix string
}{
{`||doubleclick.net^`, true, `doubleclick.net`}, // entire string or subdomain match
{`||doubleclick.net|`, true, `doubleclick.net`}, // entire string or subdomain match
{`|doubleclick.net^`, false, ``}, // TODO: ends with doubleclick.net
{`*doubleclick.net^`, false, ``}, // TODO: ends with doubleclick.net
{`doubleclick.net^`, false, ``}, // TODO: ends with doubleclick.net
{`|*doubleclick.net^`, false, ``}, // TODO: ends with doubleclick.net
{`||*doubleclick.net^`, false, ``}, // TODO: ends with doubleclick.net
{`||*doubleclick.net|`, false, ``}, // TODO: ends with doubleclick.net
{`||*doublec*lick.net^`, false, ``}, // has a wildcard inside, has to be regexp
{`||*doublec|lick.net^`, false, ``}, // has a special symbol inside, has to be regexp
{`/abracadabra/`, false, ``}, // regexp, not anchored
{`/abracadabra$/`, false, ``}, // TODO: simplify simple suffix regexes
} {
isSuffix, suffix := getSuffix(testcase.rule)
if testcase.isSuffix != isSuffix {
t.Errorf("Results do not match for \"%s\": got %v expected %v", testcase.rule, isSuffix, testcase.isSuffix)
continue
}
if testcase.isSuffix && testcase.suffix != suffix {
t.Errorf("Result suffix does not match for \"%s\": got \"%s\" expected \"%s\"", testcase.rule, suffix, testcase.suffix)
continue
}
// trace("\"%s\": %v: %s", testcase.rule, isSuffix, suffix)
}
}
//
// helper functions
//

View File

@ -5,8 +5,8 @@ import (
)
func ruleToRegexp(rule string) (string, error) {
const hostStart = "^([a-z0-9-_.]+\\.)?"
const hostEnd = "([^ a-zA-Z0-9.%]|$)"
const hostStart = `(?:^|\.)`
const hostEnd = `$`
// empty or short rule -- do nothing
if !isValidRule(rule) {
@ -49,3 +49,38 @@ func ruleToRegexp(rule string) (string, error) {
return sb.String(), nil
}
// handle suffix rule ||example.com^ -- either entire string is example.com or *.example.com
func getSuffix(rule string) (bool, string) {
// if starts with / and ends with /, it's already a regexp
// TODO: if a regexp is simple `/abracadabra$/`, then simplify it maybe?
if rule[0] == '/' && rule[len(rule)-1] == '/' {
return false, ""
}
// must start with ||
if rule[0] != '|' || rule[1] != '|' {
return false, ""
}
rule = rule[2:]
// suffix rule must end with ^ or |
lastChar := rule[len(rule)-1]
if lastChar != '^' && lastChar != '|' {
return false, ""
}
// last char was checked, eat it
rule = rule[:len(rule)-1]
// check that it doesn't have any special characters inside
for _, r := range rule {
switch r {
case '|':
return false, ""
case '*':
return false, ""
}
}
return true, rule
}