Pull request 1924: 6003-relax-rule-validation
Updates #6003. Squashed commit of the following: commit 1874860877662999d158631e3a25f8072c24f155 Author: Ainar Garipov <A.Garipov@AdGuard.COM> Date: Thu Jul 13 19:36:26 2023 +0300 filtering/rulelist: imp test commit 871a41af8039bf4d4fb139622d4296bcaff6729c Author: Ainar Garipov <A.Garipov@AdGuard.COM> Date: Thu Jul 13 19:10:35 2023 +0300 filtering/rulelist: relax validation
This commit is contained in:
parent
f22d893845
commit
2adc8624c0
|
@ -25,8 +25,9 @@ NOTE: Add new changes BELOW THIS COMMENT.
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
||||||
- `bufio.Scanner: token too long` errors when trying to add filtering-rule lists
|
- `bufio.Scanner: token too long` and other errors when trying to add
|
||||||
with lines over 1024 bytes long ([#6003]).
|
filtering-rule lists with lines over 1024 bytes long or containing cosmetic
|
||||||
|
rules ([#6003]).
|
||||||
|
|
||||||
### Removed
|
### Removed
|
||||||
|
|
||||||
|
|
|
@ -6,10 +6,9 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"hash/crc32"
|
"hash/crc32"
|
||||||
"io"
|
"io"
|
||||||
"unicode"
|
|
||||||
"unicode/utf8"
|
|
||||||
|
|
||||||
"github.com/AdguardTeam/golibs/errors"
|
"github.com/AdguardTeam/golibs/errors"
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Parser is a filtering-rule parser that collects data, such as the checksum
|
// Parser is a filtering-rule parser that collects data, such as the checksum
|
||||||
|
@ -105,13 +104,11 @@ func (p *Parser) processLine(dst io.Writer, line []byte, lineNum int) (n int, er
|
||||||
badIdx, isRule = p.parseLineTitle(trimmed)
|
badIdx, isRule = p.parseLineTitle(trimmed)
|
||||||
}
|
}
|
||||||
if badIdx != -1 {
|
if badIdx != -1 {
|
||||||
badRune, _ := utf8.DecodeRune(trimmed[badIdx:])
|
|
||||||
|
|
||||||
return 0, fmt.Errorf(
|
return 0, fmt.Errorf(
|
||||||
"line %d: character %d: non-printable character %q",
|
"line %d: character %d: likely binary character %q",
|
||||||
lineNum,
|
lineNum,
|
||||||
badIdx+bytes.Index(line, trimmed)+1,
|
badIdx+bytes.Index(line, trimmed)+1,
|
||||||
badRune,
|
trimmed[badIdx],
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -144,41 +141,37 @@ func hasPrefixFold(b, prefix []byte) (ok bool) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseLine returns true if the parsed line is a filtering rule. line is
|
// parseLine returns true if the parsed line is a filtering rule. line is
|
||||||
// assumed to be trimmed of whitespace characters. nonPrintIdx is the index of
|
// assumed to be trimmed of whitespace characters. badIdx is the index of the
|
||||||
// the first non-printable character, if any; if there are none, nonPrintIdx is
|
// first character that may indicate that this is a binary file, or -1 if none.
|
||||||
// -1.
|
|
||||||
//
|
//
|
||||||
// A line is considered a rule if it's not empty, not a comment, and contains
|
// A line is considered a rule if it's not empty, not a comment, and contains
|
||||||
// only printable characters.
|
// only printable characters.
|
||||||
func parseLine(line []byte) (nonPrintIdx int, isRule bool) {
|
func parseLine(line []byte) (badIdx int, isRule bool) {
|
||||||
if len(line) == 0 || line[0] == '#' || line[0] == '!' {
|
if len(line) == 0 || line[0] == '#' || line[0] == '!' {
|
||||||
return -1, false
|
return -1, false
|
||||||
}
|
}
|
||||||
|
|
||||||
nonPrintIdx = bytes.IndexFunc(line, isNotPrintable)
|
badIdx = slices.IndexFunc(line, likelyBinary)
|
||||||
|
|
||||||
return nonPrintIdx, nonPrintIdx == -1
|
return badIdx, badIdx == -1
|
||||||
}
|
}
|
||||||
|
|
||||||
// isNotPrintable returns true if r is not a printable character that can be
|
// likelyBinary returns true if b is likely to be a byte from a binary file.
|
||||||
// contained in a filtering rule.
|
func likelyBinary(b byte) (ok bool) {
|
||||||
func isNotPrintable(r rune) (ok bool) {
|
return (b < ' ' || b == 0x7f) && b != '\n' && b != '\r' && b != '\t'
|
||||||
// Tab isn't included into Unicode's graphic symbols, so include it here
|
|
||||||
// explicitly.
|
|
||||||
return r != '\t' && !unicode.IsGraphic(r)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseLineTitle is like [parseLine] but additionally looks for a title. line
|
// parseLineTitle is like [parseLine] but additionally looks for a title. line
|
||||||
// is assumed to be trimmed of whitespace characters.
|
// is assumed to be trimmed of whitespace characters.
|
||||||
func (p *Parser) parseLineTitle(line []byte) (nonPrintIdx int, isRule bool) {
|
func (p *Parser) parseLineTitle(line []byte) (badIdx int, isRule bool) {
|
||||||
if len(line) == 0 || line[0] == '#' {
|
if len(line) == 0 || line[0] == '#' {
|
||||||
return -1, false
|
return -1, false
|
||||||
}
|
}
|
||||||
|
|
||||||
if line[0] != '!' {
|
if line[0] != '!' {
|
||||||
nonPrintIdx = bytes.IndexFunc(line, isNotPrintable)
|
badIdx = slices.IndexFunc(line, likelyBinary)
|
||||||
|
|
||||||
return nonPrintIdx, nonPrintIdx == -1
|
return badIdx, badIdx == -1
|
||||||
}
|
}
|
||||||
|
|
||||||
const titlePattern = "! Title: "
|
const titlePattern = "! Title: "
|
||||||
|
|
|
@ -77,6 +77,14 @@ func TestParser_Parse(t *testing.T) {
|
||||||
wantTitle: "Test Title",
|
wantTitle: "Test Title",
|
||||||
wantRulesNum: 1,
|
wantRulesNum: 1,
|
||||||
wantWritten: len(testRuleTextBlocked),
|
wantWritten: len(testRuleTextBlocked),
|
||||||
|
}, {
|
||||||
|
name: "cosmetic_with_zwnj",
|
||||||
|
in: testRuleTextCosmetic,
|
||||||
|
wantDst: testRuleTextCosmetic,
|
||||||
|
wantErrMsg: "",
|
||||||
|
wantTitle: "",
|
||||||
|
wantRulesNum: 1,
|
||||||
|
wantWritten: len(testRuleTextCosmetic),
|
||||||
}, {
|
}, {
|
||||||
name: "bad_char",
|
name: "bad_char",
|
||||||
in: "! Title: Test Title \n" +
|
in: "! Title: Test Title \n" +
|
||||||
|
@ -85,7 +93,7 @@ func TestParser_Parse(t *testing.T) {
|
||||||
wantDst: testRuleTextBlocked,
|
wantDst: testRuleTextBlocked,
|
||||||
wantErrMsg: "line 3: " +
|
wantErrMsg: "line 3: " +
|
||||||
"character 4: " +
|
"character 4: " +
|
||||||
"non-printable character '\\x7f'",
|
"likely binary character '\\x7f'",
|
||||||
wantTitle: "Test Title",
|
wantTitle: "Test Title",
|
||||||
wantRulesNum: 1,
|
wantRulesNum: 1,
|
||||||
wantWritten: len(testRuleTextBlocked),
|
wantWritten: len(testRuleTextBlocked),
|
||||||
|
@ -215,6 +223,14 @@ func BenchmarkParser_Parse(b *testing.B) {
|
||||||
|
|
||||||
require.NoError(b, errSink)
|
require.NoError(b, errSink)
|
||||||
require.NotNil(b, resSink)
|
require.NotNil(b, resSink)
|
||||||
|
|
||||||
|
// Most recent result, on a ThinkPad X13 with a Ryzen Pro 7 CPU:
|
||||||
|
//
|
||||||
|
// goos: linux
|
||||||
|
// goarch: amd64
|
||||||
|
// pkg: github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist
|
||||||
|
// cpu: AMD Ryzen 7 PRO 4750U with Radeon Graphics
|
||||||
|
// BenchmarkParser_Parse-16 100000000 128.0 ns/op 48 B/op 1 allocs/op
|
||||||
}
|
}
|
||||||
|
|
||||||
func FuzzParser_Parse(f *testing.F) {
|
func FuzzParser_Parse(f *testing.F) {
|
||||||
|
@ -226,15 +242,17 @@ func FuzzParser_Parse(f *testing.F) {
|
||||||
"! Comment",
|
"! Comment",
|
||||||
"! Title ",
|
"! Title ",
|
||||||
"! Title XXX",
|
"! Title XXX",
|
||||||
|
testRuleTextBadTab,
|
||||||
|
testRuleTextBlocked,
|
||||||
|
testRuleTextCosmetic,
|
||||||
testRuleTextEtcHostsTab,
|
testRuleTextEtcHostsTab,
|
||||||
testRuleTextHTML,
|
testRuleTextHTML,
|
||||||
testRuleTextBlocked,
|
|
||||||
testRuleTextBadTab,
|
|
||||||
"1.2.3.4",
|
"1.2.3.4",
|
||||||
"1.2.3.4 etc-hosts.example",
|
"1.2.3.4 etc-hosts.example",
|
||||||
">>>\x00<<<",
|
">>>\x00<<<",
|
||||||
">>>\x7F<<<",
|
">>>\x7F<<<",
|
||||||
strings.Repeat("a", n+1),
|
strings.Repeat("a", rulelist.DefaultRuleBufSize+1),
|
||||||
|
strings.Repeat("a", bufio.MaxScanTokenSize+1),
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tc := range testCases {
|
for _, tc := range testCases {
|
||||||
|
|
|
@ -7,8 +7,13 @@ const testTimeout = 1 * time.Second
|
||||||
|
|
||||||
// Common texts for tests.
|
// Common texts for tests.
|
||||||
const (
|
const (
|
||||||
testRuleTextHTML = "<!DOCTYPE html>\n"
|
|
||||||
testRuleTextBlocked = "||blocked.example^\n"
|
|
||||||
testRuleTextBadTab = "||bad-tab-and-comment.example^\t# A comment.\n"
|
testRuleTextBadTab = "||bad-tab-and-comment.example^\t# A comment.\n"
|
||||||
|
testRuleTextBlocked = "||blocked.example^\n"
|
||||||
testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n"
|
testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n"
|
||||||
|
testRuleTextHTML = "<!DOCTYPE html>\n"
|
||||||
|
|
||||||
|
// testRuleTextCosmetic is a cosmetic rule with a zero-width non-joiner.
|
||||||
|
//
|
||||||
|
// See https://github.com/AdguardTeam/AdGuardHome/issues/6003.
|
||||||
|
testRuleTextCosmetic = "||cosmetic.example## :has-text(/\u200c/i)\n"
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue