Pull request 1924: 6003-relax-rule-validation
Updates #6003. Squashed commit of the following: commit 1874860877662999d158631e3a25f8072c24f155 Author: Ainar Garipov <A.Garipov@AdGuard.COM> Date: Thu Jul 13 19:36:26 2023 +0300 filtering/rulelist: imp test commit 871a41af8039bf4d4fb139622d4296bcaff6729c Author: Ainar Garipov <A.Garipov@AdGuard.COM> Date: Thu Jul 13 19:10:35 2023 +0300 filtering/rulelist: relax validation
This commit is contained in:
parent
f22d893845
commit
2adc8624c0
|
@ -25,8 +25,9 @@ NOTE: Add new changes BELOW THIS COMMENT.
|
|||
|
||||
### Fixed
|
||||
|
||||
- `bufio.Scanner: token too long` errors when trying to add filtering-rule lists
|
||||
with lines over 1024 bytes long ([#6003]).
|
||||
- `bufio.Scanner: token too long` and other errors when trying to add
|
||||
filtering-rule lists with lines over 1024 bytes long or containing cosmetic
|
||||
rules ([#6003]).
|
||||
|
||||
### Removed
|
||||
|
||||
|
|
|
@ -6,10 +6,9 @@ import (
|
|||
"fmt"
|
||||
"hash/crc32"
|
||||
"io"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/AdguardTeam/golibs/errors"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
// Parser is a filtering-rule parser that collects data, such as the checksum
|
||||
|
@ -105,13 +104,11 @@ func (p *Parser) processLine(dst io.Writer, line []byte, lineNum int) (n int, er
|
|||
badIdx, isRule = p.parseLineTitle(trimmed)
|
||||
}
|
||||
if badIdx != -1 {
|
||||
badRune, _ := utf8.DecodeRune(trimmed[badIdx:])
|
||||
|
||||
return 0, fmt.Errorf(
|
||||
"line %d: character %d: non-printable character %q",
|
||||
"line %d: character %d: likely binary character %q",
|
||||
lineNum,
|
||||
badIdx+bytes.Index(line, trimmed)+1,
|
||||
badRune,
|
||||
trimmed[badIdx],
|
||||
)
|
||||
}
|
||||
|
||||
|
@ -144,41 +141,37 @@ func hasPrefixFold(b, prefix []byte) (ok bool) {
|
|||
}
|
||||
|
||||
// parseLine returns true if the parsed line is a filtering rule. line is
|
||||
// assumed to be trimmed of whitespace characters. nonPrintIdx is the index of
|
||||
// the first non-printable character, if any; if there are none, nonPrintIdx is
|
||||
// -1.
|
||||
// assumed to be trimmed of whitespace characters. badIdx is the index of the
|
||||
// first character that may indicate that this is a binary file, or -1 if none.
|
||||
//
|
||||
// A line is considered a rule if it's not empty, not a comment, and contains
|
||||
// only printable characters.
|
||||
func parseLine(line []byte) (nonPrintIdx int, isRule bool) {
|
||||
func parseLine(line []byte) (badIdx int, isRule bool) {
|
||||
if len(line) == 0 || line[0] == '#' || line[0] == '!' {
|
||||
return -1, false
|
||||
}
|
||||
|
||||
nonPrintIdx = bytes.IndexFunc(line, isNotPrintable)
|
||||
badIdx = slices.IndexFunc(line, likelyBinary)
|
||||
|
||||
return nonPrintIdx, nonPrintIdx == -1
|
||||
return badIdx, badIdx == -1
|
||||
}
|
||||
|
||||
// isNotPrintable returns true if r is not a printable character that can be
|
||||
// contained in a filtering rule.
|
||||
func isNotPrintable(r rune) (ok bool) {
|
||||
// Tab isn't included into Unicode's graphic symbols, so include it here
|
||||
// explicitly.
|
||||
return r != '\t' && !unicode.IsGraphic(r)
|
||||
// likelyBinary returns true if b is likely to be a byte from a binary file.
|
||||
func likelyBinary(b byte) (ok bool) {
|
||||
return (b < ' ' || b == 0x7f) && b != '\n' && b != '\r' && b != '\t'
|
||||
}
|
||||
|
||||
// parseLineTitle is like [parseLine] but additionally looks for a title. line
|
||||
// is assumed to be trimmed of whitespace characters.
|
||||
func (p *Parser) parseLineTitle(line []byte) (nonPrintIdx int, isRule bool) {
|
||||
func (p *Parser) parseLineTitle(line []byte) (badIdx int, isRule bool) {
|
||||
if len(line) == 0 || line[0] == '#' {
|
||||
return -1, false
|
||||
}
|
||||
|
||||
if line[0] != '!' {
|
||||
nonPrintIdx = bytes.IndexFunc(line, isNotPrintable)
|
||||
badIdx = slices.IndexFunc(line, likelyBinary)
|
||||
|
||||
return nonPrintIdx, nonPrintIdx == -1
|
||||
return badIdx, badIdx == -1
|
||||
}
|
||||
|
||||
const titlePattern = "! Title: "
|
||||
|
|
|
@ -77,6 +77,14 @@ func TestParser_Parse(t *testing.T) {
|
|||
wantTitle: "Test Title",
|
||||
wantRulesNum: 1,
|
||||
wantWritten: len(testRuleTextBlocked),
|
||||
}, {
|
||||
name: "cosmetic_with_zwnj",
|
||||
in: testRuleTextCosmetic,
|
||||
wantDst: testRuleTextCosmetic,
|
||||
wantErrMsg: "",
|
||||
wantTitle: "",
|
||||
wantRulesNum: 1,
|
||||
wantWritten: len(testRuleTextCosmetic),
|
||||
}, {
|
||||
name: "bad_char",
|
||||
in: "! Title: Test Title \n" +
|
||||
|
@ -85,7 +93,7 @@ func TestParser_Parse(t *testing.T) {
|
|||
wantDst: testRuleTextBlocked,
|
||||
wantErrMsg: "line 3: " +
|
||||
"character 4: " +
|
||||
"non-printable character '\\x7f'",
|
||||
"likely binary character '\\x7f'",
|
||||
wantTitle: "Test Title",
|
||||
wantRulesNum: 1,
|
||||
wantWritten: len(testRuleTextBlocked),
|
||||
|
@ -215,6 +223,14 @@ func BenchmarkParser_Parse(b *testing.B) {
|
|||
|
||||
require.NoError(b, errSink)
|
||||
require.NotNil(b, resSink)
|
||||
|
||||
// Most recent result, on a ThinkPad X13 with a Ryzen Pro 7 CPU:
|
||||
//
|
||||
// goos: linux
|
||||
// goarch: amd64
|
||||
// pkg: github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist
|
||||
// cpu: AMD Ryzen 7 PRO 4750U with Radeon Graphics
|
||||
// BenchmarkParser_Parse-16 100000000 128.0 ns/op 48 B/op 1 allocs/op
|
||||
}
|
||||
|
||||
func FuzzParser_Parse(f *testing.F) {
|
||||
|
@ -226,15 +242,17 @@ func FuzzParser_Parse(f *testing.F) {
|
|||
"! Comment",
|
||||
"! Title ",
|
||||
"! Title XXX",
|
||||
testRuleTextBadTab,
|
||||
testRuleTextBlocked,
|
||||
testRuleTextCosmetic,
|
||||
testRuleTextEtcHostsTab,
|
||||
testRuleTextHTML,
|
||||
testRuleTextBlocked,
|
||||
testRuleTextBadTab,
|
||||
"1.2.3.4",
|
||||
"1.2.3.4 etc-hosts.example",
|
||||
">>>\x00<<<",
|
||||
">>>\x7F<<<",
|
||||
strings.Repeat("a", n+1),
|
||||
strings.Repeat("a", rulelist.DefaultRuleBufSize+1),
|
||||
strings.Repeat("a", bufio.MaxScanTokenSize+1),
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
|
|
|
@ -7,8 +7,13 @@ const testTimeout = 1 * time.Second
|
|||
|
||||
// Common texts for tests.
|
||||
const (
|
||||
testRuleTextHTML = "<!DOCTYPE html>\n"
|
||||
testRuleTextBlocked = "||blocked.example^\n"
|
||||
testRuleTextBadTab = "||bad-tab-and-comment.example^\t# A comment.\n"
|
||||
testRuleTextBlocked = "||blocked.example^\n"
|
||||
testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n"
|
||||
testRuleTextHTML = "<!DOCTYPE html>\n"
|
||||
|
||||
// testRuleTextCosmetic is a cosmetic rule with a zero-width non-joiner.
|
||||
//
|
||||
// See https://github.com/AdguardTeam/AdGuardHome/issues/6003.
|
||||
testRuleTextCosmetic = "||cosmetic.example## :has-text(/\u200c/i)\n"
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue