diff --git a/CHANGELOG.md b/CHANGELOG.md index 851caebd..fb6c12b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,8 +25,9 @@ NOTE: Add new changes BELOW THIS COMMENT. ### Fixed -- `bufio.Scanner: token too long` errors when trying to add filtering-rule lists - with lines over 1024 bytes long ([#6003]). +- `bufio.Scanner: token too long` and other errors when trying to add + filtering-rule lists with lines over 1024 bytes long or containing cosmetic + rules ([#6003]). ### Removed diff --git a/internal/filtering/rulelist/parser.go b/internal/filtering/rulelist/parser.go index e98567d8..24d19b9c 100644 --- a/internal/filtering/rulelist/parser.go +++ b/internal/filtering/rulelist/parser.go @@ -6,10 +6,9 @@ import ( "fmt" "hash/crc32" "io" - "unicode" - "unicode/utf8" "github.com/AdguardTeam/golibs/errors" + "golang.org/x/exp/slices" ) // Parser is a filtering-rule parser that collects data, such as the checksum @@ -105,13 +104,11 @@ func (p *Parser) processLine(dst io.Writer, line []byte, lineNum int) (n int, er badIdx, isRule = p.parseLineTitle(trimmed) } if badIdx != -1 { - badRune, _ := utf8.DecodeRune(trimmed[badIdx:]) - return 0, fmt.Errorf( - "line %d: character %d: non-printable character %q", + "line %d: character %d: likely binary character %q", lineNum, badIdx+bytes.Index(line, trimmed)+1, - badRune, + trimmed[badIdx], ) } @@ -144,41 +141,37 @@ func hasPrefixFold(b, prefix []byte) (ok bool) { } // parseLine returns true if the parsed line is a filtering rule. line is -// assumed to be trimmed of whitespace characters. nonPrintIdx is the index of -// the first non-printable character, if any; if there are none, nonPrintIdx is -// -1. +// assumed to be trimmed of whitespace characters. badIdx is the index of the +// first character that may indicate that this is a binary file, or -1 if none. // // A line is considered a rule if it's not empty, not a comment, and contains // only printable characters. 
-func parseLine(line []byte) (nonPrintIdx int, isRule bool) { +func parseLine(line []byte) (badIdx int, isRule bool) { if len(line) == 0 || line[0] == '#' || line[0] == '!' { return -1, false } - nonPrintIdx = bytes.IndexFunc(line, isNotPrintable) + badIdx = slices.IndexFunc(line, likelyBinary) - return nonPrintIdx, nonPrintIdx == -1 + return badIdx, badIdx == -1 } -// isNotPrintable returns true if r is not a printable character that can be -// contained in a filtering rule. -func isNotPrintable(r rune) (ok bool) { - // Tab isn't included into Unicode's graphic symbols, so include it here - // explicitly. - return r != '\t' && !unicode.IsGraphic(r) +// likelyBinary returns true if b is likely to be a byte from a binary file. +func likelyBinary(b byte) (ok bool) { + return (b < ' ' || b == 0x7f) && b != '\n' && b != '\r' && b != '\t' } // parseLineTitle is like [parseLine] but additionally looks for a title. line // is assumed to be trimmed of whitespace characters. -func (p *Parser) parseLineTitle(line []byte) (nonPrintIdx int, isRule bool) { +func (p *Parser) parseLineTitle(line []byte) (badIdx int, isRule bool) { if len(line) == 0 || line[0] == '#' { return -1, false } if line[0] != '!' { - nonPrintIdx = bytes.IndexFunc(line, isNotPrintable) + badIdx = slices.IndexFunc(line, likelyBinary) - return nonPrintIdx, nonPrintIdx == -1 + return badIdx, badIdx == -1 } const titlePattern = "! 
Title: " diff --git a/internal/filtering/rulelist/parser_test.go b/internal/filtering/rulelist/parser_test.go index c04d67ca..3ca3565d 100644 --- a/internal/filtering/rulelist/parser_test.go +++ b/internal/filtering/rulelist/parser_test.go @@ -77,6 +77,14 @@ func TestParser_Parse(t *testing.T) { wantTitle: "Test Title", wantRulesNum: 1, wantWritten: len(testRuleTextBlocked), + }, { + name: "cosmetic_with_zwnj", + in: testRuleTextCosmetic, + wantDst: testRuleTextCosmetic, + wantErrMsg: "", + wantTitle: "", + wantRulesNum: 1, + wantWritten: len(testRuleTextCosmetic), }, { name: "bad_char", in: "! Title: Test Title \n" + @@ -85,7 +93,7 @@ func TestParser_Parse(t *testing.T) { wantDst: testRuleTextBlocked, wantErrMsg: "line 3: " + "character 4: " + - "non-printable character '\\x7f'", + "likely binary character '\\x7f'", wantTitle: "Test Title", wantRulesNum: 1, wantWritten: len(testRuleTextBlocked), @@ -215,6 +223,14 @@ func BenchmarkParser_Parse(b *testing.B) { require.NoError(b, errSink) require.NotNil(b, resSink) + + // Most recent result, on a ThinkPad X13 with a Ryzen 7 PRO CPU: + // + // goos: linux + // goarch: amd64 + // pkg: github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist + // cpu: AMD Ryzen 7 PRO 4750U with Radeon Graphics + // BenchmarkParser_Parse-16 100000000 128.0 ns/op 48 B/op 1 allocs/op } func FuzzParser_Parse(f *testing.F) { @@ -226,15 +242,17 @@ func FuzzParser_Parse(f *testing.F) { "! Comment", "! Title ", "! 
Title XXX", + testRuleTextBadTab, + testRuleTextBlocked, + testRuleTextCosmetic, testRuleTextEtcHostsTab, testRuleTextHTML, - testRuleTextBlocked, - testRuleTextBadTab, "1.2.3.4", "1.2.3.4 etc-hosts.example", ">>>\x00<<<", ">>>\x7F<<<", - strings.Repeat("a", n+1), + strings.Repeat("a", rulelist.DefaultRuleBufSize+1), + strings.Repeat("a", bufio.MaxScanTokenSize+1), } for _, tc := range testCases { diff --git a/internal/filtering/rulelist/rulelist_test.go b/internal/filtering/rulelist/rulelist_test.go index 0c3a3b84..aec6f33b 100644 --- a/internal/filtering/rulelist/rulelist_test.go +++ b/internal/filtering/rulelist/rulelist_test.go @@ -7,8 +7,13 @@ const testTimeout = 1 * time.Second // Common texts for tests. const ( - testRuleTextHTML = "\n" - testRuleTextBlocked = "||blocked.example^\n" testRuleTextBadTab = "||bad-tab-and-comment.example^\t# A comment.\n" + testRuleTextBlocked = "||blocked.example^\n" testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n" + testRuleTextHTML = "\n" + + // testRuleTextCosmetic is a cosmetic rule with a zero-width non-joiner. + // + // See https://github.com/AdguardTeam/AdGuardHome/issues/6003. + testRuleTextCosmetic = "||cosmetic.example## :has-text(/\u200c/i)\n" )