From 2adc8624c0bd589a9efab564297bab77dde17ac8 Mon Sep 17 00:00:00 2001
From: Ainar Garipov <a.garipov@adguard.com>
Date: Thu, 13 Jul 2023 19:43:53 +0300
Subject: [PATCH] Pull request 1924: 6003-relax-rule-validation

Updates #6003.

Squashed commit of the following:

commit 1874860877662999d158631e3a25f8072c24f155
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 19:36:26 2023 +0300

    filtering/rulelist: imp test

commit 871a41af8039bf4d4fb139622d4296bcaff6729c
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 19:10:35 2023 +0300

    filtering/rulelist: relax validation
---
 CHANGELOG.md                                 |  5 +--
 internal/filtering/rulelist/parser.go        | 35 ++++++++------------
 internal/filtering/rulelist/parser_test.go   | 26 ++++++++++++---
 internal/filtering/rulelist/rulelist_test.go |  9 +++--
 4 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 851caebd..fb6c12b1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,8 +25,9 @@ NOTE: Add new changes BELOW THIS COMMENT.
 
 ### Fixed
 
-- `bufio.Scanner: token too long` errors when trying to add filtering-rule lists
-  with lines over 1024 bytes long ([#6003]).
+- `bufio.Scanner: token too long` and other errors when trying to add
+  filtering-rule lists with lines over 1024 bytes long or containing cosmetic
+  rules ([#6003]).
 
 ### Removed
 
diff --git a/internal/filtering/rulelist/parser.go b/internal/filtering/rulelist/parser.go
index e98567d8..24d19b9c 100644
--- a/internal/filtering/rulelist/parser.go
+++ b/internal/filtering/rulelist/parser.go
@@ -6,10 +6,9 @@ import (
 	"fmt"
 	"hash/crc32"
 	"io"
-	"unicode"
-	"unicode/utf8"
 
 	"github.com/AdguardTeam/golibs/errors"
+	"golang.org/x/exp/slices"
 )
 
 // Parser is a filtering-rule parser that collects data, such as the checksum
@@ -105,13 +104,11 @@ func (p *Parser) processLine(dst io.Writer, line []byte, lineNum int) (n int, er
 		badIdx, isRule = p.parseLineTitle(trimmed)
 	}
 	if badIdx != -1 {
-		badRune, _ := utf8.DecodeRune(trimmed[badIdx:])
-
 		return 0, fmt.Errorf(
-			"line %d: character %d: non-printable character %q",
+			"line %d: character %d: likely binary character %q",
 			lineNum,
 			badIdx+bytes.Index(line, trimmed)+1,
-			badRune,
+			trimmed[badIdx],
 		)
 	}
 
@@ -144,41 +141,37 @@ func hasPrefixFold(b, prefix []byte) (ok bool) {
 }
 
 // parseLine returns true if the parsed line is a filtering rule.  line is
-// assumed to be trimmed of whitespace characters.  nonPrintIdx is the index of
-// the first non-printable character, if any; if there are none, nonPrintIdx is
-// -1.
+// assumed to be trimmed of whitespace characters.  badIdx is the index of the
+// first character that may indicate that this is a binary file, or -1 if none.
 //
 // A line is considered a rule if it's not empty, not a comment, and contains
 // only printable characters.
-func parseLine(line []byte) (nonPrintIdx int, isRule bool) {
+func parseLine(line []byte) (badIdx int, isRule bool) {
 	if len(line) == 0 || line[0] == '#' || line[0] == '!' {
 		return -1, false
 	}
 
-	nonPrintIdx = bytes.IndexFunc(line, isNotPrintable)
+	badIdx = slices.IndexFunc(line, likelyBinary)
 
-	return nonPrintIdx, nonPrintIdx == -1
+	return badIdx, badIdx == -1
 }
 
-// isNotPrintable returns true if r is not a printable character that can be
-// contained in a filtering rule.
-func isNotPrintable(r rune) (ok bool) {
-	// Tab isn't included into Unicode's graphic symbols, so include it here
-	// explicitly.
-	return r != '\t' && !unicode.IsGraphic(r)
+// likelyBinary returns true if b is likely to be a byte from a binary file.
+func likelyBinary(b byte) (ok bool) {
+	return (b < ' ' || b == 0x7f) && b != '\n' && b != '\r' && b != '\t'
 }
 
 // parseLineTitle is like [parseLine] but additionally looks for a title.  line
 // is assumed to be trimmed of whitespace characters.
-func (p *Parser) parseLineTitle(line []byte) (nonPrintIdx int, isRule bool) {
+func (p *Parser) parseLineTitle(line []byte) (badIdx int, isRule bool) {
 	if len(line) == 0 || line[0] == '#' {
 		return -1, false
 	}
 
 	if line[0] != '!' {
-		nonPrintIdx = bytes.IndexFunc(line, isNotPrintable)
+		badIdx = slices.IndexFunc(line, likelyBinary)
 
-		return nonPrintIdx, nonPrintIdx == -1
+		return badIdx, badIdx == -1
 	}
 
 	const titlePattern = "! Title: "
diff --git a/internal/filtering/rulelist/parser_test.go b/internal/filtering/rulelist/parser_test.go
index c04d67ca..3ca3565d 100644
--- a/internal/filtering/rulelist/parser_test.go
+++ b/internal/filtering/rulelist/parser_test.go
@@ -77,6 +77,14 @@ func TestParser_Parse(t *testing.T) {
 		wantTitle:    "Test Title",
 		wantRulesNum: 1,
 		wantWritten:  len(testRuleTextBlocked),
+	}, {
+		name:         "cosmetic_with_zwnj",
+		in:           testRuleTextCosmetic,
+		wantDst:      testRuleTextCosmetic,
+		wantErrMsg:   "",
+		wantTitle:    "",
+		wantRulesNum: 1,
+		wantWritten:  len(testRuleTextCosmetic),
 	}, {
 		name: "bad_char",
 		in: "! Title:  Test Title \n" +
@@ -85,7 +93,7 @@ func TestParser_Parse(t *testing.T) {
 		wantDst: testRuleTextBlocked,
 		wantErrMsg: "line 3: " +
 			"character 4: " +
-			"non-printable character '\\x7f'",
+			"likely binary character '\\x7f'",
 		wantTitle:    "Test Title",
 		wantRulesNum: 1,
 		wantWritten:  len(testRuleTextBlocked),
@@ -215,6 +223,14 @@ func BenchmarkParser_Parse(b *testing.B) {
 
 	require.NoError(b, errSink)
 	require.NotNil(b, resSink)
+
+	// Most recent result, on a ThinkPad X13 with a Ryzen Pro 7 CPU:
+	//
+	//	goos: linux
+	//	goarch: amd64
+	//	pkg: github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist
+	//	cpu: AMD Ryzen 7 PRO 4750U with Radeon Graphics
+	//	BenchmarkParser_Parse-16        100000000              128.0 ns/op            48 B/op          1 allocs/op
 }
 
 func FuzzParser_Parse(f *testing.F) {
@@ -226,15 +242,17 @@ func FuzzParser_Parse(f *testing.F) {
 		"! Comment",
 		"! Title ",
 		"! Title XXX",
+		testRuleTextBadTab,
+		testRuleTextBlocked,
+		testRuleTextCosmetic,
 		testRuleTextEtcHostsTab,
 		testRuleTextHTML,
-		testRuleTextBlocked,
-		testRuleTextBadTab,
 		"1.2.3.4",
 		"1.2.3.4 etc-hosts.example",
 		">>>\x00<<<",
 		">>>\x7F<<<",
-		strings.Repeat("a", n+1),
+		strings.Repeat("a", rulelist.DefaultRuleBufSize+1),
+		strings.Repeat("a", bufio.MaxScanTokenSize+1),
 	}
 
 	for _, tc := range testCases {
diff --git a/internal/filtering/rulelist/rulelist_test.go b/internal/filtering/rulelist/rulelist_test.go
index 0c3a3b84..aec6f33b 100644
--- a/internal/filtering/rulelist/rulelist_test.go
+++ b/internal/filtering/rulelist/rulelist_test.go
@@ -7,8 +7,13 @@ const testTimeout = 1 * time.Second
 
 // Common texts for tests.
 const (
-	testRuleTextHTML        = "<!DOCTYPE html>\n"
-	testRuleTextBlocked     = "||blocked.example^\n"
 	testRuleTextBadTab      = "||bad-tab-and-comment.example^\t# A comment.\n"
+	testRuleTextBlocked     = "||blocked.example^\n"
 	testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n"
+	testRuleTextHTML        = "<!DOCTYPE html>\n"
+
+	// testRuleTextCosmetic is a cosmetic rule with a zero-width non-joiner.
+	//
+	// See https://github.com/AdguardTeam/AdGuardHome/issues/6003.
+	testRuleTextCosmetic = "||cosmetic.example## :has-text(/\u200c/i)\n"
 )