Pull request 1921: 6003-relax-scan-limit

Updates #6003.

Squashed commit of the following:

commit 1cc42303c29edc621802fc182ccb5701e412f099
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 13:47:41 2023 +0300

    all: fix chlog

commit e835084c7aac6384ea7b0886e6b3b1d614438baa
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 13:40:45 2023 +0300

    rulelist: imp longer line handling
2023-07-13 13:57:32 +03:00 · 2023-07-13 13:57:32 +03:00 · f22d893845
parent de63eeabfa
commit f22d893845
5 changed files with 53 additions and 23 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,11 +23,18 @@ See also the [v0.107.35 GitHub milestone][ms-v0.107.35].
 NOTE: Add new changes BELOW THIS COMMENT.
 -->

+### Fixed
+
+- `bufio.Scanner: token too long` errors when trying to add filtering-rule lists
+  with lines over 1024 bytes long ([#6003]).
+
 ### Removed

 - Default exposure of the non-standard ports 784 and 8853 for DNS-over-QUIC in
  the `Dockerfile`.

+[#6003]: https://github.com/AdguardTeam/AdGuardHome/issues/6003
+
 <!--
 NOTE: Add new changes ABOVE THIS COMMENT.
 -->
--- a/internal/filtering/filtering.go
+++ b/internal/filtering/filtering.go
@@ -943,7 +943,7 @@ func New(c *Config, blockFilters []Filter) (d *DNSFilter, err error) {
 	d = &DNSFilter{
 		bufPool: &sync.Pool{
 			New: func() (buf any) {
-				bufVal := make([]byte, rulelist.MaxRuleLen)
+				bufVal := make([]byte, rulelist.DefaultRuleBufSize)

 				return &bufVal
 			},
--- a/internal/filtering/rulelist/parser.go
+++ b/internal/filtering/rulelist/parser.go
@@ -7,6 +7,7 @@ import (
 	"hash/crc32"
 	"io"
 	"unicode"
+	"unicode/utf8"

 	"github.com/AdguardTeam/golibs/errors"
 )
@@ -48,19 +49,29 @@ type ParseResult struct {
 // nil.
 func (p *Parser) Parse(dst io.Writer, src io.Reader, buf []byte) (r *ParseResult, err error) {
 	s := bufio.NewScanner(src)
-	s.Buffer(buf, MaxRuleLen)

-	lineIdx := 0
+	// Don't use [DefaultRuleBufSize] as the maximum size, since some
+	// filtering-rule lists compressed by e.g. HostlistsCompiler can have very
+	// large lines.  The buffer optimization still works for the more common
+	// case of reasonably-sized lines.
+	//
+	// See https://github.com/AdguardTeam/AdGuardHome/issues/6003.
+	s.Buffer(buf, bufio.MaxScanTokenSize)
+
+	// Use a one-based index for lines and columns, since these errors end up in
+	// the frontend, and users are more familiar with one-based line and column
+	// indexes.
+	lineNum := 1
 	for s.Scan() {
 		var n int
-		n, err = p.processLine(dst, s.Bytes(), lineIdx)
+		n, err = p.processLine(dst, s.Bytes(), lineNum)
 		p.written += n
 		if err != nil {
 			// Don't wrap the error, because it's informative enough as is.
 			return p.result(), err
 		}

-		lineIdx++
+		lineNum++
 	}

 	r = p.result()
@@ -81,7 +92,7 @@ func (p *Parser) result() (r *ParseResult) {

 // processLine processes a single line.  It may write to dst, and if it does, n
 // is the number of bytes written.
-func (p *Parser) processLine(dst io.Writer, line []byte, lineIdx int) (n int, err error) {
+func (p *Parser) processLine(dst io.Writer, line []byte, lineNum int) (n int, err error) {
 	trimmed := bytes.TrimSpace(line)
 	if p.written == 0 && isHTMLLine(trimmed) {
 		return 0, ErrHTML
@@ -94,10 +105,13 @@ func (p *Parser) processLine(dst io.Writer, line []byte, lineIdx int) (n int, er
 		badIdx, isRule = p.parseLineTitle(trimmed)
 	}
 	if badIdx != -1 {
+		badRune, _ := utf8.DecodeRune(trimmed[badIdx:])
+
 		return 0, fmt.Errorf(
-			"line at index %d: character at index %d: non-printable character",
-			lineIdx,
-			badIdx+bytes.Index(line, trimmed),
+			"line %d: character %d: non-printable character %q",
+			lineNum,
+			badIdx+bytes.Index(line, trimmed)+1,
+			badRune,
 		)
 	}

--- a/internal/filtering/rulelist/parser_test.go
+++ b/internal/filtering/rulelist/parser_test.go
@@ -17,6 +17,9 @@ import (
 func TestParser_Parse(t *testing.T) {
 	t.Parallel()

+	longRule := strings.Repeat("a", rulelist.DefaultRuleBufSize+1) + "\n"
+	tooLongRule := strings.Repeat("a", bufio.MaxScanTokenSize+1) + "\n"
+
 	testCases := []struct {
 		name         string
 		in           string
@@ -80,20 +83,28 @@ func TestParser_Parse(t *testing.T) {
 			testRuleTextBlocked +
 			">>>\x7F<<<",
 		wantDst: testRuleTextBlocked,
-		wantErrMsg: "line at index 2: " +
-			"character at index 3: " +
-			"non-printable character",
+		wantErrMsg: "line 3: " +
+			"character 4: " +
+			"non-printable character '\\x7f'",
 		wantTitle:    "Test Title",
 		wantRulesNum: 1,
 		wantWritten:  len(testRuleTextBlocked),
 	}, {
 		name:         "too_long",
-		in:           strings.Repeat("a", rulelist.MaxRuleLen+1),
+		in:           tooLongRule,
 		wantDst:      "",
-		wantErrMsg:   "scanning filter contents: " + bufio.ErrTooLong.Error(),
+		wantErrMsg:   "scanning filter contents: bufio.Scanner: token too long",
 		wantTitle:    "",
 		wantRulesNum: 0,
 		wantWritten:  0,
+	}, {
+		name:         "longer_than_default",
+		in:           longRule,
+		wantDst:      longRule,
+		wantErrMsg:   "",
+		wantTitle:    "",
+		wantRulesNum: 1,
+		wantWritten:  len(longRule),
 	}, {
 		name:         "bad_tab_and_comment",
 		in:           testRuleTextBadTab,
@@ -118,7 +129,7 @@ func TestParser_Parse(t *testing.T) {
 			t.Parallel()

 			dst := &bytes.Buffer{}
-			buf := make([]byte, rulelist.MaxRuleLen)
+			buf := make([]byte, rulelist.DefaultRuleBufSize)

 			p := rulelist.NewParser()
 			r, err := p.Parse(dst, strings.NewReader(tc.in), buf)
@@ -145,7 +156,7 @@ func TestParser_Parse_writeError(t *testing.T) {
 			return 1, errors.Error("test error")
 		},
 	}
-	buf := make([]byte, rulelist.MaxRuleLen)
+	buf := make([]byte, rulelist.DefaultRuleBufSize)

 	p := rulelist.NewParser()
 	r, err := p.Parse(dst, strings.NewReader(testRuleTextBlocked), buf)
@@ -165,7 +176,7 @@ func TestParser_Parse_checksums(t *testing.T) {
 			"# Another comment.\n"
 	)

-	buf := make([]byte, rulelist.MaxRuleLen)
+	buf := make([]byte, rulelist.DefaultRuleBufSize)

 	p := rulelist.NewParser()
 	r, err := p.Parse(&bytes.Buffer{}, strings.NewReader(withoutComments), buf)
@@ -192,7 +203,7 @@ var (
 func BenchmarkParser_Parse(b *testing.B) {
 	dst := &bytes.Buffer{}
 	src := strings.NewReader(strings.Repeat(testRuleTextBlocked, 1000))
-	buf := make([]byte, rulelist.MaxRuleLen)
+	buf := make([]byte, rulelist.DefaultRuleBufSize)
 	p := rulelist.NewParser()

 	b.ReportAllocs()
--- a/internal/filtering/rulelist/rulelist.go
+++ b/internal/filtering/rulelist/rulelist.go
@@ -4,8 +4,6 @@
 // TODO(a.garipov): Expand.
 package rulelist

-// MaxRuleLen is the maximum length of a line with a filtering rule, in bytes.
-//
-// TODO(a.garipov): Consider changing this to a rune length, like AdGuardDNS
-// does.
-const MaxRuleLen = 1024
+// DefaultRuleBufSize is the default length of a buffer used to read a line with
+// a filtering rule, in bytes.
+const DefaultRuleBufSize = 1024