Pull request 1921: 6003-relax-scan-limit

Updates #6003.

Squashed commit of the following:

commit 1cc42303c29edc621802fc182ccb5701e412f099
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 13:47:41 2023 +0300

    all: fix chlog

commit e835084c7aac6384ea7b0886e6b3b1d614438baa
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 13:40:45 2023 +0300

    rulelist: imp longer line handling
This commit is contained in:
Ainar Garipov 2023-07-13 13:57:32 +03:00
parent de63eeabfa
commit f22d893845
5 changed files with 53 additions and 23 deletions

View File

@@ -23,11 +23,18 @@ See also the [v0.107.35 GitHub milestone][ms-v0.107.35].
NOTE: Add new changes BELOW THIS COMMENT.
-->
### Fixed
- `bufio.Scanner: token too long` errors when trying to add filtering-rule lists
with lines over 1024 bytes long ([#6003]).
### Removed
- Default exposure of the non-standard ports 784 and 8853 for DNS-over-QUIC in
  the `Dockerfile`.
[#6003]: https://github.com/AdguardTeam/AdGuardHome/issues/6003
<!--
NOTE: Add new changes ABOVE THIS COMMENT.
-->

View File

@@ -943,7 +943,7 @@ func New(c *Config, blockFilters []Filter) (d *DNSFilter, err error) {
d = &DNSFilter{ d = &DNSFilter{
bufPool: &sync.Pool{ bufPool: &sync.Pool{
New: func() (buf any) { New: func() (buf any) {
bufVal := make([]byte, rulelist.MaxRuleLen) bufVal := make([]byte, rulelist.DefaultRuleBufSize)
return &bufVal return &bufVal
}, },

View File

@@ -7,6 +7,7 @@ import (
"hash/crc32" "hash/crc32"
"io" "io"
"unicode" "unicode"
"unicode/utf8"
"github.com/AdguardTeam/golibs/errors" "github.com/AdguardTeam/golibs/errors"
) )
@@ -48,19 +49,29 @@ type ParseResult struct {
// nil. // nil.
func (p *Parser) Parse(dst io.Writer, src io.Reader, buf []byte) (r *ParseResult, err error) { func (p *Parser) Parse(dst io.Writer, src io.Reader, buf []byte) (r *ParseResult, err error) {
s := bufio.NewScanner(src) s := bufio.NewScanner(src)
s.Buffer(buf, MaxRuleLen)
lineIdx := 0 // Don't use [DefaultRuleBufSize] as the maximum size, since some
// filtering-rule lists compressed by e.g. HostlistsCompiler can have very
// large lines. The buffer optimization still works for the more common
// case of reasonably-sized lines.
//
// See https://github.com/AdguardTeam/AdGuardHome/issues/6003.
s.Buffer(buf, bufio.MaxScanTokenSize)
// Use a one-based index for lines and columns, since these errors end up in
// the frontend, and users are more familiar with one-based line and column
// indexes.
lineNum := 1
for s.Scan() { for s.Scan() {
var n int var n int
n, err = p.processLine(dst, s.Bytes(), lineIdx) n, err = p.processLine(dst, s.Bytes(), lineNum)
p.written += n p.written += n
if err != nil { if err != nil {
// Don't wrap the error, because it's informative enough as is. // Don't wrap the error, because it's informative enough as is.
return p.result(), err return p.result(), err
} }
lineIdx++ lineNum++
} }
r = p.result() r = p.result()
@@ -81,7 +92,7 @@ func (p *Parser) result() (r *ParseResult) {
// processLine processes a single line. It may write to dst, and if it does, n // processLine processes a single line. It may write to dst, and if it does, n
// is the number of bytes written. // is the number of bytes written.
func (p *Parser) processLine(dst io.Writer, line []byte, lineIdx int) (n int, err error) { func (p *Parser) processLine(dst io.Writer, line []byte, lineNum int) (n int, err error) {
trimmed := bytes.TrimSpace(line) trimmed := bytes.TrimSpace(line)
if p.written == 0 && isHTMLLine(trimmed) { if p.written == 0 && isHTMLLine(trimmed) {
return 0, ErrHTML return 0, ErrHTML
@@ -94,10 +105,13 @@ func (p *Parser) processLine(dst io.Writer, line []byte, lineIdx int) (n int, er
badIdx, isRule = p.parseLineTitle(trimmed) badIdx, isRule = p.parseLineTitle(trimmed)
} }
if badIdx != -1 { if badIdx != -1 {
badRune, _ := utf8.DecodeRune(trimmed[badIdx:])
return 0, fmt.Errorf( return 0, fmt.Errorf(
"line at index %d: character at index %d: non-printable character", "line %d: character %d: non-printable character %q",
lineIdx, lineNum,
badIdx+bytes.Index(line, trimmed), badIdx+bytes.Index(line, trimmed)+1,
badRune,
) )
} }

View File

@@ -17,6 +17,9 @@ import (
func TestParser_Parse(t *testing.T) { func TestParser_Parse(t *testing.T) {
t.Parallel() t.Parallel()
longRule := strings.Repeat("a", rulelist.DefaultRuleBufSize+1) + "\n"
tooLongRule := strings.Repeat("a", bufio.MaxScanTokenSize+1) + "\n"
testCases := []struct { testCases := []struct {
name string name string
in string in string
@@ -80,20 +83,28 @@
testRuleTextBlocked + testRuleTextBlocked +
">>>\x7F<<<", ">>>\x7F<<<",
wantDst: testRuleTextBlocked, wantDst: testRuleTextBlocked,
wantErrMsg: "line at index 2: " + wantErrMsg: "line 3: " +
"character at index 3: " + "character 4: " +
"non-printable character", "non-printable character '\\x7f'",
wantTitle: "Test Title", wantTitle: "Test Title",
wantRulesNum: 1, wantRulesNum: 1,
wantWritten: len(testRuleTextBlocked), wantWritten: len(testRuleTextBlocked),
}, { }, {
name: "too_long", name: "too_long",
in: strings.Repeat("a", rulelist.MaxRuleLen+1), in: tooLongRule,
wantDst: "", wantDst: "",
wantErrMsg: "scanning filter contents: " + bufio.ErrTooLong.Error(), wantErrMsg: "scanning filter contents: bufio.Scanner: token too long",
wantTitle: "", wantTitle: "",
wantRulesNum: 0, wantRulesNum: 0,
wantWritten: 0, wantWritten: 0,
}, {
name: "longer_than_default",
in: longRule,
wantDst: longRule,
wantErrMsg: "",
wantTitle: "",
wantRulesNum: 1,
wantWritten: len(longRule),
}, { }, {
name: "bad_tab_and_comment", name: "bad_tab_and_comment",
in: testRuleTextBadTab, in: testRuleTextBadTab,
@@ -118,7 +129,7 @@ func TestParser_Parse(t *testing.T) {
t.Parallel() t.Parallel()
dst := &bytes.Buffer{} dst := &bytes.Buffer{}
buf := make([]byte, rulelist.MaxRuleLen) buf := make([]byte, rulelist.DefaultRuleBufSize)
p := rulelist.NewParser() p := rulelist.NewParser()
r, err := p.Parse(dst, strings.NewReader(tc.in), buf) r, err := p.Parse(dst, strings.NewReader(tc.in), buf)
@@ -145,7 +156,7 @@ func TestParser_Parse_writeError(t *testing.T) {
return 1, errors.Error("test error") return 1, errors.Error("test error")
}, },
} }
buf := make([]byte, rulelist.MaxRuleLen) buf := make([]byte, rulelist.DefaultRuleBufSize)
p := rulelist.NewParser() p := rulelist.NewParser()
r, err := p.Parse(dst, strings.NewReader(testRuleTextBlocked), buf) r, err := p.Parse(dst, strings.NewReader(testRuleTextBlocked), buf)
@@ -165,7 +176,7 @@ func TestParser_Parse_checksums(t *testing.T) {
"# Another comment.\n" "# Another comment.\n"
) )
buf := make([]byte, rulelist.MaxRuleLen) buf := make([]byte, rulelist.DefaultRuleBufSize)
p := rulelist.NewParser() p := rulelist.NewParser()
r, err := p.Parse(&bytes.Buffer{}, strings.NewReader(withoutComments), buf) r, err := p.Parse(&bytes.Buffer{}, strings.NewReader(withoutComments), buf)
@@ -192,7 +203,7 @@ var (
func BenchmarkParser_Parse(b *testing.B) { func BenchmarkParser_Parse(b *testing.B) {
dst := &bytes.Buffer{} dst := &bytes.Buffer{}
src := strings.NewReader(strings.Repeat(testRuleTextBlocked, 1000)) src := strings.NewReader(strings.Repeat(testRuleTextBlocked, 1000))
buf := make([]byte, rulelist.MaxRuleLen) buf := make([]byte, rulelist.DefaultRuleBufSize)
p := rulelist.NewParser() p := rulelist.NewParser()
b.ReportAllocs() b.ReportAllocs()

View File

@@ -4,8 +4,6 @@
// TODO(a.garipov): Expand. // TODO(a.garipov): Expand.
package rulelist package rulelist
// MaxRuleLen is the maximum length of a line with a filtering rule, in bytes. // DefaultRuleBufSize is the default length of a buffer used to read a line with
// // a filtering rule, in bytes.
// TODO(a.garipov): Consider changing this to a rune length, like AdGuardDNS const DefaultRuleBufSize = 1024
// does.
const MaxRuleLen = 1024