AdGuardHome/internal/filtering/rulelist/parser.go

192 lines
5.1 KiB
Go

package rulelist
import (
"bufio"
"bytes"
"fmt"
"hash/crc32"
"io"
"slices"
"github.com/AdguardTeam/golibs/errors"
)
// Parser is a filtering-rule parser that collects data, such as the checksum
// and the title, as well as counts rules and removes comments.
//
// The fields below accumulate state while lines are processed; the values are
// exposed to callers through [ParseResult].
type Parser struct {
	// title is the list title taken from a "! Title: " comment line, if one
	// has been seen.
	title string

	// rulesCount is the number of lines accepted as rules so far.
	rulesCount int

	// written is the running total of bytes written to the destination writer.
	written int

	// checksum is the running CRC-32 (IEEE) checksum of the accepted rule
	// lines.
	checksum uint32

	// titleFound is true once a title line has been found. After that, lines
	// are parsed without looking for a title.
	titleFound bool
}
// NewParser creates a filtering-rule parser in its initial state: nothing
// counted, nothing written, and no title found.
func NewParser() (p *Parser) {
	p = new(Parser)

	return p
}
// ParseResult contains information about the results of parsing a
// filtering-rule list by [Parser.Parse].
type ParseResult struct {
	// Title is the title contained within the filtering-rule list, if any. It
	// is empty if no title line has been found.
	Title string

	// RulesCount is the number of rules in the list. It excludes empty lines
	// and comments.
	RulesCount int

	// BytesWritten is the number of bytes written to dst. This includes the
	// newline that is written after every rule.
	BytesWritten int

	// Checksum is the CRC-32 checksum of the rules content. That is, excluding
	// empty lines and comments. It is computed with the IEEE polynomial over
	// the whitespace-trimmed rule lines.
	Checksum uint32
}
// Parse reads src line by line and copies the lines that are rules into dst,
// dropping empty lines and comments. buf is handed to the scanner as its
// initial buffer. r is never nil, even when err is not nil; in that case it
// describes what has been processed before the failure.
func (p *Parser) Parse(dst io.Writer, src io.Reader, buf []byte) (r *ParseResult, err error) {
	sc := bufio.NewScanner(src)

	// The maximum token size is deliberately not [DefaultRuleBufSize]: lists
	// compressed by tools like HostlistsCompiler may contain very long lines.
	// Reasonably-sized lines, which are the common case, still benefit from
	// the preallocated buffer.
	//
	// See https://github.com/AdguardTeam/AdGuardHome/issues/6003.
	sc.Buffer(buf, bufio.MaxScanTokenSize)

	// Line numbers start at one, because they are reported in errors that are
	// shown to users in the frontend, where one-based line and column indexes
	// are the familiar convention.
	for lineNum := 1; sc.Scan(); lineNum++ {
		written, lineErr := p.processLine(dst, sc.Bytes(), lineNum)
		p.written += written
		if lineErr != nil {
			// The error already carries enough context, so return it
			// unwrapped.
			return p.result(), lineErr
		}
	}

	return p.result(), errors.Annotate(sc.Err(), "scanning filter contents: %w")
}
// result builds a new [ParseResult] snapshot from the data the parser has
// accumulated so far.
func (p *Parser) result() (r *ParseResult) {
	r = new(ParseResult)
	r.Title = p.title
	r.RulesCount = p.rulesCount
	r.BytesWritten = p.written
	r.Checksum = p.checksum

	return r
}
// processLine handles one line of the list. lineNum is only used in error
// messages. If the line is a rule, it is written to dst followed by a newline,
// and n is the number of bytes written; otherwise n is zero.
//
// It returns ErrHTML if nothing has been written yet and the line looks like
// HTML, and a descriptive error if the line contains a character that is
// likely to come from a binary file.
func (p *Parser) processLine(dst io.Writer, line []byte, lineNum int) (n int, err error) {
	rule := bytes.TrimSpace(line)
	if p.written == 0 && isHTMLLine(rule) {
		return 0, ErrHTML
	}

	var (
		binIdx int
		ok     bool
	)

	// Only look for the title until one has been found.
	if !p.titleFound {
		binIdx, ok = p.parseLineTitle(rule)
	} else {
		binIdx, ok = parseLine(rule)
	}

	switch {
	case binIdx != -1:
		// Report a one-based column relative to the original, untrimmed line.
		col := bytes.Index(line, rule) + binIdx + 1

		return 0, fmt.Errorf(
			"line %d: character %d: likely binary character %q",
			lineNum,
			col,
			rule[binIdx],
		)
	case !ok:
		// An empty line or a comment; nothing to write.
		return 0, nil
	}

	p.rulesCount++
	p.checksum = crc32.Update(p.checksum, crc32.IEEETable, rule)

	// The newline is appended in place on the assumption that the underlying
	// buffer generally has room for one more byte after the trimmed line.
	n, err = dst.Write(append(rule, '\n'))

	return n, errors.Annotate(err, "writing rule line: %w")
}
// isHTMLLine reports whether line looks like the beginning of an HTML
// document, that is, whether it starts with "<html" or "<!doctype" in any
// letter case. line must already be trimmed of whitespace.
func isHTMLLine(line []byte) (isHTML bool) {
	for _, prefix := range [...]string{"<html", "<!doctype"} {
		if hasPrefixFold(line, []byte(prefix)) {
			return true
		}
	}

	return false
}
// hasPrefixFold reports whether b starts with prefix, ignoring letter case. It
// is a best-effort matcher: it compares the first len(prefix) bytes of b, so
// the result may be wrong for some non-ASCII characters.
func hasPrefixFold(b, prefix []byte) (ok bool) {
	if len(b) < len(prefix) {
		return false
	}

	return bytes.EqualFold(b[:len(prefix)], prefix)
}
// parseLine classifies a single line, which must already be trimmed of
// whitespace. isRule is true if the line is a filtering rule. badIdx is the
// index of the first byte suggesting that the data is binary, or -1 if there
// is no such byte.
//
// Empty lines and comments, which start with '#' or '!', are never rules and
// are not checked for binary data. Any other line is a rule only if none of
// its bytes looks binary.
func parseLine(line []byte) (badIdx int, isRule bool) {
	if len(line) == 0 {
		return -1, false
	}

	switch line[0] {
	case '#', '!':
		return -1, false
	}

	badIdx = slices.IndexFunc(line, likelyBinary)
	isRule = badIdx == -1

	return badIdx, isRule
}
// likelyBinary reports whether b is a byte that is unlikely to appear in a
// text file: an ASCII control character below the space or the DEL character
// 0x7f. Tab, line feed, and carriage return are considered text.
func likelyBinary(b byte) (ok bool) {
	switch b {
	case '\t', '\n', '\r':
		return false
	case 0x7f:
		return true
	default:
		return b < ' '
	}
}
// parseLineTitle does the same classification as [parseLine] and, in addition,
// recognizes the title comment, which starts with "! Title: ". When such a
// line is found, the title is stored in p and the search for a title is marked
// as finished. line must already be trimmed of whitespace.
func (p *Parser) parseLineTitle(line []byte) (badIdx int, isRule bool) {
	const titlePattern = "! Title: "

	switch {
	case len(line) == 0 || line[0] == '#':
		// An empty line or a hash comment.
		return -1, false
	case line[0] != '!':
		// Not a comment, so it is a rule unless it contains binary data.
		badIdx = slices.IndexFunc(line, likelyBinary)

		return badIdx, badIdx == -1
	case !bytes.HasPrefix(line, []byte(titlePattern)):
		// An exclamation-mark comment that is not a title.
		return -1, false
	}

	// Only a nil result is rejected here. An empty but non-nil title is
	// accepted as a title and ends the search as well.
	if title := bytes.TrimSpace(line[len(titlePattern):]); title != nil {
		p.title = string(title)
		p.titleFound = true
	}

	return -1, false
}