AdGuardHome/internal/filtering/rulelist/filter.go

305 lines
7.9 KiB
Go

package rulelist
import (
"bytes"
"context"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"time"
"github.com/AdguardTeam/AdGuardHome/internal/aghrenameio"
"github.com/AdguardTeam/golibs/errors"
"github.com/AdguardTeam/golibs/ioutil"
"github.com/AdguardTeam/urlfilter/filterlist"
"github.com/c2h5oh/datasize"
)
// Filter contains information about a single rule-list filter. A value
// returned by [NewFilter] only has its configuration fields (url, name, uid,
// urlFilterID, and enabled) set; the remaining fields are filled in by
// [Filter.Refresh].
//
// TODO(a.garipov): Use.
type Filter struct {
// url is the URL of this rule list. Supported schemes are:
// - http
// - https
// - file
//
// [NewFilter] rejects a nil URL as well as any other scheme.
url *url.URL
// ruleList is the last successfully compiled [filterlist.RuleList]. It is
// not set by [NewFilter], so it is nil until a refresh succeeds.
ruleList filterlist.RuleList
// updated is the time of the last successful update. It is set by
// [Filter.Refresh] when it records new rule-list data.
updated time.Time
// name is the human-readable name of this rule-list filter. If it is empty
// when a refresh records new data, it is replaced by the title found in the
// data or, failing that, by a synthetic name built from uid.
name string
// uid is the unique ID of this rule-list filter. It is also used to build
// the name of the cache file and the synthetic name.
uid UID
// urlFilterID is used for working with package urlfilter. It is passed as
// the ID of the rule lists created for this filter.
urlFilterID URLFilterID
// rulesCount contains the number of rules in this rule-list filter, as
// reported by the parser during the last recorded refresh.
rulesCount int
// checksum is a CRC32 hash used to quickly check if the rules within a list
// file have changed. [Filter.Refresh] compares it with the checksum of the
// freshly parsed data.
checksum uint32
// enabled, if true, means that this rule-list filter is used for filtering.
enabled bool
}
// FilterConfig contains the configuration for a [Filter]. [NewFilter] copies
// every field into the filter it creates.
type FilterConfig struct {
// URL is the URL of this rule-list filter. Supported schemes are:
// - http
// - https
// - file
//
// It must not be nil; [NewFilter] returns an error for a nil URL or for any
// other scheme.
URL *url.URL
// Name is the human-readable name of this rule-list filter. If not set, it
// is either taken from the rule-list data or generated synthetically from
// the UID.
Name string
// UID is the unique ID of this rule-list filter.
UID UID
// URLFilterID is used for working with package urlfilter.
URLFilterID URLFilterID
// Enabled, if true, means that this rule-list filter is used for filtering.
Enabled bool
}
// NewFilter returns a new rule-list filter built from c, which must not be
// nil. An error is returned if c.URL is nil or if its scheme is anything
// other than "http", "https", or "file". The returned filter holds no rule
// data yet, so a refresh should be performed before use.
func NewFilter(c *FilterConfig) (f *Filter, err error) {
	u := c.URL
	if u == nil {
		return nil, errors.Error("no url")
	}

	// Only these schemes are understood by Refresh.
	if scheme := u.Scheme; scheme != "http" && scheme != "https" && scheme != "file" {
		return nil, fmt.Errorf("bad url scheme: %q", scheme)
	}

	f = &Filter{
		url:         u,
		name:        c.Name,
		uid:         c.UID,
		urlFilterID: c.URLFilterID,
		enabled:     c.Enabled,
	}

	return f, nil
}
// Refresh updates the data in the rule-list filter from its URL and caches the
// parsed rules in cacheDir, in a file named after the filter's UID. parseBuf
// is the initial buffer used to parse information from the data. cli and
// maxSize are only used when f is an HTTP(S)-based list.
//
// The checksum, rule count, name, and update time are recorded on the first
// successful refresh and, after that, whenever the checksum of the data
// changes. If an error is returned, parseRes is nil and these fields are left
// as they were.
//
// TODO(a.garipov): Unexport and test in an internal test or through engine
// tests.
//
// TODO(a.garipov): Consider not returning parseRes.
func (f *Filter) Refresh(
	ctx context.Context,
	parseBuf []byte,
	cli *http.Client,
	cacheDir string,
	maxSize datasize.ByteSize,
) (parseRes *ParseResult, err error) {
	cachePath := filepath.Join(cacheDir, f.uid.String()+".txt")

	switch s := f.url.Scheme; s {
	case "http", "https":
		parseRes, err = f.setFromHTTP(ctx, parseBuf, cli, cachePath, maxSize.Bytes())
	case "file":
		parseRes, err = f.setFromFile(parseBuf, f.url.Path, cachePath)
	default:
		// Since the URL has been prevalidated in NewFilter, consider this a
		// programmer error.
		panic(fmt.Errorf("bad url scheme: %q", s))
	}
	if err != nil {
		// Don't wrap the error, because it's informative enough as is.
		return nil, err
	}

	// A filter that has never been refreshed has a zero checksum, which is
	// also a valid checksum of real data, for example the CRC32 of empty
	// output. Use the zero update time to detect the first refresh, so that
	// such data is still recorded and the filter still gets its name.
	firstRefresh := f.updated.IsZero()
	if firstRefresh || f.checksum != parseRes.Checksum {
		f.checksum = parseRes.Checksum
		f.rulesCount = parseRes.RulesCount
		f.setName(parseRes.Title)
		f.updated = time.Now()
	}

	return parseRes, nil
}
// setFromHTTP downloads the rule-list data from the filter's URL by means of
// readFromHTTP, which also caches it at cachePath, and installs the resulting
// text as the filter's rule list. Any returned error is annotated with
// "setting from http".
func (f *Filter) setFromHTTP(
	ctx context.Context,
	parseBuf []byte,
	cli *http.Client,
	cachePath string,
	maxSize uint64,
) (parseRes *ParseResult, err error) {
	defer func() { err = errors.Annotate(err, "setting from http: %w") }()

	var rulesText string
	rulesText, parseRes, err = f.readFromHTTP(ctx, parseBuf, cli, cachePath, maxSize)
	if err != nil {
		// The annotation added by the deferred call is enough context, so
		// return the error as is.
		return nil, err
	}

	// TODO(a.garipov): Add filterlist.BytesRuleList.
	f.ruleList = &filterlist.StringRuleList{
		ID:             f.urlFilterID,
		RulesText:      rulesText,
		IgnoreCosmetic: true,
	}

	return parseRes, nil
}
// readFromHTTP requests the filter's URL using cli and runs the response body,
// limited to maxSize bytes, through a parser. The parser output is written
// both to a pending file for cachePath and to an in-memory buffer, the
// contents of which are returned as text. Per the parser's contract, the
// output is free from comments, unnecessary whitespace, etc.
//
// A response with a status other than 200 OK is treated as an error.
func (f *Filter) readFromHTTP(
	ctx context.Context,
	parseBuf []byte,
	cli *http.Client,
	cachePath string,
	maxSize uint64,
) (text string, parseRes *ParseResult, err error) {
	rawURL := f.url.String()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", nil, fmt.Errorf("making request for http url %q: %w", rawURL, err)
	}

	resp, err := cli.Do(req)
	if err != nil {
		return "", nil, fmt.Errorf("requesting from http url: %w", err)
	}
	// Report a failure to close the body together with the main error.
	defer func() { err = errors.WithDeferred(err, resp.Body.Close()) }()

	// TODO(a.garipov): Use [agdhttp.CheckStatus] when it's moved to golibs.
	if code := resp.StatusCode; code != http.StatusOK {
		return "", nil, fmt.Errorf("got status code %d, want %d", code, http.StatusOK)
	}

	cacheFile, err := aghrenameio.NewPendingFile(cachePath, 0o644)
	if err != nil {
		return "", nil, fmt.Errorf("creating temp file: %w", err)
	}
	// Runs before the body is closed, since deferred calls run in reverse
	// order. The pending file is handed over together with the final error.
	defer func() { err = aghrenameio.WithDeferredCleanup(err, cacheFile) }()

	// Duplicate the parser output into memory and into the cache file.
	var rules bytes.Buffer
	dst := io.MultiWriter(&rules, cacheFile)
	body := ioutil.LimitReader(resp.Body, maxSize)

	parseRes, err = NewParser().Parse(dst, body, parseBuf)
	if err != nil {
		return "", nil, fmt.Errorf("parsing response from http url %q: %w", rawURL, err)
	}

	return rules.String(), parseRes, nil
}
// setName makes sure that the filter has a name. An already-present name is
// kept as is. Otherwise, the name becomes title, the title from the
// rule-list data, or, if title is empty as well, a synthetic name built from
// the filter's UID.
func (f *Filter) setName(title string) {
	switch {
	case f.name != "":
		// The filter already has a name, so leave it alone.
	case title != "":
		f.name = title
	default:
		f.name = fmt.Sprintf("List %s", f.uid)
	}
}
// setFromFile parses the file at filePath into the cache file at cachePath,
// closes the rule list the filter currently holds, and replaces it with a
// file rule list opened on the cache file. Any returned error is annotated
// with "setting from file".
//
// Note that if opening the new rule list fails, the old one has already been
// closed at that point.
//
// TODO(a.garipov): Retest on Windows once rule-list updater is committed. See
// if calling Close is necessary here.
func (f *Filter) setFromFile(
	parseBuf []byte,
	filePath string,
	cachePath string,
) (parseRes *ParseResult, err error) {
	defer func() { err = errors.Annotate(err, "setting from file: %w") }()

	if parseRes, err = parseIntoCache(parseBuf, filePath, cachePath); err != nil {
		// The annotation added by the deferred call is enough context, so
		// return the error as is.
		return nil, err
	}

	if err = f.Close(); err != nil {
		return nil, fmt.Errorf("closing old rule list: %w", err)
	}

	newList, err := filterlist.NewFileRuleList(f.urlFilterID, cachePath, true)
	if err != nil {
		return nil, fmt.Errorf("opening new rule list: %w", err)
	}

	f.ruleList = newList

	return parseRes, nil
}
// parseIntoCache runs the contents of the file at filePath through a parser
// and writes the parser output to a pending file created for cachePath.
// parseBuf is passed to the parser as its initial buffer. On return, the
// pending file is handed to [aghrenameio.WithDeferredCleanup] together with
// the resulting error.
func parseIntoCache(
	parseBuf []byte,
	filePath string,
	cachePath string,
) (parseRes *ParseResult, err error) {
	pending, err := aghrenameio.NewPendingFile(cachePath, 0o644)
	if err != nil {
		return nil, fmt.Errorf("creating temp file: %w", err)
	}
	defer func() { err = aghrenameio.WithDeferredCleanup(err, pending) }()

	// #nosec G304 -- filePath is supplied by the caller; in this file it is
	// the path of the filter's configured file URL.
	src, err := os.Open(filePath)
	if err != nil {
		return nil, fmt.Errorf("opening src file: %w", err)
	}
	// Runs before the pending-file cleanup, since deferred calls run in
	// reverse order.
	defer func() { err = errors.WithDeferred(err, src.Close()) }()

	parseRes, err = NewParser().Parse(pending, src, parseBuf)
	if err != nil {
		return nil, fmt.Errorf("copying src file: %w", err)
	}

	return parseRes, nil
}
// Close closes the rule list currently held by the filter and returns the
// error from closing it. If no rule list has been set, Close does nothing
// and returns nil.
func (f *Filter) Close() (err error) {
	if rl := f.ruleList; rl != nil {
		return rl.Close()
	}

	return nil
}