diff --git a/go.mod b/go.mod index 990ef309..3c4ac5a1 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/NYTimes/gziphandler v1.1.1 github.com/ameshkov/dnscrypt/v2 v2.2.7 github.com/bluele/gcache v0.0.2 + github.com/c2h5oh/datasize v0.0.0-20220606134207-859f65c6625b github.com/digineo/go-ipset/v2 v2.2.1 github.com/dimfeld/httptreemux/v5 v5.5.0 github.com/fsnotify/fsnotify v1.7.0 @@ -16,7 +17,7 @@ require ( github.com/google/go-cmp v0.6.0 github.com/google/gopacket v1.1.19 github.com/google/renameio/v2 v2.0.0 - github.com/google/uuid v1.4.0 + github.com/google/uuid v1.5.0 github.com/insomniacslk/dhcp v0.0.0-20231206064809-8c70d406f6d2 github.com/josharian/native v1.1.1-0.20230202152459-5c7d0dd6ab86 github.com/kardianos/service v1.2.2 diff --git a/go.sum b/go.sum index d33b4e08..0c53e35e 100644 --- a/go.sum +++ b/go.sum @@ -18,6 +18,8 @@ github.com/beefsack/go-rate v0.0.0-20220214233405-116f4ca011a0 h1:0b2vaepXIfMsG+ github.com/beefsack/go-rate v0.0.0-20220214233405-116f4ca011a0/go.mod h1:6YNgTHLutezwnBvyneBbwvB8C82y3dcoOj5EQJIdGXA= github.com/bluele/gcache v0.0.2 h1:WcbfdXICg7G/DGBh1PFfcirkWOQV+v077yF1pSy3DGw= github.com/bluele/gcache v0.0.2/go.mod h1:m15KV+ECjptwSPxKhOhQoAFQVtUFjTVkc3H8o0t/fp0= +github.com/c2h5oh/datasize v0.0.0-20220606134207-859f65c6625b h1:6+ZFm0flnudZzdSE0JxlhR2hKnGPcNB35BjQf4RYQDY= +github.com/c2h5oh/datasize v0.0.0-20220606134207-859f65c6625b/go.mod h1:S/7n9copUssQ56c7aAgHqftWO4LTf4xY6CGWt8Bc+3M= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -46,8 +48,8 @@ github.com/google/pprof v0.0.0-20231205033806-a5a03c77bf08/go.mod h1:czg5+yv1E0Z github.com/google/renameio/v2 v2.0.0 h1:UifI23ZTGY8Tt29JbYFiuyIU3eX+RNFtUwefq9qAhxg= github.com/google/renameio/v2 v2.0.0/go.mod h1:BtmJXm5YlszgC+TD4HOEEUFgkJP3nLxehU6hfe7jRt4= 
github.com/google/uuid v1.2.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4= -github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= +github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hugelgupf/socketpair v0.0.0-20190730060125-05d35a94e714 h1:/jC7qQFrv8CrSJVmaolDVOxTfS9kc36uB6H40kdbQq8= github.com/insomniacslk/dhcp v0.0.0-20231206064809-8c70d406f6d2 h1:9K06NfxkBh25x56yVhWWlKFE8YpicaSfHwoV8SFbueA= github.com/insomniacslk/dhcp v0.0.0-20231206064809-8c70d406f6d2/go.mod h1:3A9PQ1cunSDF/1rbTq99Ts4pVnycWg+vlPkfeD2NLFI= diff --git a/internal/filtering/http.go b/internal/filtering/http.go index ca6b8cf9..362ef0e4 100644 --- a/internal/filtering/http.go +++ b/internal/filtering/http.go @@ -24,23 +24,25 @@ func validateFilterURL(urlStr string) (err error) { if filepath.IsAbs(urlStr) { _, err = os.Stat(urlStr) - if err != nil { - // Don't wrap the error since it's informative enough as is. - return err - } - return nil + // Don't wrap the error since it's informative enough as is. + return err } u, err := url.ParseRequestURI(urlStr) if err != nil { // Don't wrap the error since it's informative enough as is. 
return err - } else if s := u.Scheme; s != aghhttp.SchemeHTTP && s != aghhttp.SchemeHTTPS { + } + + if s := u.Scheme; s != aghhttp.SchemeHTTP && s != aghhttp.SchemeHTTPS { return &url.Error{ Op: "Check scheme", URL: urlStr, - Err: fmt.Errorf("only %v allowed", []string{aghhttp.SchemeHTTP, aghhttp.SchemeHTTPS}), + Err: fmt.Errorf("only %v allowed", []string{ + aghhttp.SchemeHTTP, + aghhttp.SchemeHTTPS, + }), } } diff --git a/internal/filtering/rulelist/filter.go b/internal/filtering/rulelist/filter.go new file mode 100644 index 00000000..278eef5c --- /dev/null +++ b/internal/filtering/rulelist/filter.go @@ -0,0 +1,338 @@ +package rulelist + +import ( + "bytes" + "context" + "fmt" + "io" + "net/http" + "net/url" + "os" + "path/filepath" + "time" + + "github.com/AdguardTeam/AdGuardHome/internal/aghrenameio" + "github.com/AdguardTeam/golibs/errors" + "github.com/AdguardTeam/golibs/ioutil" + "github.com/AdguardTeam/golibs/log" + "github.com/AdguardTeam/urlfilter/filterlist" + "github.com/c2h5oh/datasize" +) + +// Filter contains information about a single rule-list filter. +// +// TODO(a.garipov): Use. +type Filter struct { + // url is the URL of this rule list. Supported schemes are: + // - http + // - https + // - file + url *url.URL + + // ruleList is the last successfully compiled [filterlist.RuleList]. + ruleList filterlist.RuleList + + // updated is the time of the last successful update. + updated time.Time + + // name is the human-readable name of this rule-list filter. + name string + + // uid is the unique ID of this rule-list filter. + uid UID + + // urlFilterID is used for working with package urlfilter. + urlFilterID URLFilterID + + // rulesCount contains the number of rules in this rule-list filter. + rulesCount int + + // checksum is a CRC32 hash used to quickly check if the rules within a list + // file have changed. + checksum uint32 + + // enabled, if true, means that this rule-list filter is used for filtering. 
+ // + // TODO(a.garipov): Take into account. + enabled bool +} + +// FilterConfig contains the configuration for a [Filter]. +type FilterConfig struct { + // URL is the URL of this rule-list filter. Supported schemes are: + // - http + // - https + // - file + URL *url.URL + + // Name is the human-readable name of this rule-list filter. If not set, it + // is either taken from the rule-list data or generated synthetically from + // the UID. + Name string + + // UID is the unique ID of this rule-list filter. + UID UID + + // URLFilterID is used for working with package urlfilter. + URLFilterID URLFilterID + + // Enabled, if true, means that this rule-list filter is used for filtering. + Enabled bool +} + +// NewFilter creates a new rule-list filter. The filter is not refreshed, so a +// refresh should be performed before use. +func NewFilter(c *FilterConfig) (f *Filter, err error) { + if c.URL == nil { + return nil, errors.Error("no url") + } + + switch s := c.URL.Scheme; s { + case "http", "https", "file": + // Go on. + default: + return nil, fmt.Errorf("bad url scheme: %q", s) + } + + return &Filter{ + url: c.URL, + name: c.Name, + uid: c.UID, + urlFilterID: c.URLFilterID, + enabled: c.Enabled, + }, nil +} + +// Refresh updates the data in the rule-list filter. parseBuf is the initial +// buffer used to parse information from the data. cli and maxSize are only +// used when f is a URL-based list. +func (f *Filter) Refresh( + ctx context.Context, + parseBuf []byte, + cli *http.Client, + cacheDir string, + maxSize datasize.ByteSize, +) (parseRes *ParseResult, err error) { + cachePath := filepath.Join(cacheDir, f.uid.String()+".txt") + + switch s := f.url.Scheme; s { + case "http", "https": + parseRes, err = f.setFromHTTP(ctx, parseBuf, cli, cachePath, maxSize.Bytes()) + case "file": + parseRes, err = f.setFromFile(parseBuf, f.url.Path, cachePath) + default: + // Since the URL has been prevalidated in New, consider this a + // programmer error. 
+ panic(fmt.Errorf("bad url scheme: %q", s)) + } + if err != nil { + // Don't wrap the error, because it's informative enough as is. + return nil, err + } + + if f.checksum != parseRes.Checksum { + f.checksum = parseRes.Checksum + f.rulesCount = parseRes.RulesCount + f.setName(parseRes.Title) + f.updated = time.Now() + } + + return parseRes, nil +} + +// setFromHTTP sets the rule-list filter's data from its URL. It also caches +// the data into a file. +func (f *Filter) setFromHTTP( + ctx context.Context, + parseBuf []byte, + cli *http.Client, + cachePath string, + maxSize uint64, +) (parseRes *ParseResult, err error) { + defer func() { err = errors.Annotate(err, "setting from http: %w") }() + + text, parseRes, err := f.readFromHTTP(ctx, parseBuf, cli, cachePath, maxSize) + if err != nil { + // Don't wrap the error, because it's informative enough as is. + return nil, err + } + + // TODO(a.garipov): Add filterlist.BytesRuleList. + f.ruleList = &filterlist.StringRuleList{ + ID: f.urlFilterID, + RulesText: text, + IgnoreCosmetic: true, + } + + return parseRes, nil +} + +// readFromHTTP reads the data from the rule-list filter's URL into the cache +// file as well as returns it as a string. The data is filtered through a +// parser and so is free from comments, unnecessary whitespace, etc. +func (f *Filter) readFromHTTP( + ctx context.Context, + parseBuf []byte, + cli *http.Client, + cachePath string, + maxSize uint64, +) (text string, parseRes *ParseResult, err error) { + urlStr := f.url.String() + req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil) + if err != nil { + return "", nil, fmt.Errorf("making request for http url %q: %w", urlStr, err) + } + + resp, err := cli.Do(req) + if err != nil { + return "", nil, fmt.Errorf("requesting from http url: %w", err) + } + defer func() { err = errors.WithDeferred(err, resp.Body.Close()) }() + + // TODO(a.garipov): Use [agdhttp.CheckStatus] when it's moved to golibs. 
+	if resp.StatusCode != http.StatusOK {
+		return "", nil, fmt.Errorf("got status code %d, want %d", resp.StatusCode, http.StatusOK)
+	}
+
+	fltFile, err := aghrenameio.NewPendingFile(cachePath, 0o644)
+	if err != nil {
+		return "", nil, fmt.Errorf("creating temp file: %w", err)
+	}
+	defer func() { err = aghrenameio.WithDeferredCleanup(err, fltFile) }()
+
+	buf := &bytes.Buffer{}
+	mw := io.MultiWriter(buf, fltFile)
+
+	parser := NewParser()
+	httpBody := ioutil.LimitReader(resp.Body, maxSize)
+	parseRes, err = parser.Parse(mw, httpBody, parseBuf)
+	if err != nil {
+		return "", nil, fmt.Errorf("parsing response from http url %q: %w", urlStr, err)
+	}
+
+	return buf.String(), parseRes, nil
+}
+
+// setName sets the name using either the already-present name, the given title
+// from the rule-list data, or a synthetic name.
+func (f *Filter) setName(title string) {
+	if f.name != "" {
+		return
+	}
+
+	if title != "" {
+		f.name = title
+
+		return
+	}
+
+	f.name = fmt.Sprintf("List %s", f.uid)
+}
+
+// setFromFile sets the rule-list filter's data from a file path. It also
+// caches the data into a file.
+//
+// TODO(a.garipov): Retest on Windows once rule-list updater is committed. See
+// if calling Close is necessary here.
+func (f *Filter) setFromFile(
+	parseBuf []byte,
+	filePath string,
+	cachePath string,
+) (parseRes *ParseResult, err error) {
+	defer func() { err = errors.Annotate(err, "setting from file: %w") }()
+
+	parseRes, err = parseIntoCache(parseBuf, filePath, cachePath)
+	if err != nil {
+		// Don't wrap the error, because it's informative enough as is.
+		return nil, err
+	}
+
+	err = f.Close()
+	if err != nil {
+		return nil, fmt.Errorf("closing old rule list: %w", err)
+	}
+
+	rl, err := filterlist.NewFileRuleList(f.urlFilterID, cachePath, true)
+	if err != nil {
+		return nil, fmt.Errorf("opening new rule list: %w", err)
+	}
+
+	f.ruleList = rl
+
+	return parseRes, nil
+}
+
+// parseIntoCache copies the relevant data from filePath into cachePath
+// while also parsing it.
+func parseIntoCache(
+	parseBuf []byte,
+	filePath string,
+	cachePath string,
+) (parseRes *ParseResult, err error) {
+	tmpFile, err := aghrenameio.NewPendingFile(cachePath, 0o644)
+	if err != nil {
+		return nil, fmt.Errorf("creating temp file: %w", err)
+	}
+	defer func() { err = aghrenameio.WithDeferredCleanup(err, tmpFile) }()
+
+	// #nosec G304 -- Assume that cachePath is always cacheDir joined with a
+	// uid using [filepath.Join].
+	f, err := os.Open(filePath)
+	if err != nil {
+		return nil, fmt.Errorf("opening src file: %w", err)
+	}
+	defer func() { err = errors.WithDeferred(err, f.Close()) }()
+
+	parser := NewParser()
+	parseRes, err = parser.Parse(tmpFile, f, parseBuf)
+	if err != nil {
+		return nil, fmt.Errorf("copying src file: %w", err)
+	}
+
+	return parseRes, nil
+}
+
+// Close closes the underlying rule list.
+func (f *Filter) Close() (err error) {
+	if f.ruleList == nil {
+		return nil
+	}
+
+	return f.ruleList.Close()
+}
+
+// filterUpdate represents a single ongoing rule-list filter update.
+//
+//lint:ignore U1000 TODO(a.garipov): Use.
+type filterUpdate struct {
+	httpCli  *http.Client
+	cacheDir string
+	name     string
+	parseBuf []byte
+	maxSize  datasize.ByteSize
+}
+
+// process runs an update of a single rule-list.
+func (u *filterUpdate) process(ctx context.Context, f *Filter) (err error) { + prevChecksum := f.checksum + parseRes, err := f.Refresh(ctx, u.parseBuf, u.httpCli, u.cacheDir, u.maxSize) + if err != nil { + return fmt.Errorf("updating %s: %w", f.uid, err) + } + + if prevChecksum == parseRes.Checksum { + log.Info("filtering: filter %q: filter %q: no change", u.name, f.uid) + + return nil + } + + log.Info( + "filtering: updated filter %q: filter %q: %d bytes, %d rules", + u.name, + f.uid, + parseRes.BytesWritten, + parseRes.RulesCount, + ) + + return nil +} diff --git a/internal/filtering/rulelist/filter_test.go b/internal/filtering/rulelist/filter_test.go new file mode 100644 index 00000000..93cd6e9c --- /dev/null +++ b/internal/filtering/rulelist/filter_test.go @@ -0,0 +1,107 @@ +package rulelist_test + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "net/url" + "os" + "path/filepath" + "testing" + + "github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist" + "github.com/AdguardTeam/golibs/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestFilter_Refresh(t *testing.T) { + cacheDir := t.TempDir() + uid := rulelist.MustNewUID() + + initialFile := filepath.Join(cacheDir, "initial.txt") + initialData := []byte( + testRuleTextTitle + + testRuleTextBlocked, + ) + writeErr := os.WriteFile(initialFile, initialData, 0o644) + require.NoError(t, writeErr) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + pt := testutil.PanicT{} + + _, err := io.WriteString(w, testRuleTextTitle+testRuleTextBlocked) + require.NoError(pt, err) + })) + + srvURL, urlErr := url.Parse(srv.URL) + require.NoError(t, urlErr) + + testCases := []struct { + url *url.URL + name string + wantNewErrMsg string + }{{ + url: nil, + name: "nil_url", + wantNewErrMsg: "no url", + }, { + url: &url.URL{ + Scheme: "ftp", + }, + name: "bad_scheme", + wantNewErrMsg: `bad url scheme: "ftp"`, + }, { 
+ name: "file", + url: &url.URL{ + Scheme: "file", + Path: initialFile, + }, + wantNewErrMsg: "", + }, { + name: "http", + url: srvURL, + wantNewErrMsg: "", + }} + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + f, err := rulelist.NewFilter(&rulelist.FilterConfig{ + URL: tc.url, + Name: tc.name, + UID: uid, + URLFilterID: testURLFilterID, + Enabled: true, + }) + if tc.wantNewErrMsg != "" { + assert.EqualError(t, err, tc.wantNewErrMsg) + + return + } + + testutil.CleanupAndRequireSuccess(t, f.Close) + + require.NotNil(t, f) + + ctx, cancel := context.WithTimeout(context.Background(), testTimeout) + t.Cleanup(cancel) + + buf := make([]byte, rulelist.DefaultRuleBufSize) + cli := &http.Client{ + Timeout: testTimeout, + } + + res, err := f.Refresh(ctx, buf, cli, cacheDir, rulelist.DefaultMaxRuleListSize) + require.NoError(t, err) + + assert.Equal(t, testTitle, res.Title) + assert.Equal(t, len(testRuleTextBlocked), res.BytesWritten) + assert.Equal(t, 1, res.RulesCount) + + // Check that the cached file exists. + _, err = os.Stat(filepath.Join(cacheDir, uid.String()+".txt")) + require.NoError(t, err) + }) + } +} diff --git a/internal/filtering/rulelist/parser_test.go b/internal/filtering/rulelist/parser_test.go index 5554458d..45a8e465 100644 --- a/internal/filtering/rulelist/parser_test.go +++ b/internal/filtering/rulelist/parser_test.go @@ -69,12 +69,12 @@ func TestParser_Parse(t *testing.T) { wantWritten: len(testRuleTextBlocked) + len(testRuleTextHTML), }, { name: "title", - in: "! Title: Test Title \n" + + in: testRuleTextTitle + "! Title: Bad, Ignored Title\n" + testRuleTextBlocked, wantDst: testRuleTextBlocked, wantErrMsg: "", - wantTitle: "Test Title", + wantTitle: testTitle, wantRulesNum: 1, wantWritten: len(testRuleTextBlocked), }, { @@ -87,14 +87,14 @@ func TestParser_Parse(t *testing.T) { wantWritten: len(testRuleTextCosmetic), }, { name: "bad_char", - in: "! 
Title: Test Title \n" + + in: testRuleTextTitle + testRuleTextBlocked + ">>>\x7F<<<", wantDst: testRuleTextBlocked, wantErrMsg: "line 3: " + "character 4: " + "likely binary character '\\x7f'", - wantTitle: "Test Title", + wantTitle: testTitle, wantRulesNum: 1, wantWritten: len(testRuleTextBlocked), }, { diff --git a/internal/filtering/rulelist/rulelist.go b/internal/filtering/rulelist/rulelist.go index 464650a1..e0fd61b4 100644 --- a/internal/filtering/rulelist/rulelist.go +++ b/internal/filtering/rulelist/rulelist.go @@ -1,9 +1,55 @@ // Package rulelist contains the implementation of the standard rule-list // filter that wraps an urlfilter filtering-engine. // -// TODO(a.garipov): Expand. +// TODO(a.garipov): Add a new update worker. package rulelist +import ( + "fmt" + + "github.com/c2h5oh/datasize" + "github.com/google/uuid" +) + // DefaultRuleBufSize is the default length of a buffer used to read a line with // a filtering rule, in bytes. +// +// TODO(a.garipov): Consider using [datasize.ByteSize]. It is currently only +// used as an int. const DefaultRuleBufSize = 1024 + +// DefaultMaxRuleListSize is the default maximum filtering-rule list size. +const DefaultMaxRuleListSize = 64 * datasize.MB + +// URLFilterID is a semantic type-alias for IDs used for working with package +// urlfilter. +type URLFilterID = int + +// UID is the type for the unique IDs of filtering-rule lists. +type UID uuid.UUID + +// NewUID returns a new filtering-rule list UID. Any error returned is an error +// from the cryptographic randomness reader. +func NewUID() (uid UID, err error) { + uuidv7, err := uuid.NewV7() + + return UID(uuidv7), err +} + +// MustNewUID is a wrapper around [NewUID] that panics if there is an error. +func MustNewUID() (uid UID) { + uid, err := NewUID() + if err != nil { + panic(fmt.Errorf("unexpected uuidv7 error: %w", err)) + } + + return uid +} + +// type check +var _ fmt.Stringer = UID{} + +// String implements the [fmt.Stringer] interface for UID. 
+func (id UID) String() (s string) { + return uuid.UUID(id).String() +} diff --git a/internal/filtering/rulelist/rulelist_test.go b/internal/filtering/rulelist/rulelist_test.go index aec6f33b..dc79d503 100644 --- a/internal/filtering/rulelist/rulelist_test.go +++ b/internal/filtering/rulelist/rulelist_test.go @@ -1,16 +1,34 @@ package rulelist_test -import "time" +import ( + "testing" + "time" + + "github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist" + "github.com/AdguardTeam/golibs/testutil" +) + +func TestMain(m *testing.M) { + testutil.DiscardLogOutput(m) +} // testTimeout is the common timeout for tests. const testTimeout = 1 * time.Second -// Common texts for tests. +// testURLFilterID is the common [rulelist.URLFilterID] for tests. +const testURLFilterID rulelist.URLFilterID = 1 + +// testTitle is the common title for tests. +const testTitle = "Test Title" + +// Common rule texts for tests. const ( testRuleTextBadTab = "||bad-tab-and-comment.example^\t# A comment.\n" testRuleTextBlocked = "||blocked.example^\n" + testRuleTextBlocked2 = "||blocked-2.example^\n" testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n" testRuleTextHTML = "\n" + testRuleTextTitle = "! Title: " + testTitle + " \n" // testRuleTextCosmetic is a cosmetic rule with a zero-width non-joiner. //