Pull request: 5258-good-old-filters

Merge in DNS/adguard-home from 5258-good-old-filters to master

Updates #5258.

Squashed commit of the following:

commit 8555e685a104713e552f017de63281749f41b6b2
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Tue Dec 20 16:07:52 2022 +0400

    filtering: imp tests, docs

commit 2ecfc18fc69850a06461620a24527158603cd7b8
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Tue Dec 20 11:00:59 2022 +0400

    filtering: fix docs

commit 1ea8d45a85f3fb6794b44134e8fdcbe2044d2199
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 23:19:37 2022 +0400

    filtering: imp naming, docs

commit c52a3bba48738c002111c234fb4c312380e49cfc
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 23:13:37 2022 +0400

    filtering: imp logic

commit 3ad4276ace40f05db47b49fb033d1b0fa208ec4e
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 17:49:15 2022 +0400

    filtering: imp docs

commit 1bc3cc443bc8ec988532effaaf5f50474a1a69ab
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 17:45:37 2022 +0400

    filtering: imp more

commit 7908339a0c9fcc29e8fe12b6c5d8c14bbfa51364
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 16:57:42 2022 +0400

    filtering: imp code

commit 21bbd18b4ded83f354210ac32010d8fd1073452f
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 12:11:21 2022 +0400

    filtering: imp src reading
This commit is contained in:
Eugene Burkov 2022-12-20 16:40:42 +03:00
parent de08ef0077
commit 48cbc7bdf0
4 changed files with 168 additions and 166 deletions

View File

@ -2,6 +2,7 @@ package filtering
import ( import (
"bufio" "bufio"
"bytes"
"fmt" "fmt"
"hash/crc32" "hash/crc32"
"io" "io"
@ -12,6 +13,7 @@ import (
"strings" "strings"
"time" "time"
"github.com/AdguardTeam/AdGuardHome/internal/aghalg"
"github.com/AdguardTeam/golibs/errors" "github.com/AdguardTeam/golibs/errors"
"github.com/AdguardTeam/golibs/log" "github.com/AdguardTeam/golibs/log"
"github.com/AdguardTeam/golibs/stringutil" "github.com/AdguardTeam/golibs/stringutil"
@ -134,8 +136,8 @@ func (d *DNSFilter) filterSetProperties(
// TODO(e.burkov): The validation of the contents of the new URL is // TODO(e.burkov): The validation of the contents of the new URL is
// currently skipped if the rule list is disabled. This makes it // currently skipped if the rule list is disabled. This makes it
// possible to set a bad rules source, but the validation should still // possible to set a bad rules source, but the validation should still
// kick in when the filter is enabled. Consider making changing this // kick in when the filter is enabled. Consider changing this behavior
// behavior to be stricter. // to be stricter.
filt.unload() filt.unload()
} }
@ -269,10 +271,10 @@ func (d *DNSFilter) periodicallyRefreshFilters() {
// already going on. // already going on.
// //
// TODO(e.burkov): Get rid of the concurrency pattern which requires the // TODO(e.burkov): Get rid of the concurrency pattern which requires the
// sync.Mutex.TryLock. // [sync.Mutex.TryLock].
func (d *DNSFilter) tryRefreshFilters(block, allow, force bool) (updated int, isNetworkErr, ok bool) { func (d *DNSFilter) tryRefreshFilters(block, allow, force bool) (updated int, isNetworkErr, ok bool) {
if ok = d.refreshLock.TryLock(); !ok { if ok = d.refreshLock.TryLock(); !ok {
return 0, false, ok return 0, false, false
} }
defer d.refreshLock.Unlock() defer d.refreshLock.Unlock()
@ -427,52 +429,124 @@ func (d *DNSFilter) refreshFiltersIntl(block, allow, force bool) (int, bool) {
return updNum, false return updNum, false
} }
// Allows printable UTF-8 text with CR, LF, TAB characters // isPrintableText returns true if data is printable UTF-8 text with CR, LF, TAB
func isPrintableText(data []byte, len int) bool { // characters.
for i := 0; i < len; i++ { //
c := data[i] // TODO(e.burkov): Investigate the purpose of this and improve the
// implementation. Perhaps, use something from the unicode package.
func isPrintableText(data string) (ok bool) {
for _, c := range []byte(data) {
if (c >= ' ' && c != 0x7f) || c == '\n' || c == '\r' || c == '\t' { if (c >= ' ' && c != 0x7f) || c == '\n' || c == '\r' || c == '\t' {
continue continue
} }
return false return false
} }
return true return true
} }
// A helper function that parses filter contents and returns a number of rules and a filter name (if there's any) // scanLinesWithBreak is essentially a [bufio.ScanLines] which keeps trailing
func (d *DNSFilter) parseFilterContents(file io.Reader) (int, uint32, string) { // line breaks.
rulesCount := 0 func scanLinesWithBreak(data []byte, atEOF bool) (advance int, token []byte, err error) {
name := "" if atEOF && len(data) == 0 {
seenTitle := false return 0, nil, nil
r := bufio.NewReader(file) }
checksum := uint32(0)
if i := bytes.IndexByte(data, '\n'); i >= 0 {
return i + 1, data[0 : i+1], nil
}
if atEOF {
return len(data), data, nil
}
// Request more data.
return 0, nil, nil
}
// parseFilter copies filter's content from src to dst and returns the number of
// rules, name, number of bytes written, checksum, and title of the parsed list.
// dst must not be nil.
func (d *DNSFilter) parseFilter(
src io.Reader,
dst io.Writer,
) (rulesNum, written int, checksum uint32, title string, err error) {
scanner := bufio.NewScanner(src)
scanner.Split(scanLinesWithBreak)
titleFound := false
for n := 0; scanner.Scan(); written += n {
line := scanner.Text()
var isRule bool
var likelyTitle string
isRule, likelyTitle, err = d.parseFilterLine(line, !titleFound, written == 0)
if err != nil {
return 0, written, 0, "", err
}
if isRule {
rulesNum++
} else if likelyTitle != "" {
title, titleFound = likelyTitle, true
}
for {
line, err := r.ReadString('\n')
checksum = crc32.Update(checksum, crc32.IEEETable, []byte(line)) checksum = crc32.Update(checksum, crc32.IEEETable, []byte(line))
line = strings.TrimSpace(line) n, err = dst.Write([]byte(line))
if len(line) == 0 {
//
} else if line[0] == '!' {
m := d.filterTitleRegexp.FindAllStringSubmatch(line, -1)
if len(m) > 0 && len(m[0]) >= 2 && !seenTitle {
name = m[0][1]
seenTitle = true
}
} else if line[0] == '#' {
//
} else {
rulesCount++
}
if err != nil { if err != nil {
break return 0, written, 0, "", fmt.Errorf("writing filter line: %w", err)
} }
} }
return rulesCount, checksum, name if err = scanner.Err(); err != nil {
return 0, written, 0, "", fmt.Errorf("scanning filter contents: %w", err)
}
return rulesNum, written, checksum, title, nil
}
// parseFilterLine returns true if the passed line is a rule. line is
// considered a rule if it's not a comment and contains no title.
func (d *DNSFilter) parseFilterLine(
line string,
lookForTitle bool,
testHTML bool,
) (isRule bool, title string, err error) {
if !isPrintableText(line) {
return false, "", errors.Error("filter contains non-printable characters")
}
line = strings.TrimSpace(line)
if line == "" || line[0] == '#' {
return false, "", nil
}
if testHTML && isHTML(line) {
return false, "", errors.Error("data is HTML, not plain text")
}
if line[0] == '!' && lookForTitle {
match := d.filterTitleRegexp.FindStringSubmatch(line)
if len(match) > 1 {
title = match[1]
}
return false, title, nil
}
return true, "", nil
}
// isHTML returns true if the line contains HTML tags instead of plain text.
// line shouldn have no leading space symbols.
//
// TODO(ameshkov): It actually gives too much false-positives. Perhaps, just
// check if trimmed string begins with angle bracket.
func isHTML(line string) (ok bool) {
line = strings.ToLower(line)
return strings.HasPrefix(line, "<html") || strings.HasPrefix(line, "<!doctype")
} }
// Perform upgrade on a filter and update LastUpdated value // Perform upgrade on a filter and update LastUpdated value
@ -485,57 +559,10 @@ func (d *DNSFilter) update(filter *FilterYAML) (bool, error) {
log.Error("os.Chtimes(): %v", e) log.Error("os.Chtimes(): %v", e)
} }
} }
return b, err return b, err
} }
func (d *DNSFilter) read(reader io.Reader, tmpFile *os.File, filter *FilterYAML) (int, error) {
htmlTest := true
firstChunk := make([]byte, 4*1024)
firstChunkLen := 0
buf := make([]byte, 64*1024)
total := 0
for {
n, err := reader.Read(buf)
total += n
if htmlTest {
num := len(firstChunk) - firstChunkLen
if n < num {
num = n
}
copied := copy(firstChunk[firstChunkLen:], buf[:num])
firstChunkLen += copied
if firstChunkLen == len(firstChunk) || err == io.EOF {
if !isPrintableText(firstChunk, firstChunkLen) {
return total, fmt.Errorf("data contains non-printable characters")
}
s := strings.ToLower(string(firstChunk))
if strings.Contains(s, "<html") || strings.Contains(s, "<!doctype") {
return total, fmt.Errorf("data is HTML, not plain text")
}
htmlTest = false
firstChunk = nil
}
}
_, err2 := tmpFile.Write(buf[:n])
if err2 != nil {
return total, err2
}
if err == io.EOF {
return total, nil
}
if err != nil {
log.Printf("Couldn't fetch filter contents from URL %s, skipping: %s", filter.URL, err)
return total, err
}
}
}
// finalizeUpdate closes and gets rid of temporary file f with filter's content // finalizeUpdate closes and gets rid of temporary file f with filter's content
// according to updated. It also saves new values of flt's name, rules number // according to updated. It also saves new values of flt's name, rules number
// and checksum if sucсeeded. // and checksum if sucсeeded.
@ -552,7 +579,8 @@ func (d *DNSFilter) finalizeUpdate(
// Close the file before renaming it because it's required on Windows. // Close the file before renaming it because it's required on Windows.
// //
// See https://github.com/adguardTeam/adGuardHome/issues/1553. // See https://github.com/adguardTeam/adGuardHome/issues/1553.
if err = file.Close(); err != nil { err = file.Close()
if err != nil {
return fmt.Errorf("closing temporary file: %w", err) return fmt.Errorf("closing temporary file: %w", err)
} }
@ -564,38 +592,18 @@ func (d *DNSFilter) finalizeUpdate(
log.Printf("saving filter %d contents to: %s", flt.ID, flt.Path(d.DataDir)) log.Printf("saving filter %d contents to: %s", flt.ID, flt.Path(d.DataDir))
if err = os.Rename(tmpFileName, flt.Path(d.DataDir)); err != nil { // Don't use renamio or maybe packages, since those will require loading the
// whole filter content to the memory on Windows.
err = os.Rename(tmpFileName, flt.Path(d.DataDir))
if err != nil {
return errors.WithDeferred(err, os.Remove(tmpFileName)) return errors.WithDeferred(err, os.Remove(tmpFileName))
} }
flt.Name = stringutil.Coalesce(flt.Name, name) flt.Name, flt.checksum, flt.RulesCount = aghalg.Coalesce(flt.Name, name), cs, rnum
flt.checksum = cs
flt.RulesCount = rnum
return nil return nil
} }
// processUpdate copies filter's content from src to dst and returns the name,
// rules number, and checksum for it. It also returns the number of bytes read
// from src.
func (d *DNSFilter) processUpdate(
src io.Reader,
dst *os.File,
flt *FilterYAML,
) (name string, rnum int, cs uint32, n int, err error) {
if n, err = d.read(src, dst, flt); err != nil {
return "", 0, 0, 0, err
}
if _, err = dst.Seek(0, io.SeekStart); err != nil {
return "", 0, 0, 0, err
}
rnum, cs, name = d.parseFilterContents(dst)
return name, rnum, cs, n, nil
}
// updateIntl updates the flt rewriting it's actual file. It returns true if // updateIntl updates the flt rewriting it's actual file. It returns true if
// the actual update has been performed. // the actual update has been performed.
func (d *DNSFilter) updateIntl(flt *FilterYAML) (ok bool, err error) { func (d *DNSFilter) updateIntl(flt *FilterYAML) (ok bool, err error) {
@ -612,31 +620,21 @@ func (d *DNSFilter) updateIntl(flt *FilterYAML) (ok bool, err error) {
} }
defer func() { defer func() {
err = errors.WithDeferred(err, d.finalizeUpdate(tmpFile, flt, ok, name, rnum, cs)) err = errors.WithDeferred(err, d.finalizeUpdate(tmpFile, flt, ok, name, rnum, cs))
ok = ok && err == nil if ok && err == nil {
if ok {
log.Printf("updated filter %d: %d bytes, %d rules", flt.ID, n, rnum) log.Printf("updated filter %d: %d bytes, %d rules", flt.ID, n, rnum)
} }
}() }()
// Change the default 0o600 permission to something more acceptable by // Change the default 0o600 permission to something more acceptable by end
// end users. // users.
// //
// See https://github.com/AdguardTeam/AdGuardHome/issues/3198. // See https://github.com/AdguardTeam/AdGuardHome/issues/3198.
if err = tmpFile.Chmod(0o644); err != nil { if err = tmpFile.Chmod(0o644); err != nil {
return false, fmt.Errorf("changing file mode: %w", err) return false, fmt.Errorf("changing file mode: %w", err)
} }
var r io.Reader var rc io.ReadCloser
if filepath.IsAbs(flt.URL) { if !filepath.IsAbs(flt.URL) {
var file io.ReadCloser
file, err = os.Open(flt.URL)
if err != nil {
return false, fmt.Errorf("open file: %w", err)
}
defer func() { err = errors.WithDeferred(err, file.Close()) }()
r = file
} else {
var resp *http.Response var resp *http.Response
resp, err = d.HTTPClient.Get(flt.URL) resp, err = d.HTTPClient.Get(flt.URL)
if err != nil { if err != nil {
@ -649,24 +647,30 @@ func (d *DNSFilter) updateIntl(flt *FilterYAML) (ok bool, err error) {
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
log.Printf("got status code %d from %s, skip", resp.StatusCode, flt.URL) log.Printf("got status code %d from %s, skip", resp.StatusCode, flt.URL)
return false, fmt.Errorf("got status code != 200: %d", resp.StatusCode) return false, fmt.Errorf("got status code %d, want %d", resp.StatusCode, http.StatusOK)
} }
r = resp.Body rc = resp.Body
} else {
rc, err = os.Open(flt.URL)
if err != nil {
return false, fmt.Errorf("open file: %w", err)
}
defer func() { err = errors.WithDeferred(err, rc.Close()) }()
} }
name, rnum, cs, n, err = d.processUpdate(r, tmpFile, flt) rnum, n, cs, name, err = d.parseFilter(rc, tmpFile)
return cs != flt.checksum, err return cs != flt.checksum && err == nil, err
} }
// loads filter contents from the file in dataDir // loads filter contents from the file in dataDir
func (d *DNSFilter) load(filter *FilterYAML) (err error) { func (d *DNSFilter) load(flt *FilterYAML) (err error) {
filterFilePath := filter.Path(d.DataDir) fileName := flt.Path(d.DataDir)
log.Tracef("filtering: loading filter %d from %s", filter.ID, filterFilePath) log.Debug("filtering: loading filter %d from %s", flt.ID, fileName)
file, err := os.Open(filterFilePath) file, err := os.Open(fileName)
if errors.Is(err, os.ErrNotExist) { if errors.Is(err, os.ErrNotExist) {
// Do nothing, file doesn't exist. // Do nothing, file doesn't exist.
return nil return nil
@ -680,13 +684,14 @@ func (d *DNSFilter) load(filter *FilterYAML) (err error) {
return fmt.Errorf("getting filter file stat: %w", err) return fmt.Errorf("getting filter file stat: %w", err)
} }
log.Tracef("filtering: File %s, id %d, length %d", filterFilePath, filter.ID, st.Size()) log.Debug("filtering: file %s, id %d, length %d", fileName, flt.ID, st.Size())
rulesCount, checksum, _ := d.parseFilterContents(file) rulesCount, _, checksum, _, err := d.parseFilter(file, io.Discard)
if err != nil {
return fmt.Errorf("parsing filter file: %w", err)
}
filter.RulesCount = rulesCount flt.RulesCount, flt.checksum, flt.LastUpdated = rulesCount, checksum, st.ModTime()
filter.checksum = checksum
filter.LastUpdated = st.ModTime()
return nil return nil
} }

View File

@ -4,33 +4,23 @@ import (
"io/fs" "io/fs"
"net" "net"
"net/http" "net/http"
"net/netip"
"net/url" "net/url"
"os" "os"
"path/filepath" "path/filepath"
"testing" "testing"
"time" "time"
"github.com/AdguardTeam/golibs/netutil" "github.com/AdguardTeam/AdGuardHome/internal/aghhttp"
"github.com/AdguardTeam/golibs/testutil" "github.com/AdguardTeam/golibs/testutil"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
// serveFiltersLocally is a helper that concurrently listens on a free port to // serveHTTPLocally starts a new HTTP server, that handles its index with h. It
// respond with fltContent. It also gracefully closes the listener when the // also gracefully closes the listener when the test under t finishes.
// test under t finishes. func serveHTTPLocally(t *testing.T, h http.Handler) (urlStr string) {
func serveFiltersLocally(t *testing.T, fltContent []byte) (ipp netip.AddrPort) {
t.Helper() t.Helper()
h := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
pt := testutil.PanicT{}
n, werr := w.Write(fltContent)
require.NoError(pt, werr)
require.Equal(pt, len(fltContent), n)
})
l, err := net.Listen("tcp", ":0") l, err := net.Listen("tcp", ":0")
require.NoError(t, err) require.NoError(t, err)
@ -38,9 +28,26 @@ func serveFiltersLocally(t *testing.T, fltContent []byte) (ipp netip.AddrPort) {
testutil.CleanupAndRequireSuccess(t, l.Close) testutil.CleanupAndRequireSuccess(t, l.Close)
addr := l.Addr() addr := l.Addr()
require.IsType(t, new(net.TCPAddr), addr) require.IsType(t, (*net.TCPAddr)(nil), addr)
return netip.AddrPortFrom(netutil.IPv4Localhost(), uint16(addr.(*net.TCPAddr).Port)) return (&url.URL{
Scheme: aghhttp.SchemeHTTP,
Host: addr.String(),
}).String()
}
// serveFiltersLocally is a helper that concurrently listens on a free port to
// respond with fltContent.
func serveFiltersLocally(t *testing.T, fltContent []byte) (urlStr string) {
t.Helper()
return serveHTTPLocally(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
pt := testutil.PanicT{}
n, werr := w.Write(fltContent)
require.NoError(pt, werr)
require.Equal(pt, len(fltContent), n)
}))
} }
func TestFilters(t *testing.T) { func TestFilters(t *testing.T) {
@ -65,10 +72,7 @@ func TestFilters(t *testing.T) {
require.NoError(t, err) require.NoError(t, err)
f := &FilterYAML{ f := &FilterYAML{
URL: (&url.URL{ URL: addr,
Scheme: "http",
Host: addr.String(),
}).String(),
} }
updateAndAssert := func(t *testing.T, want require.BoolAssertionFunc, wantRulesCount int) { updateAndAssert := func(t *testing.T, want require.BoolAssertionFunc, wantRulesCount int) {
@ -103,11 +107,7 @@ func TestFilters(t *testing.T) {
anotherContent := []byte(`||example.com^`) anotherContent := []byte(`||example.com^`)
oldURL := f.URL oldURL := f.URL
ipp := serveFiltersLocally(t, anotherContent) f.URL = serveFiltersLocally(t, anotherContent)
f.URL = (&url.URL{
Scheme: "http",
Host: ipp.String(),
}).String()
t.Cleanup(func() { f.URL = oldURL }) t.Cleanup(func() { f.URL = oldURL })
updateAndAssert(t, require.True, 1) updateAndAssert(t, require.True, 1)

View File

@ -190,6 +190,8 @@ type DNSFilter struct {
// filterTitleRegexp is the regular expression to retrieve a name of a // filterTitleRegexp is the regular expression to retrieve a name of a
// filter list. // filter list.
//
// TODO(e.burkov): Don't use regexp for such a simple text processing task.
filterTitleRegexp *regexp.Regexp filterTitleRegexp *regexp.Regexp
hostCheckers []hostChecker hostCheckers []hostChecker

View File

@ -5,7 +5,6 @@ import (
"encoding/json" "encoding/json"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"net/url"
"testing" "testing"
"time" "time"
@ -30,11 +29,7 @@ func TestDNSFilter_handleFilteringSetURL(t *testing.T) {
endpoint: &badRulesEndpoint, endpoint: &badRulesEndpoint,
content: []byte(`<html></html>`), content: []byte(`<html></html>`),
}} { }} {
ipp := serveFiltersLocally(t, rulesSource.content) *rulesSource.endpoint = serveFiltersLocally(t, rulesSource.content)
*rulesSource.endpoint = (&url.URL{
Scheme: "http",
Host: ipp.String(),
}).String()
} }
testCases := []struct { testCases := []struct {