From 21972e49cbbe1d2f1ca1c9033ae9510ba7cf3d35 Mon Sep 17 00:00:00 2001 From: Ainar Garipov Date: Thu, 20 May 2021 13:42:35 +0300 Subject: [PATCH] Pull request: querylog: imp perf Merge in DNS/adguard-home from contains-fold to master Squashed commit of the following: commit 45c79b4b7618c8f3108766cc776b5bd3f0571761 Author: Ainar Garipov Date: Wed May 19 21:26:09 2021 +0300 querylog: imp perf --- internal/querylog/searchcriterion.go | 50 ++++++--- internal/querylog/searchcriterion_test.go | 121 ++++++++++++++++++++++ 2 files changed, 158 insertions(+), 13 deletions(-) create mode 100644 internal/querylog/searchcriterion_test.go diff --git a/internal/querylog/searchcriterion.go b/internal/querylog/searchcriterion.go index c0aca44f..725a36fe 100644 --- a/internal/querylog/searchcriterion.go +++ b/internal/querylog/searchcriterion.go @@ -2,6 +2,8 @@ package querylog import ( "strings" + "unicode" + "unicode/utf8" "github.com/AdguardTeam/AdGuardHome/internal/dnsfilter" ) @@ -63,6 +65,37 @@ func (c *searchCriterion) ctDomainOrClientCaseStrict( strings.EqualFold(name, term) } +// containsFold reports whether s contains, ignoring letter case, substr. +// +// TODO(a.garipov): Move to aghstrings if needed elsewhere. 
// containsFold reports whether s contains substr, ignoring letter case.  Runes
// are compared under Unicode simple case folding, the same relation that
// strings.EqualFold uses, so every member of a fold orbit matches every other
// one: for example 's', 'S', and 'ſ' (U+017F), or 'k', 'K', and the Kelvin sign
// (U+212A).  An empty substr is contained in any s.
//
// It is called by (*searchCriterion).ctDomainOrClientCaseNonStrict for the
// client ID, host, IP, and name against the search term, in place of the
// previous strings.ToLower + strings.Contains matching, so that no lowercased
// copies of the arguments have to be built.
//
// TODO(a.garipov): Move to aghstrings if needed elsewhere.
func containsFold(s, substr string) (ok bool) {
	if substr == "" {
		return true
	}

	first, _ := utf8.DecodeRuneInString(substr)

	// Accept any rune from the fold orbit of first.  Checking only first and
	// unicode.SimpleFold(first) is not enough: SimpleFold('s') is 'ſ' and
	// SimpleFold('k') is the Kelvin sign, so a lowercase term would never
	// find 'S' or 'K' in s.
	isFirst := func(r rune) (eq bool) {
		return equalFoldRune(r, first)
	}

	for {
		i := strings.IndexFunc(s, isFirst)
		if i == -1 {
			return false
		}

		s = s[i:]
		if hasPrefixFold(s, substr) {
			return true
		}

		// Skip the candidate rune that has just been rejected.  s is not
		// empty here, so size is at least one and the loop makes progress.
		_, size := utf8.DecodeRuneInString(s)
		s = s[size:]
	}
}

// hasPrefixFold reports whether s begins with prefix under simple case folding.
// The comparison is done rune by rune and not on a byte window of len(prefix),
// because two runes that fold to each other may have different UTF-8 lengths.
func hasPrefixFold(s, prefix string) (ok bool) {
	for _, want := range prefix {
		got, size := utf8.DecodeRuneInString(s)
		if size == 0 || !equalFoldRune(got, want) {
			return false
		}

		s = s[size:]
	}

	return true
}

// equalFoldRune reports whether a and b are equal under simple case folding.
func equalFoldRune(a, b rune) (ok bool) {
	if a == b {
		return true
	}

	// unicode.SimpleFold walks the fold orbit and eventually returns to its
	// starting rune; for runes without case variants it returns its argument,
	// so the loop body is not entered.
	for r := unicode.SimpleFold(a); r != a; r = unicode.SimpleFold(r) {
		if r == b {
			return true
		}
	}

	return false
}
diff --git a/internal/querylog/searchcriterion_test.go b/internal/querylog/searchcriterion_test.go new file mode 100644 index 00000000..65ee6645 --- /dev/null +++ b/internal/querylog/searchcriterion_test.go @@ -0,0 +1,121 @@ +package querylog + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestContainsFold(t *testing.T) { + testCases := []struct { + name string + inS string + inSubstr string + want bool + }{{ + name: "empty", + inS: "", + inSubstr: "", + want: true, + }, { + name: "shorter", + inS: "a", + inSubstr: "abc", + want: false, + }, { + name: "same_len_true", + inS: "abc", + inSubstr: "abc", + want: true, + }, { + name: "same_len_true_fold", + inS: "abc", + inSubstr: "aBc", + want: true, + }, { + name: "same_len_false", + inS: "abc", + inSubstr: "def", + want: false, + }, { + name: "longer_true", + inS: "abcdedef", + inSubstr: "def", + want: true, + }, { + name: "longer_false", + inS: "abcded", + inSubstr: "ghi", + want: false, + }, { + name: "longer_true_fold", + inS: "abcdedef", + inSubstr: "dEf", + want: true, + }, { + name: "longer_false_fold", + inS: "abcded", + inSubstr: "gHi", + want: false, + }, { + name: "longer_true_cyr_fold", + inS: "абвгдедеё", + inSubstr: "дЕЁ", + want: true, + }, { + name: "longer_false_cyr_fold", + inS: "абвгдедеё", + inSubstr: "жЗИ", + want: false, + }, { + name: "no_letters_true", + inS: "1.2.3.4", + inSubstr: "2.3.4", + want: true, + }, { + name: "no_letters_false", + inS: "1.2.3.4", + inSubstr: "2.3.5", + want: false, + }} + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if tc.want { + assert.True(t, containsFold(tc.inS, tc.inSubstr)) + } else { + assert.False(t, containsFold(tc.inS, tc.inSubstr)) + } + }) + } +} + +var sink bool + +func BenchmarkContainsFold(b *testing.B) { + const s = "aaahBbBhccchDDDeEehFfFhGGGhHhh" + const substr = "HHH" + + // Compare our implementation of containsFold against a stupid solution + // of calling strings.ToLower and 
strings.Contains. + b.Run("containsfold", func(b *testing.B) { + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + sink = containsFold(s, substr) + } + + assert.True(b, sink) + }) + + b.Run("tolower_contains", func(b *testing.B) { + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + sink = strings.Contains(strings.ToLower(s), strings.ToLower(substr)) + } + + assert.True(b, sink) + }) +}