From 44d73ce932d24e59df29546635b4952990ae540d Mon Sep 17 00:00:00 2001 From: Andrew Dunham Date: Tue, 24 Jan 2023 14:31:20 -0500 Subject: [PATCH] ipn/ipnlocal, net/dnscache: allow configuring dnscache logging via capability This allows users to temporarily enable/disable dnscache logging via a new node capability, to aid in debugging strange connectivity issues. Signed-off-by: Andrew Dunham Change-Id: I46cf2596a8ae4c1913880a78d0033f8b668edc08 --- ipn/ipnlocal/local.go | 15 ++++++ net/dnscache/dnscache.go | 97 ++++++++++++++++++++++------------- net/dnscache/dnscache_test.go | 6 ++- tailcfg/tailcfg.go | 10 +++- 4 files changed, 89 insertions(+), 39 deletions(-) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index e87f2365f..a19f708b8 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -45,6 +45,7 @@ import ( "tailscale.com/ipn/ipnstate" "tailscale.com/ipn/policy" "tailscale.com/net/dns" + "tailscale.com/net/dnscache" "tailscale.com/net/dnsfallback" "tailscale.com/net/interfaces" "tailscale.com/net/netutil" @@ -3820,6 +3821,8 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { } b.capFileSharing = fs + b.setDebugLogsByCapabilityLocked(nm) + b.setTCPPortsInterceptedFromNetmapAndPrefsLocked(b.pm.CurrentPrefs()) if nm == nil { b.nodeByAddr = nil @@ -3855,6 +3858,18 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { } } +// setDebugLogsByCapabilityLocked sets debug logging based on the self node's +// capabilities in the provided NetMap. +func (b *LocalBackend) setDebugLogsByCapabilityLocked(nm *netmap.NetworkMap) { + // These are sufficiently cheap (atomic bools) that we don't need to + // store state and compare. + if hasCapability(nm, tailcfg.CapabilityDebugTSDNSResolution) { + dnscache.SetDebugLoggingEnabled(true) + } else { + dnscache.SetDebugLoggingEnabled(false) + } +} + func (b *LocalBackend) reloadServeConfigLocked(prefs ipn.PrefsView) { if b.netMap == nil || b.netMap.SelfNode == nil || !prefs.Valid() || b.pm.CurrentProfile().ID == "" { // We're not logged in, so we don't have a profile. diff --git a/net/dnscache/dnscache.go b/net/dnscache/dnscache.go index dc117928f..94d97d273 100644 --- a/net/dnscache/dnscache.go +++ b/net/dnscache/dnscache.go @@ -18,9 +18,11 @@ import ( "net/netip" "runtime" "sync" + "sync/atomic" "time" "tailscale.com/envknob" + "tailscale.com/types/logger" "tailscale.com/util/cloudenv" "tailscale.com/util/singleflight" ) @@ -84,6 +86,11 @@ type Resolver struct { // It is required when SingleHostStaticResult is present. SingleHost string + // Logf optionally provides a log function to use for debug logs. If + // not present, log.Printf will be used. The prefix "dnscache: " will + // be added to all log messages printed with this logger. + Logf logger.Logf + sf singleflight.Group[string, ipRes] mu sync.Mutex @@ -110,6 +117,20 @@ func (r *Resolver) fwd() *net.Resolver { return net.DefaultResolver } +// dlogf logs a debug message if debug logging is enabled either globally via +// the TS_DEBUG_DNS_CACHE environment variable or via the per-Resolver +// configuration. +func (r *Resolver) dlogf(format string, args ...any) { + logf := r.Logf + if logf == nil { + logf = log.Printf + } + + if debug() || debugLogging.Load() { + logf("dnscache: "+format, args...) + } +} + // cloudHostResolver returns a Resolver for the current cloud hosting environment. // It currently only supports Google Cloud. func (r *Resolver) cloudHostResolver() (v *net.Resolver, ok bool) { @@ -143,6 +164,27 @@ func (r *Resolver) ttl() time.Duration { var debug = envknob.RegisterBool("TS_DEBUG_DNS_CACHE") +// debugLogging allows enabling debug logging at runtime, via +// SetDebugLoggingEnabled. +// +// This is a global variable instead of a per-Resolver variable because we +// create new Resolvers throughout the lifetime of the program (e.g. on every +// new Direct client, etc.). When we enable debug logs, though, we want to do +// so for every single created Resolver; we'd need to plumb a bunch of new code +// through all of the intermediate packages to accomplish the same behaviour as +// just using a global variable. +var debugLogging atomic.Bool + +// SetDebugLoggingEnabled controls whether debug logging is enabled for this +// package. +// +// These logs are also printed when the TS_DEBUG_DNS_CACHE envknob is set, but +// we allow configuring this manually as well so that it can be changed at +// runtime. +func SetDebugLoggingEnabled(v bool) { + debugLogging.Store(v) +} + // LookupIP returns the host's primary IP address (either IPv4 or // IPv6, but preferring IPv4) and optionally its IPv6 address, if // there is both IPv4 and IPv6. @@ -163,20 +205,17 @@ func (r *Resolver) LookupIP(ctx context.Context, host string) (ip, v6 netip.Addr } allIPs = append(allIPs, naIP) } + r.dlogf("returning %d static results", len(allIPs)) return } if ip, err := netip.ParseAddr(host); err == nil { ip = ip.Unmap() - if debug() { - log.Printf("dnscache: %q is an IP", host) - } + r.dlogf("%q is an IP", host) return ip, zaddr, []netip.Addr{ip}, nil } if ip, ip6, allIPs, ok := r.lookupIPCache(host); ok { - if debug() { - log.Printf("dnscache: %q = %v (cached)", host, ip) - } + r.dlogf("%q = %v (cached)", host, ip) return ip, ip6, allIPs, nil } @@ -192,23 +231,17 @@ func (r *Resolver) LookupIP(ctx context.Context, host string) (ip, v6 netip.Addr if res.Err != nil { if r.UseLastGood { if ip, ip6, allIPs, ok := r.lookupIPCacheExpired(host); ok { - if debug() { - log.Printf("dnscache: %q using %v after error", host, ip) - } + r.dlogf("%q using %v after error", host, ip) return ip, ip6, allIPs, nil } } - if debug() { - log.Printf("dnscache: error resolving %q: %v", host, res.Err) - } + r.dlogf("error resolving %q: %v", host, res.Err) return zaddr, zaddr, nil, res.Err } r := res.Val return r.ip, r.ip6, r.allIPs, nil case <-ctx.Done(): - if debug() { - log.Printf("dnscache: context done while resolving %q: %v", host, ctx.Err()) - } + r.dlogf("context done while resolving %q: %v", host, ctx.Err()) return zaddr, zaddr, nil, ctx.Err() } } @@ -250,9 +283,7 @@ func (r *Resolver) lookupTimeoutForHost(host string) time.Duration { func (r *Resolver) lookupIP(host string) (ip, ip6 netip.Addr, allIPs []netip.Addr, err error) { if ip, ip6, allIPs, ok := r.lookupIPCache(host); ok { - if debug() { - log.Printf("dnscache: %q found in cache as %v", host, ip) - } + r.dlogf("%q found in cache as %v", host, ip) return ip, ip6, allIPs, nil } @@ -261,12 +292,18 @@ func (r *Resolver) lookupIP(host string) (ip, ip6 netip.Addr, allIPs []netip.Add ips, err := r.fwd().LookupNetIP(ctx, "ip", host) if err != nil || len(ips) == 0 { if resolver, ok := r.cloudHostResolver(); ok { + r.dlogf("resolving %q via cloud resolver", host) ips, err = resolver.LookupNetIP(ctx, "ip", host) } } if (err != nil || len(ips) == 0) && r.LookupIPFallback != nil { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() + if err != nil { + r.dlogf("resolving %q using fallback resolver due to error", host) + } else { + r.dlogf("resolving %q using fallback resolver due to no returned IPs", host) + } ips, err = r.LookupIPFallback(ctx, host) } if err != nil { @@ -305,15 +342,11 @@ func (r *Resolver) addIPCache(host string, ip, ip6 netip.Addr, allIPs []netip.Ad if ip.IsPrivate() { // Don't cache obviously wrong entries from captive portals. // TODO: use DoH or DoT for the forwarding resolver? - if debug() { - log.Printf("dnscache: %q resolved to private IP %v; using but not caching", host, ip) - } + r.dlogf("%q resolved to private IP %v; using but not caching", host, ip) return } - if debug() { - log.Printf("dnscache: %q resolved to IP %v; caching", host, ip) - } + r.dlogf("%q resolved to IP %v; caching", host, ip) r.mu.Lock() defer r.mu.Unlock() @@ -387,9 +420,7 @@ func (d *dialer) DialContext(ctx context.Context, network, address string) (retC } i4s := v4addrs(allIPs) if len(i4s) < 2 { - if debug() { - log.Printf("dnscache: dialing %s, %s for %s", network, ip, address) - } + d.dnsCache.dlogf("dialing %s, %s for %s", network, ip, address) c, err := dc.dialOne(ctx, ip.Unmap()) if err == nil || ctx.Err() != nil { return c, err @@ -411,26 +442,20 @@ func (d *dialer) shouldTryBootstrap(ctx context.Context, err error, dc *dialCall // Can't try bootstrap DNS if we don't have a fallback function if d.dnsCache.LookupIPFallback == nil { - if debug() { - log.Printf("dnscache: not using bootstrap DNS: no fallback") - } + d.dnsCache.dlogf("not using bootstrap DNS: no fallback") return false } // We can't retry if the context is canceled, since any further // operations with this context will fail. if ctxErr := ctx.Err(); ctxErr != nil { - if debug() { - log.Printf("dnscache: not using bootstrap DNS: context error: %v", ctxErr) - } + d.dnsCache.dlogf("not using bootstrap DNS: context error: %v", ctxErr) return false } wasTrustworthy := dc.dnsWasTrustworthy() if wasTrustworthy { - if debug() { - log.Printf("dnscache: not using bootstrap DNS: DNS was trustworthy") - } + d.dnsCache.dlogf("not using bootstrap DNS: DNS was trustworthy") return false } diff --git a/net/dnscache/dnscache_test.go b/net/dnscache/dnscache_test.go index c6348266b..f85142ff1 100644 --- a/net/dnscache/dnscache_test.go +++ b/net/dnscache/dnscache_test.go @@ -22,7 +22,7 @@ func TestDialer(t *testing.T) { if *dialTest == "" { t.Skip("skipping; --dial-test is blank") } - r := new(Resolver) + r := &Resolver{Logf: t.Logf} var std net.Dialer dialer := Dialer(std.DialContext, r) t0 := time.Now() @@ -113,6 +113,7 @@ func TestDialCall_uniqueIPs(t *testing.T) { func TestResolverAllHostStaticResult(t *testing.T) { r := &Resolver{ + Logf: t.Logf, SingleHost: "foo.bar", SingleHostStaticResult: []netip.Addr{ netip.MustParseAddr("2001:4860:4860::8888"), @@ -185,11 +186,12 @@ func TestShouldTryBootstrap(t *testing.T) { errFailed := errors.New("some failure") cacheWithFallback := &Resolver{ + Logf: t.Logf, LookupIPFallback: func(_ context.Context, _ string) ([]netip.Addr, error) { panic("unimplemented") }, } - cacheNoFallback := &Resolver{} + cacheNoFallback := &Resolver{Logf: t.Logf} testCases := []struct { name string diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 1848100b0..6c39a5b9a 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -93,7 +93,8 @@ type CapabilityVersion int // - 53: 2023-01-18: client respects explicit Node.Expired + auto-sets based on Node.KeyExpiry // - 54: 2023-01-19: Node.Cap added, PeersChangedPatch.Cap, uses Node.Cap for ExitDNS before Hostinfo.Services fallback // - 55: 2023-01-23: start of c2n GET+POST /update handler -const CurrentCapabilityVersion CapabilityVersion = 55 +// - 56: 2023-01-24: Client understands CapabilityDebugTSDNSResolution +const CurrentCapabilityVersion CapabilityVersion = 56 type StableID string @@ -1754,6 +1755,13 @@ const ( // CapabilityWarnFunnelNoHTTPS indicates HTTPS has not been enabled for the tailnet. CapabilityWarnFunnelNoHTTPS = "https://tailscale.com/cap/warn-funnel-no-https" + + // Debug logging capabilities + + // CapabilityDebugTSDNSResolution enables verbose debug logging for DNS + // resolution for Tailscale-controlled domains (the control server, log + // server, DERP servers, etc.) + CapabilityDebugTSDNSResolution = "https://tailscale.com/cap/debug-ts-dns-resolution" ) const (