From ebc552d2e0650889eb349237b5d17f5ed657a6dd Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Thu, 25 Apr 2024 13:24:49 -0700 Subject: [PATCH] health: add Tracker type, in prep for removing global variables This moves most of the health package global variables to a new `health.Tracker` type. But then rather than plumbing the Tracker in tsd.System everywhere, this only goes halfway and makes one new global Tracker (`health.Global`) that all the existing callers now use. A future change will eliminate that global. Updates #11874 Updates #4136 Change-Id: I6ee27e0b2e35f68cb38fecdb3b2dc4c3f2e09d68 Signed-off-by: Brad Fitzpatrick --- control/controlclient/auto.go | 12 +- control/controlclient/direct.go | 8 +- health/health.go | 392 ++++++++++++++------------- health/health_test.go | 14 +- ipn/ipnlocal/local.go | 22 +- ipn/ipnlocal/network-lock.go | 10 +- ipn/localapi/localapi.go | 2 +- net/dns/direct_linux.go | 2 +- net/dns/manager.go | 6 +- net/dns/manager_linux.go | 2 +- net/dns/resolved.go | 4 +- net/tlsdial/tlsdial.go | 4 +- wgengine/magicsock/derp.go | 22 +- wgengine/magicsock/magicsock.go | 6 +- wgengine/magicsock/magicsock_test.go | 10 +- wgengine/router/ifconfig_windows.go | 2 +- wgengine/userspace.go | 6 +- 17 files changed, 268 insertions(+), 256 deletions(-) diff --git a/control/controlclient/auto.go b/control/controlclient/auto.go index d0551cdab..0cc236e7e 100644 --- a/control/controlclient/auto.go +++ b/control/controlclient/auto.go @@ -195,7 +195,7 @@ func NewNoStart(opts Options) (_ *Auto, err error) { c.mapCtx, c.mapCancel = context.WithCancel(context.Background()) c.mapCtx = sockstats.WithSockStats(c.mapCtx, sockstats.LabelControlClientAuto, opts.Logf) - c.unregisterHealthWatch = health.RegisterWatcher(direct.ReportHealthChange) + c.unregisterHealthWatch = health.Global.RegisterWatcher(direct.ReportHealthChange) return c, nil } @@ -316,7 +316,7 @@ func (c *Auto) authRoutine() { } if goal == nil { - health.SetAuthRoutineInError(nil) + health.Global.SetAuthRoutineInError(nil) // Wait for user to Login or Logout. <-ctx.Done() c.logf("[v1] authRoutine: context done.") @@ -343,7 +343,7 @@ func (c *Auto) authRoutine() { f = "TryLogin" } if err != nil { - health.SetAuthRoutineInError(err) + health.Global.SetAuthRoutineInError(err) report(err, f) bo.BackOff(ctx, err) continue @@ -373,7 +373,7 @@ func (c *Auto) authRoutine() { } // success - health.SetAuthRoutineInError(nil) + health.Global.SetAuthRoutineInError(nil) c.mu.Lock() c.urlToVisit = "" c.loggedIn = true @@ -503,11 +503,11 @@ func (c *Auto) mapRoutine() { c.logf("[v1] mapRoutine: context done.") continue } - health.SetOutOfPollNetMap() + health.Global.SetOutOfPollNetMap() err := c.direct.PollNetMap(ctx, mrs) - health.SetOutOfPollNetMap() + health.Global.SetOutOfPollNetMap() c.mu.Lock() c.inMapPoll = false if c.state == StateSynchronized { diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index c727d196d..ee7c79a2d 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -894,10 +894,10 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap ipForwardingBroken(hi.RoutableIPs, c.netMon.InterfaceState()) { extraDebugFlags = append(extraDebugFlags, "warn-ip-forwarding-off") } - if health.RouterHealth() != nil { + if health.Global.RouterHealth() != nil { extraDebugFlags = append(extraDebugFlags, "warn-router-unhealthy") } - extraDebugFlags = health.AppendWarnableDebugFlags(extraDebugFlags) + extraDebugFlags = health.Global.AppendWarnableDebugFlags(extraDebugFlags) if hostinfo.DisabledEtcAptSource() { extraDebugFlags = append(extraDebugFlags, "warn-etc-apt-source-disabled") } @@ -970,7 +970,7 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap } defer res.Body.Close() - health.NoteMapRequestHeard(request) + health.Global.NoteMapRequestHeard(request) watchdogTimer.Reset(watchdogTimeout) if nu == nil { @@ -1041,7 +1041,7 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap metricMapResponseMessages.Add(1) if isStreaming { - health.GotStreamedMapResponse() + health.Global.GotStreamedMapResponse() } if pr := resp.PingRequest; pr != nil && c.isUniquePingRequest(pr) { diff --git a/health/health.go b/health/health.go index c1ca46bc6..861305342 100644 --- a/health/health.go +++ b/health/health.go @@ -17,40 +17,51 @@ import ( "tailscale.com/envknob" "tailscale.com/tailcfg" + "tailscale.com/types/opt" + "tailscale.com/util/mak" "tailscale.com/util/multierr" "tailscale.com/util/set" ) var ( + mu sync.Mutex + debugHandler map[string]http.Handler +) + +// Global is a global health tracker for the process. +// +// TODO(bradfitz): move this to tsd.System so a process can have multiple +// tsnet/etc instances with their own health trackers. +var Global = new(Tracker) + +type Tracker struct { // mu guards everything in this var block. mu sync.Mutex - sysErr = map[Subsystem]error{} // error key => err (or nil for no error) - watchers = set.HandleSet[func(Subsystem, error)]{} // opt func to run if error state changes - warnables = set.Set[*Warnable]{} + sysErr map[Subsystem]error // subsystem => err (or nil for no error) + watchers set.HandleSet[func(Subsystem, error)] // opt func to run if error state changes + warnables set.Set[*Warnable] timer *time.Timer - debugHandler = map[string]http.Handler{} - inMapPoll bool inMapPollSince time.Time lastMapPollEndedAt time.Time lastStreamedMapResponse time.Time derpHomeRegion int derpHomeless bool - derpRegionConnected = map[int]bool{} - derpRegionHealthProblem = map[int]string{} - derpRegionLastFrame = map[int]time.Time{} + derpRegionConnected map[int]bool + derpRegionHealthProblem map[int]string + derpRegionLastFrame map[int]time.Time lastMapRequestHeard time.Time // time we got a 200 from control for a MapRequest ipnState string ipnWantRunning bool - anyInterfaceUp = true // until told otherwise + anyInterfaceUp opt.Bool // empty means unknown (assume true) udp4Unbound bool controlHealth []string lastLoginErr error localLogConfigErr error - tlsConnectionErrors = map[string]error{} // map[ServerName]error -) + tlsConnectionErrors map[string]error // map[ServerName]error +} // Subsystem is the name of a subsystem whose health can be monitored. type Subsystem string @@ -78,14 +89,17 @@ const ( // NewWarnable returns a new warnable item that the caller can mark // as health or in warning state. -func NewWarnable(opts ...WarnableOpt) *Warnable { +func (t *Tracker) NewWarnable(opts ...WarnableOpt) *Warnable { w := new(Warnable) for _, o := range opts { o.mod(w) } - mu.Lock() - defer mu.Unlock() - warnables.Add(w) + t.mu.Lock() + defer t.mu.Unlock() + if t.warnables == nil { + t.warnables = set.Set[*Warnable]{} + } + t.warnables.Add(w) return w } @@ -151,12 +165,12 @@ func (w *Warnable) get() error { // AppendWarnableDebugFlags appends to base any health items that are currently in failed // state and were created with MapDebugFlag. -func AppendWarnableDebugFlags(base []string) []string { +func (t *Tracker) AppendWarnableDebugFlags(base []string) []string { ret := base - mu.Lock() - defer mu.Unlock() - for w := range warnables { + t.mu.Lock() + defer t.mu.Unlock() + for w := range t.warnables { if w.debugFlag == "" { continue } @@ -172,75 +186,78 @@ func AppendWarnableDebugFlags(base []string) []string { // error changes state either to unhealthy or from unhealthy. It is // not called on transition from unknown to healthy. It must be non-nil // and is run in its own goroutine. The returned func unregisters it. -func RegisterWatcher(cb func(key Subsystem, err error)) (unregister func()) { - mu.Lock() - defer mu.Unlock() - handle := watchers.Add(cb) - if timer == nil { - timer = time.AfterFunc(time.Minute, timerSelfCheck) +func (t *Tracker) RegisterWatcher(cb func(key Subsystem, err error)) (unregister func()) { + t.mu.Lock() + defer t.mu.Unlock() + if t.watchers == nil { + t.watchers = set.HandleSet[func(Subsystem, error)]{} + } + handle := t.watchers.Add(cb) + if t.timer == nil { + t.timer = time.AfterFunc(time.Minute, t.timerSelfCheck) } return func() { - mu.Lock() - defer mu.Unlock() - delete(watchers, handle) - if len(watchers) == 0 && timer != nil { - timer.Stop() - timer = nil + t.mu.Lock() + defer t.mu.Unlock() + delete(t.watchers, handle) + if len(t.watchers) == 0 && t.timer != nil { + t.timer.Stop() + t.timer = nil } } } // SetRouterHealth sets the state of the wgengine/router.Router. -func SetRouterHealth(err error) { setErr(SysRouter, err) } +func (t *Tracker) SetRouterHealth(err error) { t.setErr(SysRouter, err) } // RouterHealth returns the wgengine/router.Router error state. -func RouterHealth() error { return get(SysRouter) } +func (t *Tracker) RouterHealth() error { return t.get(SysRouter) } // SetDNSHealth sets the state of the net/dns.Manager -func SetDNSHealth(err error) { setErr(SysDNS, err) } +func (t *Tracker) SetDNSHealth(err error) { t.setErr(SysDNS, err) } // DNSHealth returns the net/dns.Manager error state. -func DNSHealth() error { return get(SysDNS) } +func (t *Tracker) DNSHealth() error { return t.get(SysDNS) } // SetDNSOSHealth sets the state of the net/dns.OSConfigurator -func SetDNSOSHealth(err error) { setErr(SysDNSOS, err) } +func (t *Tracker) SetDNSOSHealth(err error) { t.setErr(SysDNSOS, err) } // SetDNSManagerHealth sets the state of the Linux net/dns manager's // discovery of the /etc/resolv.conf situation. -func SetDNSManagerHealth(err error) { setErr(SysDNSManager, err) } +func (t *Tracker) SetDNSManagerHealth(err error) { t.setErr(SysDNSManager, err) } // DNSOSHealth returns the net/dns.OSConfigurator error state. -func DNSOSHealth() error { return get(SysDNSOS) } +func (t *Tracker) DNSOSHealth() error { return t.get(SysDNSOS) } // SetTKAHealth sets the health of the tailnet key authority. -func SetTKAHealth(err error) { setErr(SysTKA, err) } +func (t *Tracker) SetTKAHealth(err error) { t.setErr(SysTKA, err) } // TKAHealth returns the tailnet key authority error state. -func TKAHealth() error { return get(SysTKA) } +func (t *Tracker) TKAHealth() error { return t.get(SysTKA) } // SetLocalLogConfigHealth sets the error state of this client's local log configuration. -func SetLocalLogConfigHealth(err error) { - mu.Lock() - defer mu.Unlock() - localLogConfigErr = err +func (t *Tracker) SetLocalLogConfigHealth(err error) { + t.mu.Lock() + defer t.mu.Unlock() + t.localLogConfigErr = err } // SetTLSConnectionError sets the error state for connections to a specific // host. Setting the error to nil will clear any previously-set error. -func SetTLSConnectionError(host string, err error) { - mu.Lock() - defer mu.Unlock() +func (t *Tracker) SetTLSConnectionError(host string, err error) { + t.mu.Lock() + defer t.mu.Unlock() if err == nil { - delete(tlsConnectionErrors, host) + delete(t.tlsConnectionErrors, host) } else { - tlsConnectionErrors[host] = err + mak.Set(&t.tlsConnectionErrors, host, err) } } func RegisterDebugHandler(typ string, h http.Handler) { mu.Lock() defer mu.Unlock() - debugHandler[typ] = h + mak.Set(&debugHandler, typ, h) } func DebugHandler(typ string) http.Handler { @@ -249,24 +266,27 @@ func DebugHandler(typ string) http.Handler { return debugHandler[typ] } -func get(key Subsystem) error { - mu.Lock() - defer mu.Unlock() - return sysErr[key] +func (t *Tracker) get(key Subsystem) error { + t.mu.Lock() + defer t.mu.Unlock() + return t.sysErr[key] } -func setErr(key Subsystem, err error) { - mu.Lock() - defer mu.Unlock() - setLocked(key, err) +func (t *Tracker) setErr(key Subsystem, err error) { + t.mu.Lock() + defer t.mu.Unlock() + t.setLocked(key, err) } -func setLocked(key Subsystem, err error) { - old, ok := sysErr[key] +func (t *Tracker) setLocked(key Subsystem, err error) { + if t.sysErr == nil { + t.sysErr = map[Subsystem]error{} + } + old, ok := t.sysErr[key] if !ok && err == nil { // Initial happy path. - sysErr[key] = nil - selfCheckLocked() + t.sysErr[key] = nil + t.selfCheckLocked() return } if ok && (old == nil) == (err == nil) { @@ -274,22 +294,22 @@ func setLocked(key Subsystem, err error) { // don't run callbacks, but exact error might've // changed, so note it. if err != nil { - sysErr[key] = err + t.sysErr[key] = err } return } - sysErr[key] = err - selfCheckLocked() - for _, cb := range watchers { + t.sysErr[key] = err + t.selfCheckLocked() + for _, cb := range t.watchers { go cb(key, err) } } -func SetControlHealth(problems []string) { - mu.Lock() - defer mu.Unlock() - controlHealth = problems - selfCheckLocked() +func (t *Tracker) SetControlHealth(problems []string) { + t.mu.Lock() + defer t.mu.Unlock() + t.controlHealth = problems + t.selfCheckLocked() } // GotStreamedMapResponse notes that we got a tailcfg.MapResponse @@ -297,161 +317,161 @@ func SetControlHealth(problems []string) { // // This also notes that a map poll is in progress. To unset that, call // SetOutOfPollNetMap(). -func GotStreamedMapResponse() { - mu.Lock() - defer mu.Unlock() - lastStreamedMapResponse = time.Now() - if !inMapPoll { - inMapPoll = true - inMapPollSince = time.Now() +func (t *Tracker) GotStreamedMapResponse() { + t.mu.Lock() + defer t.mu.Unlock() + t.lastStreamedMapResponse = time.Now() + if !t.inMapPoll { + t.inMapPoll = true + t.inMapPollSince = time.Now() } - selfCheckLocked() + t.selfCheckLocked() } // SetOutOfPollNetMap records that the client is no longer in // an HTTP map request long poll to the control plane. -func SetOutOfPollNetMap() { - mu.Lock() - defer mu.Unlock() - if !inMapPoll { +func (t *Tracker) SetOutOfPollNetMap() { + t.mu.Lock() + defer t.mu.Unlock() + if !t.inMapPoll { return } - inMapPoll = false - lastMapPollEndedAt = time.Now() - selfCheckLocked() + t.inMapPoll = false + t.lastMapPollEndedAt = time.Now() + t.selfCheckLocked() } // GetInPollNetMap reports whether the client has an open // HTTP long poll open to the control plane. -func GetInPollNetMap() bool { - mu.Lock() - defer mu.Unlock() - return inMapPoll +func (t *Tracker) GetInPollNetMap() bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.inMapPoll } // SetMagicSockDERPHome notes what magicsock's view of its home DERP is. // // The homeless parameter is whether magicsock is running in DERP-disconnected // mode, without discovering and maintaining a connection to its home DERP. -func SetMagicSockDERPHome(region int, homeless bool) { - mu.Lock() - defer mu.Unlock() - derpHomeRegion = region - derpHomeless = homeless - selfCheckLocked() +func (t *Tracker) SetMagicSockDERPHome(region int, homeless bool) { + t.mu.Lock() + defer t.mu.Unlock() + t.derpHomeRegion = region + t.derpHomeless = homeless + t.selfCheckLocked() } // NoteMapRequestHeard notes whenever we successfully sent a map request // to control for which we received a 200 response. -func NoteMapRequestHeard(mr *tailcfg.MapRequest) { - mu.Lock() - defer mu.Unlock() +func (t *Tracker) NoteMapRequestHeard(mr *tailcfg.MapRequest) { + t.mu.Lock() + defer t.mu.Unlock() // TODO: extract mr.HostInfo.NetInfo.PreferredDERP, compare // against SetMagicSockDERPHome and // SetDERPRegionConnectedState - lastMapRequestHeard = time.Now() - selfCheckLocked() + t.lastMapRequestHeard = time.Now() + t.selfCheckLocked() } -func SetDERPRegionConnectedState(region int, connected bool) { - mu.Lock() - defer mu.Unlock() - derpRegionConnected[region] = connected - selfCheckLocked() +func (t *Tracker) SetDERPRegionConnectedState(region int, connected bool) { + t.mu.Lock() + defer t.mu.Unlock() + mak.Set(&t.derpRegionConnected, region, connected) + t.selfCheckLocked() } // SetDERPRegionHealth sets or clears any problem associated with the // provided DERP region. -func SetDERPRegionHealth(region int, problem string) { - mu.Lock() - defer mu.Unlock() +func (t *Tracker) SetDERPRegionHealth(region int, problem string) { + t.mu.Lock() + defer t.mu.Unlock() if problem == "" { - delete(derpRegionHealthProblem, region) + delete(t.derpRegionHealthProblem, region) } else { - derpRegionHealthProblem[region] = problem + mak.Set(&t.derpRegionHealthProblem, region, problem) } - selfCheckLocked() + t.selfCheckLocked() } // NoteDERPRegionReceivedFrame is called to note that a frame was received from // the given DERP region at the current time. -func NoteDERPRegionReceivedFrame(region int) { - mu.Lock() - defer mu.Unlock() - derpRegionLastFrame[region] = time.Now() - selfCheckLocked() +func (t *Tracker) NoteDERPRegionReceivedFrame(region int) { + t.mu.Lock() + defer t.mu.Unlock() + mak.Set(&t.derpRegionLastFrame, region, time.Now()) + t.selfCheckLocked() } // GetDERPRegionReceivedTime returns the last time that a frame was received // from the given DERP region, or the zero time if no communication with that // region has occurred. -func GetDERPRegionReceivedTime(region int) time.Time { - mu.Lock() - defer mu.Unlock() - return derpRegionLastFrame[region] +func (t *Tracker) GetDERPRegionReceivedTime(region int) time.Time { + t.mu.Lock() + defer t.mu.Unlock() + return t.derpRegionLastFrame[region] } // state is an ipn.State.String() value: "Running", "Stopped", "NeedsLogin", etc. -func SetIPNState(state string, wantRunning bool) { - mu.Lock() - defer mu.Unlock() - ipnState = state - ipnWantRunning = wantRunning - selfCheckLocked() +func (t *Tracker) SetIPNState(state string, wantRunning bool) { + t.mu.Lock() + defer t.mu.Unlock() + t.ipnState = state + t.ipnWantRunning = wantRunning + t.selfCheckLocked() } // SetAnyInterfaceUp sets whether any network interface is up. -func SetAnyInterfaceUp(up bool) { - mu.Lock() - defer mu.Unlock() - anyInterfaceUp = up - selfCheckLocked() +func (t *Tracker) SetAnyInterfaceUp(up bool) { + t.mu.Lock() + defer t.mu.Unlock() + t.anyInterfaceUp.Set(up) + t.selfCheckLocked() } // SetUDP4Unbound sets whether the udp4 bind failed completely. -func SetUDP4Unbound(unbound bool) { - mu.Lock() - defer mu.Unlock() - udp4Unbound = unbound - selfCheckLocked() +func (t *Tracker) SetUDP4Unbound(unbound bool) { + t.mu.Lock() + defer t.mu.Unlock() + t.udp4Unbound = unbound + t.selfCheckLocked() } // SetAuthRoutineInError records the latest error encountered as a result of a // login attempt. Providing a nil error indicates successful login, or that // being logged in w/coordination is not currently desired. -func SetAuthRoutineInError(err error) { - mu.Lock() - defer mu.Unlock() - lastLoginErr = err +func (t *Tracker) SetAuthRoutineInError(err error) { + t.mu.Lock() + defer t.mu.Unlock() + t.lastLoginErr = err } -func timerSelfCheck() { - mu.Lock() - defer mu.Unlock() +func (t *Tracker) timerSelfCheck() { + t.mu.Lock() + defer t.mu.Unlock() checkReceiveFuncs() - selfCheckLocked() - if timer != nil { - timer.Reset(time.Minute) + t.selfCheckLocked() + if t.timer != nil { + t.timer.Reset(time.Minute) } } -func selfCheckLocked() { - if ipnState == "" { +func (t *Tracker) selfCheckLocked() { + if t.ipnState == "" { // Don't check yet. return } - setLocked(SysOverall, overallErrorLocked()) + t.setLocked(SysOverall, t.overallErrorLocked()) } // OverallError returns a summary of the health state. // // If there are multiple problems, the error will be of type // multierr.Error. -func OverallError() error { - mu.Lock() - defer mu.Unlock() - return overallErrorLocked() +func (t *Tracker) OverallError() error { + t.mu.Lock() + defer t.mu.Unlock() + return t.overallErrorLocked() } var fakeErrForTesting = envknob.RegisterString("TS_DEBUG_FAKE_HEALTH_ERROR") @@ -459,11 +479,11 @@ var fakeErrForTesting = envknob.RegisterString("TS_DEBUG_FAKE_HEALTH_ERROR") // networkErrorf creates an error that indicates issues with outgoing network // connectivity. Any active warnings related to network connectivity will // automatically be appended to it. -func networkErrorf(format string, a ...any) error { +func (t *Tracker) networkErrorf(format string, a ...any) error { errs := []error{ fmt.Errorf(format, a...), } - for w := range warnables { + for w := range t.warnables { if !w.hasConnectivityImpact { continue } @@ -477,53 +497,53 @@ func networkErrorf(format string, a ...any) error { return multierr.New(errs...) } -var errNetworkDown = networkErrorf("network down") -var errNotInMapPoll = networkErrorf("not in map poll") +var errNetworkDown = errors.New("network down") +var errNotInMapPoll = errors.New("not in map poll") var errNoDERPHome = errors.New("no DERP home") -var errNoUDP4Bind = networkErrorf("no udp4 bind") +var errNoUDP4Bind = errors.New("no udp4 bind") -func overallErrorLocked() error { - if !anyInterfaceUp { +func (t *Tracker) overallErrorLocked() error { + if v, ok := t.anyInterfaceUp.Get(); ok && !v { return errNetworkDown } - if localLogConfigErr != nil { - return localLogConfigErr + if t.localLogConfigErr != nil { + return t.localLogConfigErr } - if !ipnWantRunning { - return fmt.Errorf("state=%v, wantRunning=%v", ipnState, ipnWantRunning) + if !t.ipnWantRunning { + return fmt.Errorf("state=%v, wantRunning=%v", t.ipnState, t.ipnWantRunning) } - if lastLoginErr != nil { - return fmt.Errorf("not logged in, last login error=%v", lastLoginErr) + if t.lastLoginErr != nil { + return fmt.Errorf("not logged in, last login error=%v", t.lastLoginErr) } now := time.Now() - if !inMapPoll && (lastMapPollEndedAt.IsZero() || now.Sub(lastMapPollEndedAt) > 10*time.Second) { + if !t.inMapPoll && (t.lastMapPollEndedAt.IsZero() || now.Sub(t.lastMapPollEndedAt) > 10*time.Second) { return errNotInMapPoll } const tooIdle = 2*time.Minute + 5*time.Second - if d := now.Sub(lastStreamedMapResponse).Round(time.Second); d > tooIdle { - return networkErrorf("no map response in %v", d) + if d := now.Sub(t.lastStreamedMapResponse).Round(time.Second); d > tooIdle { + return t.networkErrorf("no map response in %v", d) } - if !derpHomeless { - rid := derpHomeRegion + if !t.derpHomeless { + rid := t.derpHomeRegion if rid == 0 { return errNoDERPHome } - if !derpRegionConnected[rid] { - return networkErrorf("not connected to home DERP region %v", rid) + if !t.derpRegionConnected[rid] { + return t.networkErrorf("not connected to home DERP region %v", rid) } - if d := now.Sub(derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle { - return networkErrorf("haven't heard from home DERP region %v in %v", rid, d) + if d := now.Sub(t.derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle { + return t.networkErrorf("haven't heard from home DERP region %v in %v", rid, d) } } - if udp4Unbound { + if t.udp4Unbound { return errNoUDP4Bind } // TODO: use - _ = inMapPollSince - _ = lastMapPollEndedAt - _ = lastStreamedMapResponse - _ = lastMapRequestHeard + _ = t.inMapPollSince + _ = t.lastMapPollEndedAt + _ = t.lastStreamedMapResponse + _ = t.lastMapRequestHeard var errs []error for _, recv := range receiveFuncs { @@ -531,27 +551,27 @@ func overallErrorLocked() error { errs = append(errs, fmt.Errorf("%s is not running", recv.name)) } } - for sys, err := range sysErr { + for sys, err := range t.sysErr { if err == nil || sys == SysOverall { continue } errs = append(errs, fmt.Errorf("%v: %w", sys, err)) } - for w := range warnables { + for w := range t.warnables { if err := w.get(); err != nil { errs = append(errs, err) } } - for regionID, problem := range derpRegionHealthProblem { + for regionID, problem := range t.derpRegionHealthProblem { errs = append(errs, fmt.Errorf("derp%d: %v", regionID, problem)) } - for _, s := range controlHealth { + for _, s := range t.controlHealth { errs = append(errs, errors.New(s)) } if err := envknob.ApplyDiskConfigError(); err != nil { errs = append(errs, err) } - for serverName, err := range tlsConnectionErrors { + for serverName, err := range t.tlsConnectionErrors { errs = append(errs, fmt.Errorf("TLS connection error for %q: %w", serverName, err)) } if e := fakeErrForTesting(); len(errs) == 0 && e != "" { diff --git a/health/health_test.go b/health/health_test.go index 552cec495..e8d69eee1 100644 --- a/health/health_test.go +++ b/health/health_test.go @@ -8,15 +8,13 @@ import ( "fmt" "reflect" "testing" - - "tailscale.com/util/set" ) func TestAppendWarnableDebugFlags(t *testing.T) { - resetWarnables() + var tr Tracker for i := range 10 { - w := NewWarnable(WithMapDebugFlag(fmt.Sprint(i))) + w := tr.NewWarnable(WithMapDebugFlag(fmt.Sprint(i))) if i%2 == 0 { w.Set(errors.New("boom")) } @@ -27,15 +25,9 @@ func TestAppendWarnableDebugFlags(t *testing.T) { var got []string for range 20 { got = append(got[:0], "z", "y") - got = AppendWarnableDebugFlags(got) + got = tr.AppendWarnableDebugFlags(got) if !reflect.DeepEqual(got, want) { t.Fatalf("AppendWarnableDebugFlags = %q; want %q", got, want) } } } - -func resetWarnables() { - mu.Lock() - defer mu.Unlock() - warnables = set.Set[*Warnable]{} -} diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 6a6036031..dd3118acc 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -426,7 +426,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo b.linkChange(&netmon.ChangeDelta{New: netMon.InterfaceState()}) b.unregisterNetMon = netMon.RegisterChangeCallback(b.linkChange) - b.unregisterHealthWatch = health.RegisterWatcher(b.onHealthChange) + b.unregisterHealthWatch = health.Global.RegisterWatcher(b.onHealthChange) if tunWrap, ok := b.sys.Tun.GetOK(); ok { tunWrap.PeerAPIPort = b.GetPeerAPIPort @@ -761,7 +761,7 @@ func (b *LocalBackend) UpdateStatus(sb *ipnstate.StatusBuilder) { } } } - if err := health.OverallError(); err != nil { + if err := health.Global.OverallError(); err != nil { switch e := err.(type) { case multierr.Error: for _, err := range e.Errors() { @@ -820,7 +820,7 @@ func (b *LocalBackend) UpdateStatus(sb *ipnstate.StatusBuilder) { sb.MutateSelfStatus(func(ss *ipnstate.PeerStatus) { ss.OS = version.OS() - ss.Online = health.GetInPollNetMap() + ss.Online = health.Global.GetInPollNetMap() if b.netMap != nil { ss.InNetworkMap = true if hi := b.netMap.SelfNode.Hostinfo(); hi.Valid() { @@ -1221,7 +1221,7 @@ func (b *LocalBackend) SetControlClientStatus(c controlclient.Client, st control if st.NetMap != nil { if envknob.NoLogsNoSupport() && st.NetMap.HasCap(tailcfg.CapabilityDataPlaneAuditLogs) { msg := "tailnet requires logging to be enabled. Remove --no-logs-no-support from tailscaled command line." - health.SetLocalLogConfigHealth(errors.New(msg)) + health.Global.SetLocalLogConfigHealth(errors.New(msg)) // Connecting to this tailnet without logging is forbidden; boot us outta here. b.mu.Lock() prefs.WantRunning = false @@ -1818,7 +1818,7 @@ func (b *LocalBackend) Start(opts ipn.Options) error { return nil } -var warnInvalidUnsignedNodes = health.NewWarnable() +var warnInvalidUnsignedNodes = health.Global.NewWarnable() // updateFilterLocked updates the packet filter in wgengine based on the // given netMap and user preferences. @@ -3044,7 +3044,7 @@ func (b *LocalBackend) isDefaultServerLocked() bool { return prefs.ControlURLOrDefault() == ipn.DefaultControlURL } -var warnExitNodeUsage = health.NewWarnable(health.WithConnectivityImpact()) +var warnExitNodeUsage = health.Global.NewWarnable(health.WithConnectivityImpact()) // updateExitNodeUsageWarning updates a warnable meant to notify users of // configuration issues that could break exit node usage. @@ -4254,7 +4254,7 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlock // prefs may change irrespective of state; WantRunning should be explicitly // set before potential early return even if the state is unchanged. - health.SetIPNState(newState.String(), prefs.Valid() && prefs.WantRunning()) + health.Global.SetIPNState(newState.String(), prefs.Valid() && prefs.WantRunning()) if oldState == newState { return } @@ -4692,9 +4692,9 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { b.pauseOrResumeControlClientLocked() if nm != nil { - health.SetControlHealth(nm.ControlHealth) + health.Global.SetControlHealth(nm.ControlHealth) } else { - health.SetControlHealth(nil) + health.Global.SetControlHealth(nil) } // Determine if file sharing is enabled @@ -5675,7 +5675,7 @@ func (b *LocalBackend) sshServerOrInit() (_ SSHServer, err error) { return b.sshServer, nil } -var warnSSHSELinux = health.NewWarnable() +var warnSSHSELinux = health.Global.NewWarnable() func (b *LocalBackend) updateSELinuxHealthWarning() { if hostinfo.IsSELinuxEnforcing() { @@ -5908,7 +5908,7 @@ func (b *LocalBackend) resetForProfileChangeLockedOnEntry(unlock unlockOnce) err b.lastServeConfJSON = mem.B(nil) b.serveConfig = ipn.ServeConfigView{} b.enterStateLockedOnEntry(ipn.NoState, unlock) // Reset state; releases b.mu - health.SetLocalLogConfigHealth(nil) + health.Global.SetLocalLogConfigHealth(nil) return b.Start(ipn.Options{}) } diff --git a/ipn/ipnlocal/network-lock.go b/ipn/ipnlocal/network-lock.go index 2f5d5b09c..17d9eafd0 100644 --- a/ipn/ipnlocal/network-lock.go +++ b/ipn/ipnlocal/network-lock.go @@ -59,11 +59,11 @@ type tkaState struct { // b.mu must be held. func (b *LocalBackend) tkaFilterNetmapLocked(nm *netmap.NetworkMap) { if b.tka == nil && !b.capTailnetLock { - health.SetTKAHealth(nil) + health.Global.SetTKAHealth(nil) return } if b.tka == nil { - health.SetTKAHealth(nil) + health.Global.SetTKAHealth(nil) return // TKA not enabled. } @@ -117,9 +117,9 @@ func (b *LocalBackend) tkaFilterNetmapLocked(nm *netmap.NetworkMap) { // Check that we ourselves are not locked out, report a health issue if so. if nm.SelfNode.Valid() && b.tka.authority.NodeKeyAuthorized(nm.SelfNode.Key(), nm.SelfNode.KeySignature().AsSlice()) != nil { - health.SetTKAHealth(errors.New(healthmsg.LockedOut)) + health.Global.SetTKAHealth(errors.New(healthmsg.LockedOut)) } else { - health.SetTKAHealth(nil) + health.Global.SetTKAHealth(nil) } } @@ -188,7 +188,7 @@ func (b *LocalBackend) tkaSyncIfNeeded(nm *netmap.NetworkMap, prefs ipn.PrefsVie b.logf("Disablement failed, leaving TKA enabled. Error: %v", err) } else { isEnabled = false - health.SetTKAHealth(nil) + health.Global.SetTKAHealth(nil) } } else { return fmt.Errorf("[bug] unreachable invariant of wantEnabled w/ isEnabled") diff --git a/ipn/localapi/localapi.go b/ipn/localapi/localapi.go index 7f70dadee..7c8b97552 100644 --- a/ipn/localapi/localapi.go +++ b/ipn/localapi/localapi.go @@ -358,7 +358,7 @@ func (h *Handler) serveBugReport(w http.ResponseWriter, r *http.Request) { } hi, _ := json.Marshal(hostinfo.New()) h.logf("user bugreport hostinfo: %s", hi) - if err := health.OverallError(); err != nil { + if err := health.Global.OverallError(); err != nil { h.logf("user bugreport health: %s", err.Error()) } else { h.logf("user bugreport health: ok") diff --git a/net/dns/direct_linux.go b/net/dns/direct_linux.go index ad5a2163a..a2556b720 100644 --- a/net/dns/direct_linux.go +++ b/net/dns/direct_linux.go @@ -58,7 +58,7 @@ func (m *directManager) runFileWatcher() { } } -var warnTrample = health.NewWarnable() +var warnTrample = health.Global.NewWarnable() // checkForFileTrample checks whether /etc/resolv.conf has been trampled // by another program on the system. (e.g. a DHCP client) diff --git a/net/dns/manager.go b/net/dns/manager.go index 32b372b4f..bb9fce611 100644 --- a/net/dns/manager.go +++ b/net/dns/manager.go @@ -94,10 +94,10 @@ func (m *Manager) Set(cfg Config) error { return err } if err := m.os.SetDNS(ocfg); err != nil { - health.SetDNSOSHealth(err) + health.Global.SetDNSOSHealth(err) return err } - health.SetDNSOSHealth(nil) + health.Global.SetDNSOSHealth(nil) return nil } @@ -248,7 +248,7 @@ func (m *Manager) compileConfig(cfg Config) (rcfg resolver.Config, ocfg OSConfig // This is currently (2022-10-13) expected on certain iOS and macOS // builds. } else { - health.SetDNSOSHealth(err) + health.Global.SetDNSOSHealth(err) return resolver.Config{}, OSConfig{}, err } } diff --git a/net/dns/manager_linux.go b/net/dns/manager_linux.go index e2a7224bc..17162153c 100644 --- a/net/dns/manager_linux.go +++ b/net/dns/manager_linux.go @@ -271,7 +271,7 @@ func dnsMode(logf logger.Logf, env newOSConfigEnv) (ret string, err error) { return "direct", nil } - health.SetDNSManagerHealth(errors.New("systemd-resolved and NetworkManager are wired together incorrectly; MagicDNS will probably not work. For more info, see https://tailscale.com/s/resolved-nm")) + health.Global.SetDNSManagerHealth(errors.New("systemd-resolved and NetworkManager are wired together incorrectly; MagicDNS will probably not work. For more info, see https://tailscale.com/s/resolved-nm")) dbg("nm-safe", "no") return "systemd-resolved", nil default: diff --git a/net/dns/resolved.go b/net/dns/resolved.go index 2917ab5b3..47469c125 100644 --- a/net/dns/resolved.go +++ b/net/dns/resolved.go @@ -163,7 +163,7 @@ func (m *resolvedManager) run(ctx context.Context) { // Reset backoff and SetNSOSHealth after successful on reconnect. bo.BackOff(ctx, nil) - health.SetDNSOSHealth(nil) + health.Global.SetDNSOSHealth(nil) return nil } @@ -241,7 +241,7 @@ func (m *resolvedManager) run(ctx context.Context) { // Set health while holding the lock, because this will // graciously serialize the resync's health outcome with a // concurrent SetDNS call. - health.SetDNSOSHealth(err) + health.Global.SetDNSOSHealth(err) if err != nil { m.logf("failed to configure systemd-resolved: %v", err) } diff --git a/net/tlsdial/tlsdial.go b/net/tlsdial/tlsdial.go index 2c9be4e1c..ddd2ee064 100644 --- a/net/tlsdial/tlsdial.go +++ b/net/tlsdial/tlsdial.go @@ -80,10 +80,10 @@ func Config(host string, base *tls.Config) *tls.Config { // any verification. if certIsSelfSigned(cs.PeerCertificates[0]) { // Self-signed certs are never valid. - health.SetTLSConnectionError(cs.ServerName, fmt.Errorf("certificate is self-signed")) + health.Global.SetTLSConnectionError(cs.ServerName, fmt.Errorf("certificate is self-signed")) } else { // Ensure we clear any error state for this ServerName. - health.SetTLSConnectionError(cs.ServerName, nil) + health.Global.SetTLSConnectionError(cs.ServerName, nil) } // First try doing x509 verification with the system's diff --git a/wgengine/magicsock/derp.go b/wgengine/magicsock/derp.go index 32611546f..b6cecb55d 100644 --- a/wgengine/magicsock/derp.go +++ b/wgengine/magicsock/derp.go @@ -165,7 +165,7 @@ func (c *Conn) maybeSetNearestDERP(report *netcheck.Report) (preferredDERP int) if testenv.InTest() && !checkControlHealthDuringNearestDERPInTests { connectedToControl = true } else { - connectedToControl = health.GetInPollNetMap() + connectedToControl = health.Global.GetInPollNetMap() } if !connectedToControl { c.mu.Lock() @@ -201,12 +201,12 @@ func (c *Conn) setNearestDERP(derpNum int) (wantDERP bool) { defer c.mu.Unlock() if !c.wantDerpLocked() { c.myDerp = 0 - health.SetMagicSockDERPHome(0, c.homeless) + health.Global.SetMagicSockDERPHome(0, c.homeless) return false } if c.homeless { c.myDerp = 0 - health.SetMagicSockDERPHome(0, c.homeless) + health.Global.SetMagicSockDERPHome(0, c.homeless) return false } if derpNum == c.myDerp { @@ -217,7 +217,7 @@ func (c *Conn) setNearestDERP(derpNum int) (wantDERP bool) { metricDERPHomeChange.Add(1) } c.myDerp = derpNum - health.SetMagicSockDERPHome(derpNum, c.homeless) + health.Global.SetMagicSockDERPHome(derpNum, c.homeless) if c.privateKey.IsZero() { // No private key yet, so DERP connections won't come up anyway. @@ -525,8 +525,8 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d return n } - defer health.SetDERPRegionConnectedState(regionID, false) - defer health.SetDERPRegionHealth(regionID, "") + defer health.Global.SetDERPRegionConnectedState(regionID, false) + defer health.Global.SetDERPRegionHealth(regionID, "") // peerPresent is the set of senders we know are present on this // connection, based on messages we've received from the server. @@ -538,7 +538,7 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d for { msg, connGen, err := dc.RecvDetail() if err != nil { - health.SetDERPRegionConnectedState(regionID, false) + health.Global.SetDERPRegionConnectedState(regionID, false) // Forget that all these peers have routes. for peer := range peerPresent { delete(peerPresent, peer) @@ -576,14 +576,14 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d now := time.Now() if lastPacketTime.IsZero() || now.Sub(lastPacketTime) > frameReceiveRecordRate { - health.NoteDERPRegionReceivedFrame(regionID) + health.Global.NoteDERPRegionReceivedFrame(regionID) lastPacketTime = now } switch m := msg.(type) { case derp.ServerInfoMessage: - health.SetDERPRegionConnectedState(regionID, true) - health.SetDERPRegionHealth(regionID, "") // until declared otherwise + health.Global.SetDERPRegionConnectedState(regionID, true) + health.Global.SetDERPRegionHealth(regionID, "") // until declared otherwise c.logf("magicsock: derp-%d connected; connGen=%v", regionID, connGen) continue case derp.ReceivedPacket: @@ -623,7 +623,7 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d }() continue case derp.HealthMessage: - health.SetDERPRegionHealth(regionID, m.Problem) + health.Global.SetDERPRegionHealth(regionID, m.Problem) continue case derp.PeerGoneMessage: switch m.Reason { diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index b9cb72d3f..370dd6270 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -666,7 +666,7 @@ func (c *Conn) updateNetInfo(ctx context.Context) (*netcheck.Report, error) { // NOTE(andrew-d): I don't love that we're depending on the // health package here, but I'd rather do that and not store // the exact same state in two different places. - GetLastDERPActivity: health.GetDERPRegionReceivedTime, + GetLastDERPActivity: health.Global.GetDERPRegionReceivedTime, }) if err != nil { return nil, err @@ -2471,7 +2471,7 @@ func (c *Conn) bindSocket(ruc *RebindingUDPConn, network string, curPortFate cur } ruc.setConnLocked(pconn, network, c.bind.BatchSize()) if network == "udp4" { - health.SetUDP4Unbound(false) + health.Global.SetUDP4Unbound(false) } return nil } @@ -2482,7 +2482,7 @@ func (c *Conn) bindSocket(ruc *RebindingUDPConn, network string, curPortFate cur // we get a link change and we can try binding again. ruc.setConnLocked(newBlockForeverConn(), "", c.bind.BatchSize()) if network == "udp4" { - health.SetUDP4Unbound(true) + health.Global.SetUDP4Unbound(true) } return fmt.Errorf("failed to bind any ports (tried %v)", ports) } diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go index b009140b9..b075fd4a4 100644 --- a/wgengine/magicsock/magicsock_test.go +++ b/wgengine/magicsock/magicsock_test.go @@ -3120,14 +3120,14 @@ func TestMaybeSetNearestDERP(t *testing.T) { report := &netcheck.Report{PreferredDERP: tt.reportDERP} - oldConnected := health.GetInPollNetMap() + oldConnected := health.Global.GetInPollNetMap() if tt.connectedToControl != oldConnected { if tt.connectedToControl { - health.GotStreamedMapResponse() - t.Cleanup(health.SetOutOfPollNetMap) + health.Global.GotStreamedMapResponse() + t.Cleanup(health.Global.SetOutOfPollNetMap) } else { - health.SetOutOfPollNetMap() - t.Cleanup(health.GotStreamedMapResponse) + health.Global.SetOutOfPollNetMap() + t.Cleanup(health.Global.GotStreamedMapResponse) } } diff --git a/wgengine/router/ifconfig_windows.go b/wgengine/router/ifconfig_windows.go index 87dfb74bc..54489fc3c 100644 --- a/wgengine/router/ifconfig_windows.go +++ b/wgengine/router/ifconfig_windows.go @@ -235,7 +235,7 @@ func interfaceFromLUID(luid winipcfg.LUID, flags winipcfg.GAAFlags) (*winipcfg.I return nil, fmt.Errorf("interfaceFromLUID: interface with LUID %v not found", luid) } -var networkCategoryWarning = health.NewWarnable(health.WithMapDebugFlag("warn-network-category-unhealthy")) +var networkCategoryWarning = health.Global.NewWarnable(health.WithMapDebugFlag("warn-network-category-unhealthy")) func configureInterface(cfg *Config, tun *tun.NativeTun) (retErr error) { var mtu = tstun.DefaultTUNMTU() diff --git a/wgengine/userspace.go b/wgengine/userspace.go index f7df59f9f..d33cfc6ef 100644 --- a/wgengine/userspace.go +++ b/wgengine/userspace.go @@ -970,7 +970,7 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config, e.logf("wgengine: Reconfig: configuring router") e.networkLogger.ReconfigRoutes(routerCfg) err := e.router.Set(routerCfg) - health.SetRouterHealth(err) + health.Global.SetRouterHealth(err) if err != nil { return err } @@ -979,7 +979,7 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config, // assigned address. e.logf("wgengine: Reconfig: configuring DNS") err = e.dns.Set(*dnsCfg) - health.SetDNSHealth(err) + health.Global.SetDNSHealth(err) if err != nil { return err } @@ -1183,7 +1183,7 @@ func (e *userspaceEngine) linkChange(delta *netmon.ChangeDelta) { e.logf("[v1] LinkChange: minor") } - health.SetAnyInterfaceUp(up) + health.Global.SetAnyInterfaceUp(up) e.magicConn.SetNetworkUp(up) if !up || changed { if err := e.dns.FlushCaches(); err != nil {