health, ipn/ipnlocal: track, log overall health
Updates #1505 Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
parent
6fbc9b3a98
commit
9eb65601ef
|
@ -179,8 +179,11 @@ func NewNoStart(opts Options) (*Client, error) {
|
|||
|
||||
}
|
||||
|
||||
func (c *Client) onHealthChange(key string, err error) {
|
||||
c.logf("controlclient: restarting map request for %q health change to new state: %v", key, err)
|
||||
func (c *Client) onHealthChange(sys health.Subsystem, err error) {
|
||||
if sys == health.SysOverall {
|
||||
return
|
||||
}
|
||||
c.logf("controlclient: restarting map request for %q health change to new state: %v", sys, err)
|
||||
c.cancelMapSafely()
|
||||
}
|
||||
|
||||
|
|
119
health/health.go
119
health/health.go
|
@ -7,9 +7,13 @@
|
|||
package health
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/go-multierror/multierror"
|
||||
"tailscale.com/tailcfg"
|
||||
)
|
||||
|
||||
|
@ -17,8 +21,9 @@ var (
|
|||
// mu guards everything in this var block.
|
||||
mu sync.Mutex
|
||||
|
||||
m = map[string]error{} // error key => err (or nil for no error)
|
||||
watchers = map[*watchHandle]func(string, error){} // opt func to run if error state changes
|
||||
sysErr = map[Subsystem]error{} // error key => err (or nil for no error)
|
||||
watchers = map[*watchHandle]func(Subsystem, error){} // opt func to run if error state changes
|
||||
timer *time.Timer
|
||||
|
||||
inMapPoll bool
|
||||
inMapPollSince time.Time
|
||||
|
@ -32,49 +37,77 @@ var (
|
|||
ipnWantRunning bool
|
||||
)
|
||||
|
||||
// Subsystem is the name of a subsystem whose health can be monitored.
|
||||
type Subsystem string
|
||||
|
||||
const (
|
||||
// SysOverall is the name representing the overall health of
|
||||
// the system, rather than one particular subsystem.
|
||||
SysOverall = Subsystem("overall")
|
||||
|
||||
// SysRouter is the name the wgengine/router subsystem.
|
||||
SysRouter = Subsystem("router")
|
||||
|
||||
// SysNetworkCategory is the name of the subsystem that sets
|
||||
// the Windows network adapter's "category" (public, private, domain).
|
||||
// If it's unhealthy, the Windows firewall rules won't match.
|
||||
SysNetworkCategory = Subsystem("network-category")
|
||||
)
|
||||
|
||||
type watchHandle byte
|
||||
|
||||
// RegisterWatcher adds a function that will be called if an
|
||||
// error changes state either to unhealthy or from unhealthy. It is
|
||||
// not called on transition from unknown to healthy. It must be non-nil
|
||||
// and is run in its own goroutine. The returned func unregisters it.
|
||||
func RegisterWatcher(cb func(errKey string, err error)) (unregister func()) {
|
||||
func RegisterWatcher(cb func(key Subsystem, err error)) (unregister func()) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
handle := new(watchHandle)
|
||||
watchers[handle] = cb
|
||||
if timer == nil {
|
||||
timer = time.AfterFunc(time.Minute, timerSelfCheck)
|
||||
}
|
||||
return func() {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
delete(watchers, handle)
|
||||
if len(watchers) == 0 && timer != nil {
|
||||
timer.Stop()
|
||||
timer = nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SetRouter sets the state of the wgengine/router.Router.
|
||||
func SetRouterHealth(err error) { set("router", err) }
|
||||
func SetRouterHealth(err error) { set(SysRouter, err) }
|
||||
|
||||
// RouterHealth returns the wgengine/router.Router error state.
|
||||
func RouterHealth() error { return get("router") }
|
||||
func RouterHealth() error { return get(SysRouter) }
|
||||
|
||||
// SetNetworkCategoryHealth sets the state of setting the network adaptor's category.
|
||||
// This only applies on Windows.
|
||||
func SetNetworkCategoryHealth(err error) { set("network-category", err) }
|
||||
func SetNetworkCategoryHealth(err error) { set(SysNetworkCategory, err) }
|
||||
|
||||
func NetworkCategoryHealth() error { return get("network-category") }
|
||||
func NetworkCategoryHealth() error { return get(SysNetworkCategory) }
|
||||
|
||||
func get(key string) error {
|
||||
func get(key Subsystem) error {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
return m[key]
|
||||
return sysErr[key]
|
||||
}
|
||||
|
||||
func set(key string, err error) {
|
||||
func set(key Subsystem, err error) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
old, ok := m[key]
|
||||
setLocked(key, err)
|
||||
}
|
||||
|
||||
func setLocked(key Subsystem, err error) {
|
||||
old, ok := sysErr[key]
|
||||
if !ok && err == nil {
|
||||
// Initial happy path.
|
||||
m[key] = nil
|
||||
sysErr[key] = nil
|
||||
selfCheckLocked()
|
||||
return
|
||||
}
|
||||
|
@ -83,11 +116,11 @@ func set(key string, err error) {
|
|||
// don't run callbacks, but exact error might've
|
||||
// changed, so note it.
|
||||
if err != nil {
|
||||
m[key] = err
|
||||
sysErr[key] = err
|
||||
}
|
||||
return
|
||||
}
|
||||
m[key] = err
|
||||
sysErr[key] = err
|
||||
selfCheckLocked()
|
||||
for _, cb := range watchers {
|
||||
go cb(key, err)
|
||||
|
@ -162,14 +195,62 @@ func SetIPNState(state string, wantRunning bool) {
|
|||
selfCheckLocked()
|
||||
}
|
||||
|
||||
func timerSelfCheck() {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
selfCheckLocked()
|
||||
if timer != nil {
|
||||
timer.Reset(time.Minute)
|
||||
}
|
||||
}
|
||||
|
||||
func selfCheckLocked() {
|
||||
// TODO: check states against each other.
|
||||
// For staticcheck for now:
|
||||
if ipnState == "" {
|
||||
// Don't check yet.
|
||||
return
|
||||
}
|
||||
setLocked(SysOverall, overallErrorLocked())
|
||||
}
|
||||
|
||||
func overallErrorLocked() error {
|
||||
if ipnState != "Running" || !ipnWantRunning {
|
||||
return fmt.Errorf("state=%v, wantRunning=%v", ipnState, ipnWantRunning)
|
||||
}
|
||||
now := time.Now()
|
||||
if !inMapPoll && (lastMapPollEndedAt.IsZero() || now.Sub(lastMapPollEndedAt) > 10*time.Second) {
|
||||
return errors.New("not in map poll")
|
||||
}
|
||||
const tooIdle = 2*time.Minute + 5*time.Second
|
||||
if d := now.Sub(lastStreamedMapResponse).Round(time.Second); d > tooIdle {
|
||||
return fmt.Errorf("no map response in %v", d)
|
||||
}
|
||||
rid := derpHomeRegion
|
||||
if rid == 0 {
|
||||
return errors.New("no DERP home")
|
||||
}
|
||||
if !derpRegionConnected[rid] {
|
||||
return fmt.Errorf("not connected to home DERP region %v", rid)
|
||||
}
|
||||
if d := now.Sub(derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle {
|
||||
return fmt.Errorf("haven't heard from home DERP region %v in %v", rid, d)
|
||||
}
|
||||
|
||||
// TODO: use
|
||||
_ = inMapPollSince
|
||||
_ = lastMapPollEndedAt
|
||||
_ = lastStreamedMapResponse
|
||||
_ = derpHomeRegion
|
||||
_ = lastMapRequestHeard
|
||||
_ = ipnState
|
||||
_ = ipnWantRunning
|
||||
|
||||
var errs []error
|
||||
for sys, err := range sysErr {
|
||||
if err == nil || sys == SysOverall {
|
||||
continue
|
||||
}
|
||||
errs = append(errs, fmt.Errorf("%v: %w", sys, err))
|
||||
}
|
||||
sort.Slice(errs, func(i, j int) bool {
|
||||
// Not super efficient (stringifying these in a sort), but probably max 2 or 3 items.
|
||||
return errs[i].Error() < errs[j].Error()
|
||||
})
|
||||
return multierror.New(errs)
|
||||
}
|
||||
|
|
|
@ -65,20 +65,21 @@ func getControlDebugFlags() []string {
|
|||
// state machine generates events back out to zero or more components.
|
||||
type LocalBackend struct {
|
||||
// Elements that are thread-safe or constant after construction.
|
||||
ctx context.Context // canceled by Close
|
||||
ctxCancel context.CancelFunc // cancels ctx
|
||||
logf logger.Logf // general logging
|
||||
keyLogf logger.Logf // for printing list of peers on change
|
||||
statsLogf logger.Logf // for printing peers stats on change
|
||||
e wgengine.Engine
|
||||
store ipn.StateStore
|
||||
backendLogID string
|
||||
unregisterLinkMon func()
|
||||
portpoll *portlist.Poller // may be nil
|
||||
portpollOnce sync.Once // guards starting readPoller
|
||||
gotPortPollRes chan struct{} // closed upon first readPoller result
|
||||
serverURL string // tailcontrol URL
|
||||
newDecompressor func() (controlclient.Decompressor, error)
|
||||
ctx context.Context // canceled by Close
|
||||
ctxCancel context.CancelFunc // cancels ctx
|
||||
logf logger.Logf // general logging
|
||||
keyLogf logger.Logf // for printing list of peers on change
|
||||
statsLogf logger.Logf // for printing peers stats on change
|
||||
e wgengine.Engine
|
||||
store ipn.StateStore
|
||||
backendLogID string
|
||||
unregisterLinkMon func()
|
||||
unregisterHealthWatch func()
|
||||
portpoll *portlist.Poller // may be nil
|
||||
portpollOnce sync.Once // guards starting readPoller
|
||||
gotPortPollRes chan struct{} // closed upon first readPoller result
|
||||
serverURL string // tailcontrol URL
|
||||
newDecompressor func() (controlclient.Decompressor, error)
|
||||
|
||||
filterHash string
|
||||
|
||||
|
@ -148,6 +149,8 @@ func NewLocalBackend(logf logger.Logf, logid string, store ipn.StateStore, e wge
|
|||
b.linkChange(false, linkMon.InterfaceState())
|
||||
b.unregisterLinkMon = linkMon.RegisterChangeCallback(b.linkChange)
|
||||
|
||||
b.unregisterHealthWatch = health.RegisterWatcher(b.onHealthChange)
|
||||
|
||||
return b, nil
|
||||
}
|
||||
|
||||
|
@ -182,6 +185,14 @@ func (b *LocalBackend) linkChange(major bool, ifst *interfaces.State) {
|
|||
b.updateFilter(b.netMap, b.prefs)
|
||||
}
|
||||
|
||||
func (b *LocalBackend) onHealthChange(sys health.Subsystem, err error) {
|
||||
if err == nil {
|
||||
b.logf("health(%q): ok", sys)
|
||||
} else {
|
||||
b.logf("health(%q): error: %v", sys, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown halts the backend and all its sub-components. The backend
|
||||
// can no longer be used after Shutdown returns.
|
||||
func (b *LocalBackend) Shutdown() {
|
||||
|
@ -190,6 +201,7 @@ func (b *LocalBackend) Shutdown() {
|
|||
b.mu.Unlock()
|
||||
|
||||
b.unregisterLinkMon()
|
||||
b.unregisterHealthWatch()
|
||||
if cli != nil {
|
||||
cli.Shutdown()
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue