health, wgenegine: fix receive func health checks for the fourth time
The old implementation knew too much about how wireguard-go worked. As a result, it missed genuine problems that occurred due to unrelated bugs. This fourth attempt to fix the health checks takes a black box approach. A receive func is healthy if one (or both) of these conditions holds: * It is currently running and blocked. * It has been executed recently. The second condition is required because receive functions are not continuously executing. wireguard-go calls them and then processes their results before calling them again. There is a theoretical false positive if wireguard-go go takes longer than one minute to process the results of a receive func execution. If that happens, we have other problems. Updates #1790 Signed-off-by: Josh Bleecher Snyder <josharian@gmail.com>
This commit is contained in:
parent
0d4c8cb2e1
commit
744de615f1
|
@ -11,6 +11,7 @@ import (
|
|||
"fmt"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/go-multierror/multierror"
|
||||
|
@ -216,6 +217,7 @@ func SetAnyInterfaceUp(up bool) {
|
|||
func timerSelfCheck() {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
checkReceiveFuncs()
|
||||
selfCheckLocked()
|
||||
if timer != nil {
|
||||
timer.Reset(time.Minute)
|
||||
|
@ -263,6 +265,11 @@ func overallErrorLocked() error {
|
|||
_ = lastMapRequestHeard
|
||||
|
||||
var errs []error
|
||||
for _, recv := range receiveFuncs {
|
||||
if recv.missing {
|
||||
errs = append(errs, fmt.Errorf("%s is not running", recv.name))
|
||||
}
|
||||
}
|
||||
for sys, err := range sysErr {
|
||||
if err == nil || sys == SysOverall {
|
||||
continue
|
||||
|
@ -275,3 +282,58 @@ func overallErrorLocked() error {
|
|||
})
|
||||
return multierror.New(errs)
|
||||
}
|
||||
|
||||
var (
|
||||
ReceiveIPv4 = ReceiveFuncStats{name: "ReceiveIPv4"}
|
||||
// ReceiveIPv6 isn't guaranteed to be running, so skip it for now.
|
||||
ReceiveDERP = ReceiveFuncStats{name: "ReceiveDERP"}
|
||||
|
||||
receiveFuncs = []*ReceiveFuncStats{&ReceiveIPv4, &ReceiveDERP}
|
||||
)
|
||||
|
||||
// ReceiveFuncStats tracks the calls made to a wireguard-go receive func.
|
||||
type ReceiveFuncStats struct {
|
||||
// name is the name of the receive func.
|
||||
name string
|
||||
// numCalls is the number of times the receive func has ever been called.
|
||||
// It is required because it is possible for a receive func's wireguard-go goroutine
|
||||
// to be active even though the receive func isn't.
|
||||
// The wireguard-go goroutine alternates between calling the receive func and
|
||||
// processing what the func returned.
|
||||
numCalls uint64 // accessed atomically
|
||||
// prevNumCalls is the value of numCalls last time the health check examined it.
|
||||
prevNumCalls uint64
|
||||
// inCall indicates whether the receive func is currently running.
|
||||
inCall uint32 // bool, accessed atomically
|
||||
// missing indicates whether the receive func is not running.
|
||||
missing bool
|
||||
}
|
||||
|
||||
func (s *ReceiveFuncStats) Enter() {
|
||||
atomic.AddUint64(&s.numCalls, 1)
|
||||
atomic.StoreUint32(&s.inCall, 1)
|
||||
}
|
||||
|
||||
func (s *ReceiveFuncStats) Exit() {
|
||||
atomic.StoreUint32(&s.inCall, 0)
|
||||
}
|
||||
|
||||
func checkReceiveFuncs() {
|
||||
for _, recv := range receiveFuncs {
|
||||
recv.missing = false
|
||||
prev := recv.prevNumCalls
|
||||
numCalls := atomic.LoadUint64(&recv.numCalls)
|
||||
recv.prevNumCalls = numCalls
|
||||
if numCalls > prev {
|
||||
// OK: the function has gotten called since last we checked
|
||||
continue
|
||||
}
|
||||
if atomic.LoadUint32(&recv.inCall) == 1 {
|
||||
// OK: the function is active, probably blocked due to inactivity
|
||||
continue
|
||||
}
|
||||
// Not OK: The function is not active, and not accumulating new calls.
|
||||
// It is probably MIA.
|
||||
recv.missing = true
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1594,6 +1594,8 @@ func (c *Conn) receiveIPv6(b []byte) (int, conn.Endpoint, error) {
|
|||
|
||||
// receiveIPv4 receives a UDP IPv4 packet. It is called by wireguard-go.
|
||||
func (c *Conn) receiveIPv4(b []byte) (n int, ep conn.Endpoint, err error) {
|
||||
health.ReceiveIPv4.Enter()
|
||||
defer health.ReceiveIPv4.Exit()
|
||||
for {
|
||||
n, ipp, err := c.pconn4.ReadFromNetaddr(b)
|
||||
if err != nil {
|
||||
|
@ -1646,6 +1648,8 @@ func (c *Conn) receiveIP(b []byte, ipp netaddr.IPPort, cache *ippEndpointCache)
|
|||
// If the packet was a disco message or the peer endpoint wasn't
|
||||
// found, the returned error is errLoopAgain.
|
||||
func (c *connBind) receiveDERP(b []byte) (n int, ep conn.Endpoint, err error) {
|
||||
health.ReceiveDERP.Enter()
|
||||
defer health.ReceiveDERP.Exit()
|
||||
for dm := range c.derpRecvCh {
|
||||
if c.Closed() {
|
||||
break
|
||||
|
|
Loading…
Reference in New Issue