health, wgenegine: fix receive func health checks for the fourth time

The old implementation knew too much about how wireguard-go worked.
As a result, it missed genuine problems that occurred due to unrelated bugs.

This fourth attempt to fix the health checks takes a black box approach.
A receive func is healthy if one (or both) of these conditions holds:

* It is currently running and blocked.
* It has been executed recently.

The second condition is required because receive functions
are not continuously executing. wireguard-go calls them and then
processes their results before calling them again.

There is a theoretical false positive if wireguard-go go takes
longer than one minute to process the results of a receive func execution.
If that happens, we have other problems.

Updates #1790

Signed-off-by: Josh Bleecher Snyder <josharian@gmail.com>
This commit is contained in:
Josh Bleecher Snyder 2021-04-26 17:08:05 -07:00
parent 0d4c8cb2e1
commit 744de615f1
2 changed files with 66 additions and 0 deletions

View File

@ -11,6 +11,7 @@ import (
"fmt"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/go-multierror/multierror"
@ -216,6 +217,7 @@ func SetAnyInterfaceUp(up bool) {
func timerSelfCheck() {
mu.Lock()
defer mu.Unlock()
checkReceiveFuncs()
selfCheckLocked()
if timer != nil {
timer.Reset(time.Minute)
@ -263,6 +265,11 @@ func overallErrorLocked() error {
_ = lastMapRequestHeard
var errs []error
for _, recv := range receiveFuncs {
if recv.missing {
errs = append(errs, fmt.Errorf("%s is not running", recv.name))
}
}
for sys, err := range sysErr {
if err == nil || sys == SysOverall {
continue
@ -275,3 +282,58 @@ func overallErrorLocked() error {
})
return multierror.New(errs)
}
var (
ReceiveIPv4 = ReceiveFuncStats{name: "ReceiveIPv4"}
// ReceiveIPv6 isn't guaranteed to be running, so skip it for now.
ReceiveDERP = ReceiveFuncStats{name: "ReceiveDERP"}
receiveFuncs = []*ReceiveFuncStats{&ReceiveIPv4, &ReceiveDERP}
)
// ReceiveFuncStats tracks the calls made to a wireguard-go receive func.
type ReceiveFuncStats struct {
// name is the name of the receive func.
name string
// numCalls is the number of times the receive func has ever been called.
// It is required because it is possible for a receive func's wireguard-go goroutine
// to be active even though the receive func isn't.
// The wireguard-go goroutine alternates between calling the receive func and
// processing what the func returned.
numCalls uint64 // accessed atomically
// prevNumCalls is the value of numCalls last time the health check examined it.
prevNumCalls uint64
// inCall indicates whether the receive func is currently running.
inCall uint32 // bool, accessed atomically
// missing indicates whether the receive func is not running.
missing bool
}
func (s *ReceiveFuncStats) Enter() {
atomic.AddUint64(&s.numCalls, 1)
atomic.StoreUint32(&s.inCall, 1)
}
func (s *ReceiveFuncStats) Exit() {
atomic.StoreUint32(&s.inCall, 0)
}
func checkReceiveFuncs() {
for _, recv := range receiveFuncs {
recv.missing = false
prev := recv.prevNumCalls
numCalls := atomic.LoadUint64(&recv.numCalls)
recv.prevNumCalls = numCalls
if numCalls > prev {
// OK: the function has gotten called since last we checked
continue
}
if atomic.LoadUint32(&recv.inCall) == 1 {
// OK: the function is active, probably blocked due to inactivity
continue
}
// Not OK: The function is not active, and not accumulating new calls.
// It is probably MIA.
recv.missing = true
}
}

View File

@ -1594,6 +1594,8 @@ func (c *Conn) receiveIPv6(b []byte) (int, conn.Endpoint, error) {
// receiveIPv4 receives a UDP IPv4 packet. It is called by wireguard-go.
func (c *Conn) receiveIPv4(b []byte) (n int, ep conn.Endpoint, err error) {
health.ReceiveIPv4.Enter()
defer health.ReceiveIPv4.Exit()
for {
n, ipp, err := c.pconn4.ReadFromNetaddr(b)
if err != nil {
@ -1646,6 +1648,8 @@ func (c *Conn) receiveIP(b []byte, ipp netaddr.IPPort, cache *ippEndpointCache)
// If the packet was a disco message or the peer endpoint wasn't
// found, the returned error is errLoopAgain.
func (c *connBind) receiveDERP(b []byte) (n int, ep conn.Endpoint, err error) {
health.ReceiveDERP.Enter()
defer health.ReceiveDERP.Exit()
for dm := range c.derpRecvCh {
if c.Closed() {
break