wgengine/magicsock: clear out endpoint statistics when it becomes bad

There are cases where we do not detect the non-viability of a route, but
we will instead observe a failure to send. In a Disco path this would
normally be handled as a side effect of Disco, which is not available to
non-Disco WireGuard nodes. In both cases, recognizing the failure as
such will result in faster convergence.

Updates #8999
Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
James Tucker 2023-08-21 16:04:07 -07:00 committed by James Tucker
parent 7c9c68feed
commit 5edb39d032
3 changed files with 71 additions and 5 deletions

View File

@ -469,6 +469,14 @@ func (de *endpoint) send(buffs [][]byte) error {
var err error
if udpAddr.IsValid() {
_, err = de.c.sendUDPBatch(udpAddr, buffs)
// If the error is known to indicate that the endpoint is no longer
// usable, clear the endpoint statistics so that the next send will
// re-evaluate the best endpoint.
if err != nil && isBadEndpointErr(err) {
de.noteBadEndpoint(udpAddr)
}
// TODO(raggi): needs updating for accuracy, as in error conditions we may have partial sends.
if stats := de.c.stats.Load(); err == nil && stats != nil {
var txBytes int
@ -866,6 +874,30 @@ func (de *endpoint) addCandidateEndpoint(ep netip.AddrPort, forRxPingTxID stun.T
return false
}
// clearBestAddrLocked clears the bestAddr and related fields such that future
// packets will re-evaluate the best address to send to next.
//
// de.mu must be held.
func (de *endpoint) clearBestAddrLocked() {
de.bestAddr = addrLatency{}
de.bestAddrAt = 0
de.trustBestAddrUntil = 0
}
// noteBadEndpoint marks ipp as a bad endpoint that would need to be
// re-evaluated before future use, this should be called for example if a send
// to ipp fails due to a host unreachable error or similar.
func (de *endpoint) noteBadEndpoint(ipp netip.AddrPort) {
de.mu.Lock()
defer de.mu.Unlock()
de.clearBestAddrLocked()
if st, ok := de.endpointState[ipp]; ok {
st.clear()
}
}
// noteConnectivityChange is called when connectivity changes enough
// that we should question our earlier assumptions about which paths
// work.
@ -873,8 +905,7 @@ func (de *endpoint) noteConnectivityChange() {
de.mu.Lock()
defer de.mu.Unlock()
de.bestAddr = addrLatency{}
de.trustBestAddrUntil = 0
de.clearBestAddrLocked()
for k := range de.endpointState {
de.endpointState[k].clear()
@ -1156,9 +1187,7 @@ func (de *endpoint) stopAndReset() {
func (de *endpoint) resetLocked() {
de.lastSend = 0
de.lastFullPing = 0
de.bestAddr = addrLatency{}
de.bestAddrAt = 0
de.trustBestAddrUntil = 0
de.clearBestAddrLocked()
for _, es := range de.endpointState {
es.lastPing = 0
}

View File

@ -0,0 +1,23 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build !js && !wasm
// +build !js,!wasm
package magicsock
import (
"errors"
"syscall"
)
// errHOSTUNREACH wraps unix.EHOSTUNREACH in an interface type to pass to
// errors.Is while avoiding an allocation per call.
var errHOSTUNREACH error = syscall.EHOSTUNREACH
// isBadEndpointErr checks if err is one which is known to report that an
// endpoint can no longer be sent to. It is not exhaustive, and for unknown
// errors always reports false.
func isBadEndpointErr(err error) bool {
return errors.Is(err, errHOSTUNREACH)
}

View File

@ -0,0 +1,14 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build js || wasm
// +build js wasm
package magicsock
// isBadEndpointErr checks if err is one which is known to report that an
// endpoint can no longer be sent to. It is not exhaustive, but covers known
// cases.
func isBadEndpointErr(err error) bool {
return false
}