wgengine,magicsock: fix two lazy wireguard config issues

1) we weren't waking up a discoEndpoint that once existed and
   went idle for 5 minutes and then got a disco message again.

2) userspaceEngine.noteReceiveActivity had a buggy check; fixed
   and added a test
This commit is contained in:
Brad Fitzpatrick 2020-08-06 14:57:03 -07:00
parent 78c2e1ff83
commit 9a346fd8b4
4 changed files with 153 additions and 27 deletions

View File

@ -1360,13 +1360,15 @@ func wgRecvAddr(e conn.Endpoint, ipp netaddr.IPPort, addr *net.UDPAddr) *net.UDP
return ipp.UDPAddr()
}
// noteRecvActivity calls the magicsock.Conn.noteRecvActivity hook if
// e is a discovery-capable peer.
// noteRecvActivityFromEndpoint calls the c.noteRecvActivity hook if
// e is a discovery-capable peer and this is the first receive activity
// it's got in awhile (in last 10 seconds).
//
// This should be called whenever a packet arrives from e.
func noteRecvActivity(e conn.Endpoint) {
if de, ok := e.(*discoEndpoint); ok {
de.onRecvActivity()
func (c *Conn) noteRecvActivityFromEndpoint(e conn.Endpoint) {
de, ok := e.(*discoEndpoint)
if ok && c.noteRecvActivity != nil && de.isFirstRecvActivityInAwhile() {
c.noteRecvActivity(de.discoKey)
}
}
@ -1377,7 +1379,7 @@ Top:
c.bufferedIPv4From = netaddr.IPPort{}
addr = from.UDPAddr()
ep := c.findEndpoint(from, addr)
noteRecvActivity(ep)
c.noteRecvActivityFromEndpoint(ep)
return copy(b, c.bufferedIPv4Packet), ep, wgRecvAddr(ep, from, addr), nil
}
@ -1489,7 +1491,7 @@ Top:
ep = c.findEndpoint(ipp, addr)
}
if !didNoteRecvActivity {
noteRecvActivity(ep)
c.noteRecvActivityFromEndpoint(ep)
}
return n, ep, wgRecvAddr(ep, ipp, addr), nil
}
@ -1517,7 +1519,7 @@ func (c *Conn) ReceiveIPv6(b []byte) (int, conn.Endpoint, *net.UDPAddr, error) {
}
ep := c.findEndpoint(ipp, addr)
noteRecvActivity(ep)
c.noteRecvActivityFromEndpoint(ep)
return n, ep, wgRecvAddr(ep, ipp, addr), nil
}
}
@ -1615,8 +1617,9 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
return false
}
de, ok := c.endpointOfDisco[sender]
if !ok {
needsRecvActivityCall := false
de, endpointFound0 := c.endpointOfDisco[sender]
if !endpointFound0 {
// We don't have an active endpoint for this sender but we knew about the node, so
// it's an idle endpoint that doesn't yet exist in the wireguard config. We now have
// to notify the userspace engine (via noteRecvActivity) so wireguard-go can create
@ -1628,6 +1631,11 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
c.logf("magicsock: [unexpected] have node without endpoint, without c.noteRecvActivity hook")
return false
}
needsRecvActivityCall = true
} else {
needsRecvActivityCall = de.isFirstRecvActivityInAwhile()
}
if needsRecvActivityCall && c.noteRecvActivity != nil {
// We can't hold Conn.mu while calling noteRecvActivity.
// noteRecvActivity acquires userspaceEngine.wgLock (and per our
// lock ordering rules: wgLock must come first), and also calls
@ -1651,7 +1659,9 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
c.logf("magicsock: [unexpected] lazy endpoint not created for %v, %v", peerNode.Key.ShortString(), sender.ShortString())
return false
}
c.logf("magicsock: lazy endpoint created via disco message for %v, %v", peerNode.Key.ShortString(), sender.ShortString())
if !endpointFound0 {
c.logf("magicsock: lazy endpoint created via disco message for %v, %v", peerNode.Key.ShortString(), sender.ShortString())
}
}
// First, do we even know (and thus care) about this sender? If not,
@ -2676,17 +2686,6 @@ func (c *Conn) CreateEndpoint(pubKey [32]byte, addrs string) (conn.Endpoint, err
sentPing: map[stun.TxID]sentPing{},
endpointState: map[netaddr.IPPort]*endpointState{},
}
lastRecvTime := new(int64) // atomic
de.onRecvActivity = func() {
now := time.Now().Unix()
old := atomic.LoadInt64(lastRecvTime)
if old == 0 || old <= now-10 {
atomic.StoreInt64(lastRecvTime, now)
if c.noteRecvActivity != nil {
c.noteRecvActivity(de.discoKey)
}
}
}
de.initFakeUDPAddr()
de.updateFromNode(c.nodeOfDisco[de.discoKey])
c.endpointOfDisco[de.discoKey] = de
@ -2958,6 +2957,9 @@ func udpAddrDebugString(ua net.UDPAddr) string {
// discoEndpoint is a wireguard/conn.Endpoint for new-style peers that
// advertise a DiscoKey and participate in active discovery.
type discoEndpoint struct {
// atomically accessed; declared first for alignment reasons
lastRecvUnixAtomic int64
// These fields are initialized once and never modified.
c *Conn
publicKey tailcfg.NodeKey // peer public key (for WireGuard + DERP)
@ -2966,7 +2968,6 @@ type discoEndpoint struct {
fakeWGAddr netaddr.IPPort // the UDP address we tell wireguard-go we're using
fakeWGAddrStd *net.UDPAddr // the *net.UDPAddr form of fakeWGAddr
wgEndpointHostPort string // string from CreateEndpoint: "<hex-discovery-key>.disco.tailscale:12345"
onRecvActivity func()
// Owned by Conn.mu:
lastPingFrom netaddr.IPPort
@ -3063,6 +3064,19 @@ func (de *discoEndpoint) initFakeUDPAddr() {
de.fakeWGAddrStd = de.fakeWGAddr.UDPAddr()
}
// isFirstRecvActivityInAwhile notes that receive activity has occured for this
// endpoint and reports whether it's been at least 10 seconds since the last
// receive activity (including having never received from this peer before).
func (de *discoEndpoint) isFirstRecvActivityInAwhile() bool {
now := time.Now().Unix()
old := atomic.LoadInt64(&de.lastRecvUnixAtomic)
if old <= now-10 {
atomic.StoreInt64(&de.lastRecvUnixAtomic, now)
return true
}
return false
}
// String exists purely so wireguard-go internals can log.Printf("%v")
// its internal conn.Endpoints and we don't end up with data races
// from fmt (via log) reading mutex fields and such.

View File

@ -22,6 +22,7 @@ import (
"sync"
"testing"
"time"
"unsafe"
"github.com/google/go-cmp/cmp"
"github.com/tailscale/wireguard-go/device"
@ -1322,3 +1323,17 @@ func stringifyConfig(cfg wgcfg.Config) string {
}
return string(j)
}
func TestDiscoEndpointAlignment(t *testing.T) {
var de discoEndpoint
off := unsafe.Offsetof(de.lastRecvUnixAtomic)
if off%8 != 0 {
t.Fatalf("lastRecvUnixAtomic is not 8-byte aligned")
}
if !de.isFirstRecvActivityInAwhile() { // verify this doesn't panic on 32-bit
t.Error("expected true")
}
if de.isFirstRecvActivityInAwhile() {
t.Error("expected false on second call")
}
}

View File

@ -87,6 +87,7 @@ type userspaceEngine struct {
logf logger.Logf
reqCh chan struct{}
waitCh chan struct{} // chan is closed when first Close call completes; contrast with closing bool
timeNow func() time.Time
tundev *tstun.TUN
wgdev *device.Device
router router.Router
@ -94,6 +95,8 @@ type userspaceEngine struct {
magicConn *magicsock.Conn
linkMon *monitor.Mon
testMaybeReconfigHook func() // for tests; if non-nil, fires if maybeReconfigWireguardLocked called
// localAddrs is the set of IP addresses assigned to the local
// tunnel interface. It's used to reflect local packets
// incorrectly sent to us.
@ -199,6 +202,7 @@ func newUserspaceEngineAdvanced(conf EngineConfig) (_ Engine, reterr error) {
logf := conf.Logf
e := &userspaceEngine{
timeNow: time.Now,
logf: logf,
reqCh: make(chan struct{}, 1),
waitCh: make(chan struct{}),
@ -622,12 +626,12 @@ func (e *userspaceEngine) noteReceiveActivity(dk tailcfg.DiscoKey) {
e.wgLock.Lock()
defer e.wgLock.Unlock()
now := time.Now()
was, ok := e.recvActivityAt[dk]
if !ok {
// Not a trimmable peer we care about tracking. (See isTrimmablePeer)
return
}
now := e.timeNow()
e.recvActivityAt[dk] = now
// If the last activity time jumped a bunch (say, at least
@ -636,7 +640,7 @@ func (e *userspaceEngine) noteReceiveActivity(dk tailcfg.DiscoKey) {
// lazyPeerIdleThreshold without the divide by 2, but
// maybeReconfigWireguardLocked is cheap enough to call every
// couple minutes (just not on every packet).
if was.IsZero() || now.Sub(was) < -lazyPeerIdleThreshold/2 {
if was.IsZero() || now.Sub(was) > lazyPeerIdleThreshold/2 {
e.maybeReconfigWireguardLocked()
}
}
@ -677,6 +681,11 @@ func discoKeyFromPeer(p *wgcfg.Peer) tailcfg.DiscoKey {
// e.wgLock must be held.
func (e *userspaceEngine) maybeReconfigWireguardLocked() error {
if hook := e.testMaybeReconfigHook; hook != nil {
hook()
return nil
}
full := e.lastCfgFull
// Compute a minimal config to pass to wireguard-go
@ -689,7 +698,7 @@ func (e *userspaceEngine) maybeReconfigWireguardLocked() error {
// the past 5 minutes. That's more than WireGuard's key
// rotation time anyway so it's no harm if we remove it
// later if it's been inactive.
activeCutoff := time.Now().Add(-lazyPeerIdleThreshold)
activeCutoff := e.timeNow().Add(-lazyPeerIdleThreshold)
// Not all peers can be trimmed from the network map (see
// isTrimmablePeer). For those are are trimmable, keep track
@ -765,7 +774,7 @@ func (e *userspaceEngine) updateActivityMapsLocked(trackDisco []tailcfg.DiscoKey
if fn == nil {
// This is the func that gets run on every outgoing packet for tracked IPs:
fn = func() {
now := time.Now().Unix()
now := e.timeNow().Unix()
old := atomic.LoadInt64(timePtr)
// How long's it been since we last sent a packet?

View File

@ -0,0 +1,88 @@
// Copyright (c) 2020 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package wgengine
import (
"bytes"
"fmt"
"testing"
"time"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
"tailscale.com/wgengine/tstun"
)
func TestNoteReceiveActivity(t *testing.T) {
now := time.Unix(1, 0)
tick := func(d time.Duration) { now = now.Add(d) }
var logBuf bytes.Buffer
confc := make(chan bool, 1)
gotConf := func() bool {
select {
case <-confc:
return true
default:
return false
}
}
e := &userspaceEngine{
timeNow: func() time.Time { return now },
recvActivityAt: map[tailcfg.DiscoKey]time.Time{},
logf: func(format string, a ...interface{}) {
fmt.Fprintf(&logBuf, format, a...)
},
tundev: new(tstun.TUN),
testMaybeReconfigHook: func() { confc <- true },
}
ra := e.recvActivityAt
dk := tailcfg.DiscoKey(key.NewPrivate().Public())
// Activity on an untracked key should do nothing.
e.noteReceiveActivity(dk)
if len(ra) != 0 {
t.Fatalf("unexpected growth in map: now has %d keys; want 0", len(ra))
}
if logBuf.Len() != 0 {
t.Fatalf("unexpected log write (and thus activity): %s", logBuf.Bytes())
}
// Now track it and expect updates.
ra[dk] = time.Time{}
e.noteReceiveActivity(dk)
if len(ra) != 1 {
t.Fatalf("unexpected growth in map: now has %d keys; want 1", len(ra))
}
if got := ra[dk]; got != now {
t.Fatalf("time in map = %v; want %v", got, now)
}
if !gotConf() {
t.Fatalf("didn't get expected reconfig")
}
// With updates 1 second apart, don't expect a reconfig.
for i := 0; i < 300; i++ {
tick(time.Second)
e.noteReceiveActivity(dk)
if len(ra) != 1 {
t.Fatalf("map len = %d; want 1", len(ra))
}
if got := ra[dk]; got != now {
t.Fatalf("time in map = %v; want %v", got, now)
}
if gotConf() {
t.Fatalf("unexpected reconfig")
}
}
// But if there's a big jump it should get an update.
tick(3 * time.Minute)
e.noteReceiveActivity(dk)
if !gotConf() {
t.Fatalf("expected config")
}
}