wgengine,magicsock: fix two lazy wireguard config issues

1) we weren't waking up a discoEndpoint that once existed and went idle for 5 minutes and then got a disco message again. 2) userspaceEngine.noteReceiveActivity had a buggy check; fixed and added a test
2020-08-06 14:57:03 -07:00 · 2020-08-06 14:57:03 -07:00 · 9a346fd8b4
parent 78c2e1ff83
commit 9a346fd8b4
4 changed files with 153 additions and 27 deletions
--- a/wgengine/magicsock/magicsock.go
+++ b/wgengine/magicsock/magicsock.go
@ -1360,13 +1360,15 @@ func wgRecvAddr(e conn.Endpoint, ipp netaddr.IPPort, addr *net.UDPAddr) *net.UDP
 	return ipp.UDPAddr()
 }

-// noteRecvActivity calls the magicsock.Conn.noteRecvActivity hook if
-// e is a discovery-capable peer.
+// noteRecvActivityFromEndpoint calls the c.noteRecvActivity hook if
+// e is a discovery-capable peer and this is the first receive activity
+// it's got in awhile (in last 10 seconds).
 //
 // This should be called whenever a packet arrives from e.
-func noteRecvActivity(e conn.Endpoint) {
-	if de, ok := e.(*discoEndpoint); ok {
-		de.onRecvActivity()
+func (c *Conn) noteRecvActivityFromEndpoint(e conn.Endpoint) {
+	de, ok := e.(*discoEndpoint)
+	if ok && c.noteRecvActivity != nil && de.isFirstRecvActivityInAwhile() {
+		c.noteRecvActivity(de.discoKey)
 	}
 }

@ -1377,7 +1379,7 @@ Top:
 		c.bufferedIPv4From = netaddr.IPPort{}
 		addr = from.UDPAddr()
 		ep := c.findEndpoint(from, addr)
-		noteRecvActivity(ep)
+		c.noteRecvActivityFromEndpoint(ep)
 		return copy(b, c.bufferedIPv4Packet), ep, wgRecvAddr(ep, from, addr), nil
 	}

@ -1489,7 +1491,7 @@ Top:
 		ep = c.findEndpoint(ipp, addr)
 	}
 	if !didNoteRecvActivity {
-		noteRecvActivity(ep)
+		c.noteRecvActivityFromEndpoint(ep)
 	}
 	return n, ep, wgRecvAddr(ep, ipp, addr), nil
 }
@ -1517,7 +1519,7 @@ func (c *Conn) ReceiveIPv6(b []byte) (int, conn.Endpoint, *net.UDPAddr, error) {
 		}

 		ep := c.findEndpoint(ipp, addr)
-		noteRecvActivity(ep)
+		c.noteRecvActivityFromEndpoint(ep)
 		return n, ep, wgRecvAddr(ep, ipp, addr), nil
 	}
 }
@ -1615,8 +1617,9 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
 		return false
 	}

-	de, ok := c.endpointOfDisco[sender]
-	if !ok {
+	needsRecvActivityCall := false
+	de, endpointFound0 := c.endpointOfDisco[sender]
+	if !endpointFound0 {
 		// We don't have an active endpoint for this sender but we knew about the node, so
 		// it's an idle endpoint that doesn't yet exist in the wireguard config. We now have
 		// to notify the userspace engine (via noteRecvActivity) so wireguard-go can create
@ -1628,6 +1631,11 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
 			c.logf("magicsock: [unexpected] have node without endpoint, without c.noteRecvActivity hook")
 			return false
 		}
+		needsRecvActivityCall = true
+	} else {
+		needsRecvActivityCall = de.isFirstRecvActivityInAwhile()
+	}
+	if needsRecvActivityCall && c.noteRecvActivity != nil {
 		// We can't hold Conn.mu while calling noteRecvActivity.
 		// noteRecvActivity acquires userspaceEngine.wgLock (and per our
 		// lock ordering rules: wgLock must come first), and also calls
@ -1651,7 +1659,9 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
 			c.logf("magicsock: [unexpected] lazy endpoint not created for %v, %v", peerNode.Key.ShortString(), sender.ShortString())
 			return false
 		}
-		c.logf("magicsock: lazy endpoint created via disco message for %v, %v", peerNode.Key.ShortString(), sender.ShortString())
+		if !endpointFound0 {
+			c.logf("magicsock: lazy endpoint created via disco message for %v, %v", peerNode.Key.ShortString(), sender.ShortString())
+		}
 	}

 	// First, do we even know (and thus care) about this sender? If not,
@ -2676,17 +2686,6 @@ func (c *Conn) CreateEndpoint(pubKey [32]byte, addrs string) (conn.Endpoint, err
 			sentPing:           map[stun.TxID]sentPing{},
 			endpointState:      map[netaddr.IPPort]*endpointState{},
 		}
-		lastRecvTime := new(int64) // atomic
-		de.onRecvActivity = func() {
-			now := time.Now().Unix()
-			old := atomic.LoadInt64(lastRecvTime)
-			if old == 0 || old <= now-10 {
-				atomic.StoreInt64(lastRecvTime, now)
-				if c.noteRecvActivity != nil {
-					c.noteRecvActivity(de.discoKey)
-				}
-			}
-		}
 		de.initFakeUDPAddr()
 		de.updateFromNode(c.nodeOfDisco[de.discoKey])
 		c.endpointOfDisco[de.discoKey] = de
@ -2958,6 +2957,9 @@ func udpAddrDebugString(ua net.UDPAddr) string {
 // discoEndpoint is a wireguard/conn.Endpoint for new-style peers that
 // advertise a DiscoKey and participate in active discovery.
 type discoEndpoint struct {
+	// atomically accessed; declared first for alignment reasons
+	lastRecvUnixAtomic int64
+
 	// These fields are initialized once and never modified.
 	c                  *Conn
 	publicKey          tailcfg.NodeKey  // peer public key (for WireGuard + DERP)
@ -2966,7 +2968,6 @@ type discoEndpoint struct {
 	fakeWGAddr         netaddr.IPPort   // the UDP address we tell wireguard-go we're using
 	fakeWGAddrStd      *net.UDPAddr     // the *net.UDPAddr form of fakeWGAddr
 	wgEndpointHostPort string           // string from CreateEndpoint: "<hex-discovery-key>.disco.tailscale:12345"
-	onRecvActivity     func()

 	// Owned by Conn.mu:
 	lastPingFrom netaddr.IPPort
@ -3063,6 +3064,19 @@ func (de *discoEndpoint) initFakeUDPAddr() {
 	de.fakeWGAddrStd = de.fakeWGAddr.UDPAddr()
 }

+// isFirstRecvActivityInAwhile notes that receive activity has occured for this
+// endpoint and reports whether it's been at least 10 seconds since the last
+// receive activity (including having never received from this peer before).
+func (de *discoEndpoint) isFirstRecvActivityInAwhile() bool {
+	now := time.Now().Unix()
+	old := atomic.LoadInt64(&de.lastRecvUnixAtomic)
+	if old <= now-10 {
+		atomic.StoreInt64(&de.lastRecvUnixAtomic, now)
+		return true
+	}
+	return false
+}
+
 // String exists purely so wireguard-go internals can log.Printf("%v")
 // its internal conn.Endpoints and we don't end up with data races
 // from fmt (via log) reading mutex fields and such.
--- a/wgengine/magicsock/magicsock_test.go
+++ b/wgengine/magicsock/magicsock_test.go
@ -22,6 +22,7 @@ import (
 	"sync"
 	"testing"
 	"time"
+	"unsafe"

 	"github.com/google/go-cmp/cmp"
 	"github.com/tailscale/wireguard-go/device"
@ -1322,3 +1323,17 @@ func stringifyConfig(cfg wgcfg.Config) string {
 	}
 	return string(j)
 }
+
+func TestDiscoEndpointAlignment(t *testing.T) {
+	var de discoEndpoint
+	off := unsafe.Offsetof(de.lastRecvUnixAtomic)
+	if off%8 != 0 {
+		t.Fatalf("lastRecvUnixAtomic is not 8-byte aligned")
+	}
+	if !de.isFirstRecvActivityInAwhile() { // verify this doesn't panic on 32-bit
+		t.Error("expected true")
+	}
+	if de.isFirstRecvActivityInAwhile() {
+		t.Error("expected false on second call")
+	}
+}
--- a/wgengine/userspace.go
+++ b/wgengine/userspace.go
@ -87,6 +87,7 @@ type userspaceEngine struct {
 	logf      logger.Logf
 	reqCh     chan struct{}
 	waitCh    chan struct{} // chan is closed when first Close call completes; contrast with closing bool
+	timeNow   func() time.Time
 	tundev    *tstun.TUN
 	wgdev     *device.Device
 	router    router.Router
@ -94,6 +95,8 @@ type userspaceEngine struct {
 	magicConn *magicsock.Conn
 	linkMon   *monitor.Mon

+	testMaybeReconfigHook func() // for tests; if non-nil, fires if maybeReconfigWireguardLocked called
+
 	// localAddrs is the set of IP addresses assigned to the local
 	// tunnel interface. It's used to reflect local packets
 	// incorrectly sent to us.
@ -199,6 +202,7 @@ func newUserspaceEngineAdvanced(conf EngineConfig) (_ Engine, reterr error) {
 	logf := conf.Logf

 	e := &userspaceEngine{
+		timeNow:  time.Now,
 		logf:     logf,
 		reqCh:    make(chan struct{}, 1),
 		waitCh:   make(chan struct{}),
@ -622,12 +626,12 @@ func (e *userspaceEngine) noteReceiveActivity(dk tailcfg.DiscoKey) {
 	e.wgLock.Lock()
 	defer e.wgLock.Unlock()

-	now := time.Now()
 	was, ok := e.recvActivityAt[dk]
 	if !ok {
 		// Not a trimmable peer we care about tracking. (See isTrimmablePeer)
 		return
 	}
+	now := e.timeNow()
 	e.recvActivityAt[dk] = now

 	// If the last activity time jumped a bunch (say, at least
@ -636,7 +640,7 @@ func (e *userspaceEngine) noteReceiveActivity(dk tailcfg.DiscoKey) {
 	// lazyPeerIdleThreshold without the divide by 2, but
 	// maybeReconfigWireguardLocked is cheap enough to call every
 	// couple minutes (just not on every packet).
-	if was.IsZero() || now.Sub(was) < -lazyPeerIdleThreshold/2 {
+	if was.IsZero() || now.Sub(was) > lazyPeerIdleThreshold/2 {
 		e.maybeReconfigWireguardLocked()
 	}
 }
@ -677,6 +681,11 @@ func discoKeyFromPeer(p *wgcfg.Peer) tailcfg.DiscoKey {

 // e.wgLock must be held.
 func (e *userspaceEngine) maybeReconfigWireguardLocked() error {
+	if hook := e.testMaybeReconfigHook; hook != nil {
+		hook()
+		return nil
+	}
+
 	full := e.lastCfgFull

 	// Compute a minimal config to pass to wireguard-go
@ -689,7 +698,7 @@ func (e *userspaceEngine) maybeReconfigWireguardLocked() error {
 	// the past 5 minutes. That's more than WireGuard's key
 	// rotation time anyway so it's no harm if we remove it
 	// later if it's been inactive.
-	activeCutoff := time.Now().Add(-lazyPeerIdleThreshold)
+	activeCutoff := e.timeNow().Add(-lazyPeerIdleThreshold)

 	// Not all peers can be trimmed from the network map (see
 	// isTrimmablePeer).  For those are are trimmable, keep track
@ -765,7 +774,7 @@ func (e *userspaceEngine) updateActivityMapsLocked(trackDisco []tailcfg.DiscoKey
 		if fn == nil {
 			// This is the func that gets run on every outgoing packet for tracked IPs:
 			fn = func() {
-				now := time.Now().Unix()
+				now := e.timeNow().Unix()
 				old := atomic.LoadInt64(timePtr)

 				// How long's it been since we last sent a packet?
--- a/wgengine/userspace_test.go
+++ b/wgengine/userspace_test.go
@ -0,0 +1,88 @@
+// Copyright (c) 2020 Tailscale Inc & AUTHORS All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package wgengine
+
+import (
+	"bytes"
+	"fmt"
+	"testing"
+	"time"
+
+	"tailscale.com/tailcfg"
+	"tailscale.com/types/key"
+	"tailscale.com/wgengine/tstun"
+)
+
+func TestNoteReceiveActivity(t *testing.T) {
+	now := time.Unix(1, 0)
+	tick := func(d time.Duration) { now = now.Add(d) }
+	var logBuf bytes.Buffer
+
+	confc := make(chan bool, 1)
+	gotConf := func() bool {
+		select {
+		case <-confc:
+			return true
+		default:
+			return false
+		}
+	}
+	e := &userspaceEngine{
+		timeNow:        func() time.Time { return now },
+		recvActivityAt: map[tailcfg.DiscoKey]time.Time{},
+		logf: func(format string, a ...interface{}) {
+			fmt.Fprintf(&logBuf, format, a...)
+		},
+		tundev:                new(tstun.TUN),
+		testMaybeReconfigHook: func() { confc <- true },
+	}
+	ra := e.recvActivityAt
+
+	dk := tailcfg.DiscoKey(key.NewPrivate().Public())
+
+	// Activity on an untracked key should do nothing.
+	e.noteReceiveActivity(dk)
+	if len(ra) != 0 {
+		t.Fatalf("unexpected growth in map: now has %d keys; want 0", len(ra))
+	}
+	if logBuf.Len() != 0 {
+		t.Fatalf("unexpected log write (and thus activity): %s", logBuf.Bytes())
+	}
+
+	// Now track it and expect updates.
+	ra[dk] = time.Time{}
+	e.noteReceiveActivity(dk)
+	if len(ra) != 1 {
+		t.Fatalf("unexpected growth in map: now has %d keys; want 1", len(ra))
+	}
+	if got := ra[dk]; got != now {
+		t.Fatalf("time in map = %v; want %v", got, now)
+	}
+	if !gotConf() {
+		t.Fatalf("didn't get expected reconfig")
+	}
+
+	// With updates 1 second apart, don't expect a reconfig.
+	for i := 0; i < 300; i++ {
+		tick(time.Second)
+		e.noteReceiveActivity(dk)
+		if len(ra) != 1 {
+			t.Fatalf("map len = %d; want 1", len(ra))
+		}
+		if got := ra[dk]; got != now {
+			t.Fatalf("time in map = %v; want %v", got, now)
+		}
+		if gotConf() {
+			t.Fatalf("unexpected reconfig")
+		}
+	}
+
+	// But if there's a big jump it should get an update.
+	tick(3 * time.Minute)
+	e.noteReceiveActivity(dk)
+	if !gotConf() {
+		t.Fatalf("expected config")
+	}
+}