From cff737786ed9907fa3c9da9d6001b5d6c8f1a315 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Thu, 6 Aug 2020 08:43:48 -0700 Subject: [PATCH] wgengine/magicsock: fix lazy config deadlock, document more lock ordering This removes the atomic bool that tried to track whether we needed to acquire the lock on a future recursive call back into magicsock. Unfortunately that hack doesn't work because we also had a lock ordering issue between magicsock and userspaceEngine (see issue). This documents that too. Fixes #644 --- wgengine/magicsock/magicsock.go | 45 ++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index bd80e67ee..16ef49915 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -128,13 +128,6 @@ type Conn struct { mu sync.Mutex // guards all following fields; see userspaceEngine lock ordering rules muCond *sync.Cond - // canCreateEPUnlocked tracks at one place whether mu is - // already held. It's then checked in CreateEndpoint to avoid - // double-locking mu and thus deadlocking. mu should be held - // while setting this; but can be read without mu held. - // TODO(bradfitz): delete this shameful hack; refactor the one use - canCreateEPUnlocked syncs.AtomicBool - started bool // Start was called closed bool // Close was called @@ -289,8 +282,10 @@ type Options struct { // sole user just doesn't need or want it called on every // packet, just every minute or two for Wireguard timeouts, // and 10 seconds seems like a good trade-off between often - // enough and not too often.) The provided func likely calls - // Conn.CreateEndpoint, which acquires Conn.mu. + // enough and not too often.) The provided func is called while + // holding userspaceEngine.wgLock and likely calls + // Conn.CreateEndpoint, which acquires Conn.mu. As such, you should + // not hold NoteRecvActivity func(tailcfg.DiscoKey) } @@ -1633,16 +1628,26 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool { c.logf("magicsock: [unexpected] have node without endpoint, without c.noteRecvActivity hook") return false } - // noteRecvActivity calls back into CreateEndpoint, which we can't easily control, - // and CreateEndpoint expects to be called with c.mu held, but we hold it here, and - // it's too invasive for now to release it here and recheck invariants. So instead, - // use this unfortunate hack: set canCreateEPUnlocked which CreateEndpoint then - // checks to conditionally acquire the mutex. I'm so sorry. - c.canCreateEPUnlocked.Set(true) + // We can't hold Conn.mu while calling noteRecvActivity. + // noteRecvActivity acquires userspaceEngine.wgLock (and per our + // lock ordering rules: wgLock must come first), and also calls + // back into our Conn.CreateEndpoint, which would double-acquire + // Conn.mu. + c.mu.Unlock() c.noteRecvActivity(sender) - c.canCreateEPUnlocked.Set(false) + c.mu.Lock() // re-acquire + + // Now, recheck invariants that might've changed while we'd + // released the lock, which isn't much: + if c.closed || c.privateKey.IsZero() { + return true + } de, ok = c.endpointOfDisco[sender] if !ok { + if _, ok := c.nodeOfDisco[sender]; !ok { + // They just disappeared while we'd released the lock. + return false + } c.logf("magicsock: [unexpected] lazy endpoint not created for %v, %v", peerNode.Key.ShortString(), sender.ShortString()) return false } @@ -2650,14 +2655,12 @@ func (c *Conn) CreateBind(uint16) (conn.Bind, uint16, error) { // func (c *Conn) CreateEndpoint(pubKey [32]byte, addrs string) (conn.Endpoint, error) { + c.mu.Lock() + defer c.mu.Unlock() + pk := key.Public(pubKey) c.logf("magicsock: CreateEndpoint: key=%s: %s", pk.ShortString(), derpStr(addrs)) - if !c.canCreateEPUnlocked.Get() { // sorry - c.mu.Lock() - defer c.mu.Unlock() - } - if strings.HasSuffix(addrs, controlclient.EndpointDiscoSuffix) { discoHex := strings.TrimSuffix(addrs, controlclient.EndpointDiscoSuffix) discoKey, err := key.NewPublicFromHexMem(mem.S(discoHex))