control/controlclient: improve handling of concurrent lite map requests

Prior to this change, if we were in the middle of a lite map update we'd
tear down the entire map session and restart it. With this change, we'll
cancel an in-flight lite map request up to 10 times and restart before
we tear down the streaming map request. We tear down everything after 10
retries to ensure that a steady stream of calls to sendNewMapRequest
doesn't fail to make progress by repeatedly canceling and restarting.

Signed-off-by: Andrew Dunham <andrew@du.nham.ca>
Co-authored-by: Maisem Ali <maisem@tailscale.com>
Change-Id: I9392bf8cf674e7a58ccd1e476039300a359ef3b1
This commit is contained in:
Andrew Dunham 2023-03-07 16:54:11 -05:00
parent b0cb39cda1
commit 48f6c1eba4
1 changed files with 55 additions and 17 deletions

View File

@ -59,15 +59,17 @@ type Auto struct {
mu sync.Mutex // mutex guards the following fields
paused bool // whether we should stop making HTTP requests
unpauseWaiters []chan struct{}
loggedIn bool // true if currently logged in
loginGoal *LoginGoal // non-nil if some login activity is desired
synced bool // true if our netmap is up-to-date
inPollNetMap bool // true if currently running a PollNetMap
inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding
inSendStatus int // number of sendStatus calls currently in progress
state State
paused bool // whether we should stop making HTTP requests
unpauseWaiters []chan struct{}
loggedIn bool // true if currently logged in
loginGoal *LoginGoal // non-nil if some login activity is desired
synced bool // true if our netmap is up-to-date
inPollNetMap bool // true if currently running a PollNetMap
inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding
liteMapUpdateCancel func() // cancels a lite map update
liteMapUpdateCancels int // how many times we've canceled a lite map update
inSendStatus int // number of sendStatus calls currently in progress
state State
authCtx context.Context // context used for auth requests
mapCtx context.Context // context used for netmap requests
@ -168,28 +170,55 @@ func (c *Auto) Start() {
func (c *Auto) sendNewMapRequest() {
c.mu.Lock()
// If we're not already streaming a netmap, or if we're already stuck
// in a lite update, then tear down everything and start a new stream
// (which starts by sending a new map request)
if !c.inPollNetMap || c.inLiteMapUpdate || !c.loggedIn {
// If we're not already streaming a netmap, then tear down everything
// and start a new stream (which starts by sending a new map request)
if !c.inPollNetMap || !c.loggedIn {
c.mu.Unlock()
c.cancelMapSafely()
return
}
// If we are already in process of doing a LiteMapUpdate, cancel it and
// try a new one. If this is the 10th time we have done this
// cancelation, tear down everything and start again
if c.inLiteMapUpdate {
// Always cancel the in-flight lite map update, regardless of
// whether we cancel the streaming map request or not.
c.liteMapUpdateCancel()
c.inLiteMapUpdate = false
if c.liteMapUpdateCancels > 10 {
// Not making progress
c.mu.Unlock()
c.cancelMapSafely()
return
}
// Increment our cancel counter and continue below to start a
// new lite update.
c.liteMapUpdateCancels++
}
// Otherwise, send a lite update that doesn't keep a
// long-running stream response.
defer c.mu.Unlock()
c.inLiteMapUpdate = true
ctx, cancel := context.WithTimeout(c.mapCtx, 10*time.Second)
c.liteMapUpdateCancel = cancel
go func() {
defer cancel()
t0 := time.Now()
err := c.direct.SendLiteMapUpdate(ctx)
d := time.Since(t0).Round(time.Millisecond)
c.mu.Lock()
c.inLiteMapUpdate = false
c.liteMapUpdateCancel = nil
if err == nil {
c.liteMapUpdateCancels = 0
}
c.mu.Unlock()
if err == nil {
c.logf("[v1] successful lite map update in %v", d)
return
@ -197,10 +226,13 @@ func (c *Auto) sendNewMapRequest() {
if ctx.Err() == nil {
c.logf("lite map update after %v: %v", d, err)
}
// Fall back to restarting the long-polling map
// request (the old heavy way) if the lite update
// failed for any reason.
c.cancelMapSafely()
if !errors.Is(ctx.Err(), context.Canceled) {
// Fall back to restarting the long-polling map
// request (the old heavy way) if the lite update
// failed for reasons other than the context being
// canceled.
c.cancelMapSafely()
}
}()
}
@ -237,6 +269,12 @@ func (c *Auto) cancelMapSafely() {
c.mu.Lock()
defer c.mu.Unlock()
// Always reset our lite map cancels counter if we're canceling
// everything, since we're about to restart with a new map update; this
// allows future calls to sendNewMapRequest to retry sending lite
// updates.
c.liteMapUpdateCancels = 0
c.logf("[v1] cancelMapSafely: synced=%v", c.synced)
if c.inPollNetMap {