tstest/integration: add more debugging, logs to catch flaky test

Updates #11962 Change-Id: I1ab0db69bdf8d1d535aa2cef434c586311f0fe18 Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2024-05-06 14:08:28 -07:00 · 2024-05-06 14:08:28 -07:00 · 8bf773c06c
parent ce8969d82b
commit 8bf773c06c
2 changed files with 48 additions and 12 deletions
--- a/control/controlclient/direct.go
+++ b/control/controlclient/direct.go
@ -398,6 +398,9 @@ func (c *Direct) TryLogout(ctx context.Context) error {
 }

 func (c *Direct) TryLogin(ctx context.Context, t *tailcfg.Oauth2Token, flags LoginFlags) (url string, err error) {
+	if strings.Contains(c.serverURL, "controlplane.tailscale.com") && envknob.Bool("TS_PANIC_IF_HIT_MAIN_CONTROL") {
+		panic("[unexpected] controlclient: TryLogin called on " + c.serverURL)
+	}
 	c.logf("[v1] direct.TryLogin(token=%v, flags=%v)", t != nil, flags)
 	return c.doLoginOrRegen(ctx, loginOpt{Token: t, Flags: flags})
 }
--- a/tstest/integration/integration_test.go
+++ b/tstest/integration/integration_test.go
@ -357,6 +357,32 @@ func TestTwoNodes(t *testing.T) {
 	n2SocksAddrCh := n2.socks5AddrChan()
 	d2 := n2.StartDaemon()

+	// Drop some logs to disk on test failure.
+	//
+	// TODO(bradfitz): make all nodes for all tests do this? give each node a
+	// unique integer within the test? But for now only do this test because
+	// this is what we often saw flaking.
+	t.Cleanup(func() {
+		if !t.Failed() {
+			return
+		}
+		n1.mu.Lock()
+		n2.mu.Lock()
+		defer n1.mu.Unlock()
+		defer n2.mu.Unlock()
+
+		rxNoDates := regexp.MustCompile(`(?m)^\d{4}.\d{2}.\d{2}.\d{2}:\d{2}:\d{2}`)
+		cleanLog := func(n *testNode) []byte {
+			b := n.tailscaledParser.allBuf.Bytes()
+			b = rxNoDates.ReplaceAll(b, nil)
+			return b
+		}
+
+		t.Logf("writing tailscaled logs to n1.log and n2.log")
+		os.WriteFile("n1.log", cleanLog(n1), 0666)
+		os.WriteFile("n2.log", cleanLog(n2), 0666)
+	})
+
 	n1Socks := n1.AwaitSocksAddr(n1SocksAddrCh)
 	n2Socks := n1.AwaitSocksAddr(n2SocksAddrCh)
 	t.Logf("node1 SOCKS5 addr: %v", n1Socks)
@ -1047,7 +1073,8 @@ func newTestEnv(t testing.TB, opts ...testEnvOpt) *testEnv {
 // Currently, the test is simplistic and user==node==machine.
 // That may grow complexity later to test more.
 type testNode struct {
-	env *testEnv
+	env              *testEnv
+	tailscaledParser *nodeOutputParser

 	dir        string // temp dir for sock & state
 	configFile string // or empty for none
@ -1178,19 +1205,25 @@ func (n *testNode) AwaitSocksAddr(ch <-chan string) string {
 // per-line callbacks previously registered via
 // testNode.addLogLineHook.
 type nodeOutputParser struct {
-	buf bytes.Buffer
-	n   *testNode
+	allBuf      bytes.Buffer
+	pendLineBuf bytes.Buffer
+	n           *testNode
 }

 func (op *nodeOutputParser) Write(p []byte) (n int, err error) {
-	n, err = op.buf.Write(p)
-	op.parseLines()
+	tn := op.n
+	tn.mu.Lock()
+	defer tn.mu.Unlock()
+
+	op.allBuf.Write(p)
+	n, err = op.pendLineBuf.Write(p)
+	op.parseLinesLocked()
 	return
 }

-func (op *nodeOutputParser) parseLines() {
+func (op *nodeOutputParser) parseLinesLocked() {
 	n := op.n
-	buf := op.buf.Bytes()
+	buf := op.pendLineBuf.Bytes()
 	for len(buf) > 0 {
 		nl := bytes.IndexByte(buf, '\n')
 		if nl == -1 {
@ -1199,16 +1232,14 @@ func (op *nodeOutputParser) parseLines() {
 		line := buf[:nl+1]
 		buf = buf[nl+1:]

-		n.mu.Lock()
 		for _, f := range n.onLogLine {
 			f(line)
 		}
-		n.mu.Unlock()
 	}
 	if len(buf) == 0 {
-		op.buf.Reset()
+		op.pendLineBuf.Reset()
 	} else {
-		io.CopyN(io.Discard, &op.buf, int64(op.buf.Len()-len(buf)))
+		io.CopyN(io.Discard, &op.pendLineBuf, int64(op.pendLineBuf.Len()-len(buf)))
 	}
 }

@ -1263,11 +1294,13 @@ func (n *testNode) StartDaemonAsIPNGOOS(ipnGOOS string) *Daemon {
 		"TS_ASSUME_NETWORK_UP_FOR_TEST=1", // don't pause control client in airplane mode (no wifi, etc)
 		"TS_PANIC_IF_HIT_MAIN_CONTROL=1",
 		"TS_DISABLE_PORTMAPPER=1", // shouldn't be needed; test is all localhost
+		"TS_DEBUG_LOG_RATE=all",
 	)
 	if version.IsRace() {
 		cmd.Env = append(cmd.Env, "GORACE=halt_on_error=1")
 	}
-	cmd.Stderr = &nodeOutputParser{n: n}
+	n.tailscaledParser = &nodeOutputParser{n: n}
+	cmd.Stderr = n.tailscaledParser
 	if *verboseTailscaled {
 		cmd.Stdout = os.Stdout
 		cmd.Stderr = io.MultiWriter(cmd.Stderr, os.Stderr)