ipn/ipnlocal: add advertised and primary route metrics

Updates tailscale/corp#22075

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
Kristoffer Dalby 2024-09-25 16:50:34 +02:00 committed by Kristoffer Dalby
parent cab2e6ea67
commit 77832553e5
3 changed files with 91 additions and 1 deletions

View File

@ -390,9 +390,18 @@ type updateStatus struct {
}
// metrics groups the user-facing route gauges maintained by the local backend.
type metrics struct {
// advertisedRoutes is a metric that reports the number of network routes that are advertised by the local node.
// This informs the user of how many routes are being advertised by the local node, excluding exit routes.
advertisedRoutes *usermetric.Gauge
// approvedRoutes is a metric that reports the number of network routes served by the local node and approved
// by the control server. It is computed from the self node's AllowedIPs, skipping the node's own addresses
// and exit routes, and is reset to 0 when there is no netmap.
approvedRoutes *usermetric.Gauge
// primaryRoutes is a metric that reports the number of primary network routes served by the local node.
// A route being a primary route implies that the route is currently served by this node, and not by another
// subnet router in a high availability configuration. Exit routes are not counted, and the gauge is reset
// to 0 when there is no netmap.
primaryRoutes *usermetric.Gauge
}
// clientGen is a func that creates a control plane client.
@ -441,6 +450,10 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
m := metrics{
advertisedRoutes: sys.UserMetricsRegistry().NewGauge(
"tailscaled_advertised_routes", "Number of advertised network routes (e.g. by a subnet router)"),
approvedRoutes: sys.UserMetricsRegistry().NewGauge(
"tailscaled_approved_routes", "Number of approved network routes (e.g. by a subnet router)"),
primaryRoutes: sys.UserMetricsRegistry().NewGauge(
"tailscaled_primary_routes", "Number of network routes for which this node is a primary router (in high availability configuration)"),
}
b := &LocalBackend{
@ -5388,6 +5401,11 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
b.setTCPPortsInterceptedFromNetmapAndPrefsLocked(b.pm.CurrentPrefs())
if nm == nil {
b.nodeByAddr = nil
// If there is no netmap, the client is going into a "turned off"
// state so reset the metrics.
b.metrics.approvedRoutes.Set(0)
b.metrics.primaryRoutes.Set(0)
return
}
@ -5408,6 +5426,15 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
}
if nm.SelfNode.Valid() {
addNode(nm.SelfNode)
var approved float64
for _, route := range nm.SelfNode.AllowedIPs().All() {
if !views.SliceContains(nm.SelfNode.Addresses(), route) && !tsaddr.IsExitRoute(route) {
approved++
}
}
b.metrics.approvedRoutes.Set(approved)
b.metrics.primaryRoutes.Set(float64(tsaddr.WithoutExitRoute(nm.SelfNode.PrimaryRoutes()).Len()))
}
for _, p := range nm.Peers {
addNode(p)

View File

@ -26,6 +26,7 @@ import (
"os"
"path/filepath"
"reflect"
"runtime"
"strings"
"sync"
"sync/atomic"
@ -924,6 +925,32 @@ func TestUserMetrics(t *testing.T) {
s1.lb.DebugForceNetmapUpdate()
s2.lb.DebugForceNetmapUpdate()
wantRoutes := float64(2)
if runtime.GOOS == "windows" {
wantRoutes = 0
}
// Wait for the routes to be propagated to node 1 to ensure
// that the metrics are up-to-date.
waitForCondition(t, "primary routes available for node1", 90*time.Second, func() bool {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
status1, err := lc1.Status(ctx)
if err != nil {
t.Logf("getting status: %s", err)
return false
}
if runtime.GOOS == "windows" {
// Windows does not seem to support or report back routes when running in
// userspace via tsnet. So, we skip this check on Windows.
// TODO(kradalby): Figure out if this is correct.
return true
}
// Wait for the primary routes to reach our desired routes, which is wantRoutes + 1, because
// the PrimaryRoutes list will contain an exit node route, which the metric does not count.
return status1.Self.PrimaryRoutes != nil && status1.Self.PrimaryRoutes.Len() == int(wantRoutes)+1
})
ctxLc, cancelLc := context.WithTimeout(context.Background(), 5*time.Second)
defer cancelLc()
metrics1, err := lc1.UserMetrics(ctxLc)
@ -951,11 +978,25 @@ func TestUserMetrics(t *testing.T) {
t.Errorf("metrics1, tailscaled_advertised_routes: got %v, want %v", got, want)
}
// The control has approved 2 routes:
// - 192.0.2.0/24
// - 192.0.5.1/32
if got, want := parsedMetrics1["tailscaled_approved_routes"], wantRoutes; got != want {
t.Errorf("metrics1, tailscaled_approved_routes: got %v, want %v", got, want)
}
// Validate the health counter metric against the status of the node
if got, want := parsedMetrics1[`tailscaled_health_messages{type="warning"}`], float64(len(status1.Health)); got != want {
t.Errorf("metrics1, tailscaled_health_messages: got %v, want %v", got, want)
}
// The node is the primary subnet router for 2 routes:
// - 192.0.2.0/24
// - 192.0.5.1/32
if got, want := parsedMetrics1["tailscaled_primary_routes"], wantRoutes; got != want {
t.Errorf("metrics1, tailscaled_primary_routes: got %v, want %v", got, want)
}
metrics2, err := lc2.UserMetrics(ctx)
if err != nil {
t.Fatal(err)
@ -978,8 +1019,28 @@ func TestUserMetrics(t *testing.T) {
t.Errorf("metrics2, tailscaled_advertised_routes: got %v, want %v", got, want)
}
// The control has approved 0 routes
if got, want := parsedMetrics2["tailscaled_approved_routes"], 0.0; got != want {
t.Errorf("metrics2, tailscaled_approved_routes: got %v, want %v", got, want)
}
// Validate the health counter metric against the status of the node
if got, want := parsedMetrics2[`tailscaled_health_messages{type="warning"}`], float64(len(status2.Health)); got != want {
t.Errorf("metrics2, tailscaled_health_messages: got %v, want %v", got, want)
}
// The node is the primary subnet router for 0 routes
if got, want := parsedMetrics2["tailscaled_primary_routes"], 0.0; got != want {
t.Errorf("metrics2, tailscaled_primary_routes: got %v, want %v", got, want)
}
}
// waitForCondition polls f about once per second until it reports true or
// waitTime has elapsed. If the condition is never met, the test is failed
// with msg.
//
// f is always evaluated at least once, even when waitTime is zero or
// negative, so an already-satisfied condition never fails spuriously. The
// final sleep is clamped to the time remaining, so the helper does not block
// past the deadline and the condition gets one last check when it expires.
func waitForCondition(t *testing.T, msg string, waitTime time.Duration, f func() bool) {
	t.Helper()

	const pollInterval = 1 * time.Second
	deadline := time.Now().Add(waitTime)
	for {
		if f() {
			return
		}
		remaining := time.Until(deadline)
		if remaining <= 0 {
			// Out of time and the most recent check failed.
			break
		}
		// Never sleep longer than what is left of the budget.
		pause := pollInterval
		if remaining < pause {
			pause = remaining
		}
		time.Sleep(pause)
	}
	t.Fatalf("waiting for condition: %s", msg)
}

View File

@ -366,6 +366,7 @@ func (s *Server) serveMachine(w http.ResponseWriter, r *http.Request) {
// SetSubnetRoutes records routes as the subnet routes associated with the
// node identified by nodeKey and logs the change. The update is performed
// while holding s.mu.
//
// The routes slice is stored as given (not copied); MapResponse later reads
// the stored value to fill in the node's PrimaryRoutes and AllowedIPs.
func (s *Server) SetSubnetRoutes(nodeKey key.NodePublic, routes []netip.Prefix) {
s.mu.Lock()
defer s.mu.Unlock()
s.logf("Setting subnet routes for %s: %v", nodeKey.ShortString(), routes)
// NOTE(review): mak.Set presumably allocates s.nodeSubnetRoutes on first use — confirm.
mak.Set(&s.nodeSubnetRoutes, nodeKey, routes)
}
@ -1018,6 +1019,7 @@ func (s *Server) MapResponse(req *tailcfg.MapRequest) (res *tailcfg.MapResponse,
s.mu.Lock()
defer s.mu.Unlock()
res.Node.PrimaryRoutes = s.nodeSubnetRoutes[nk]
res.Node.AllowedIPs = append(res.Node.Addresses, s.nodeSubnetRoutes[nk]...)
// Consume a PingRequest while protected by mutex if it exists