prober: add an HTTP endpoint for triggering a probe

- Keep track of the last 10 probe results and successful probe
  latencies;
- Add an HTTP handler that triggers a given probe by name and returns it
  result as a plaintext HTML page, showing recent probe results as a
  baseline

Updates tailscale/corp#20583

Signed-off-by: Anton Tolchanov <anton@tailscale.com>
This commit is contained in:
Anton Tolchanov 2024-07-30 23:06:39 +01:00 committed by Anton Tolchanov
parent 227509547f
commit 153a476957
2 changed files with 311 additions and 40 deletions

View File

@ -7,19 +7,25 @@
package prober package prober
import ( import (
"container/ring"
"context" "context"
"errors"
"fmt" "fmt"
"hash/fnv" "hash/fnv"
"log" "log"
"maps" "maps"
"math/rand" "math/rand"
"net/http"
"sync" "sync"
"time" "time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"tailscale.com/tsweb"
) )
// recentHistSize is the number of recent probe results and latencies to keep
// in memory.
const recentHistSize = 10
// ProbeClass defines a probe of a specific type: a probing function that will // ProbeClass defines a probe of a specific type: a probing function that will
// be regularly ran, and metric labels that will be added automatically to all // be regularly ran, and metric labels that will be added automatically to all
// probes using this class. // probes using this class.
@ -106,6 +112,14 @@ func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc Prob
l[k] = v l[k] = v
} }
probe := newProbe(p, name, interval, l, pc)
p.probes[name] = probe
go probe.loop()
return probe
}
// newProbe creates a new Probe with the given parameters, but does not start it.
func newProbe(p *Prober, name string, interval time.Duration, l prometheus.Labels, pc ProbeClass) *Probe {
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
probe := &Probe{ probe := &Probe{
prober: p, prober: p,
@ -117,6 +131,9 @@ func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc Prob
probeClass: pc, probeClass: pc,
interval: interval, interval: interval,
initialDelay: initialDelay(name, interval), initialDelay: initialDelay(name, interval),
successHist: ring.New(recentHistSize),
latencyHist: ring.New(recentHistSize),
metrics: prometheus.NewRegistry(), metrics: prometheus.NewRegistry(),
metricLabels: l, metricLabels: l,
mInterval: prometheus.NewDesc("interval_secs", "Probe interval in seconds", nil, l), mInterval: prometheus.NewDesc("interval_secs", "Probe interval in seconds", nil, l),
@ -131,15 +148,14 @@ func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc Prob
Name: "seconds_total", Help: "Total amount of time spent executing the probe", ConstLabels: l, Name: "seconds_total", Help: "Total amount of time spent executing the probe", ConstLabels: l,
}, []string{"status"}), }, []string{"status"}),
} }
if p.metrics != nil {
prometheus.WrapRegistererWithPrefix(p.namespace+"_", p.metrics).MustRegister(probe.metrics) prometheus.WrapRegistererWithPrefix(p.namespace+"_", p.metrics).MustRegister(probe.metrics)
}
probe.metrics.MustRegister(probe) probe.metrics.MustRegister(probe)
p.probes[name] = probe
go probe.loop()
return probe return probe
} }
// unregister removes a probe from the prober's internal state.
func (p *Prober) unregister(probe *Probe) { func (p *Prober) unregister(probe *Probe) {
p.mu.Lock() p.mu.Lock()
defer p.mu.Unlock() defer p.mu.Unlock()
@ -206,6 +222,7 @@ type Probe struct {
ctx context.Context ctx context.Context
cancel context.CancelFunc // run to initiate shutdown cancel context.CancelFunc // run to initiate shutdown
stopped chan struct{} // closed when shutdown is complete stopped chan struct{} // closed when shutdown is complete
runMu sync.Mutex // ensures only one probe runs at a time
name string name string
probeClass ProbeClass probeClass ProbeClass
@ -232,6 +249,10 @@ type Probe struct {
latency time.Duration // last successful probe latency latency time.Duration // last successful probe latency
succeeded bool // whether the last doProbe call succeeded succeeded bool // whether the last doProbe call succeeded
lastErr error lastErr error
// History of recent probe results and latencies.
successHist *ring.Ring
latencyHist *ring.Ring
} }
// Close shuts down the Probe and unregisters it from its Prober. // Close shuts down the Probe and unregisters it from its Prober.
@ -278,13 +299,17 @@ func (p *Probe) loop() {
} }
} }
// run invokes fun and records the results. // run invokes the probe function and records the result. It returns the probe
// result and an error if the probe failed.
// //
// fun is invoked with a timeout slightly less than interval, so that // The probe function is invoked with a timeout slightly less than interval, so
// the probe either succeeds or fails before the next cycle is // that the probe either succeeds or fails before the next cycle is scheduled to
// scheduled to start. // start.
func (p *Probe) run() { func (p *Probe) run() (pi ProbeInfo, err error) {
start := p.recordStart() p.runMu.Lock()
defer p.runMu.Unlock()
p.recordStart()
defer func() { defer func() {
// Prevent a panic within one probe function from killing the // Prevent a panic within one probe function from killing the
// entire prober, so that a single buggy probe doesn't destroy // entire prober, so that a single buggy probe doesn't destroy
@ -293,29 +318,30 @@ func (p *Probe) run() {
// alert for debugging. // alert for debugging.
if r := recover(); r != nil { if r := recover(); r != nil {
log.Printf("probe %s panicked: %v", p.name, r) log.Printf("probe %s panicked: %v", p.name, r)
p.recordEnd(start, errors.New("panic")) err = fmt.Errorf("panic: %v", r)
p.recordEnd(err)
} }
}() }()
timeout := time.Duration(float64(p.interval) * 0.8) timeout := time.Duration(float64(p.interval) * 0.8)
ctx, cancel := context.WithTimeout(p.ctx, timeout) ctx, cancel := context.WithTimeout(p.ctx, timeout)
defer cancel() defer cancel()
err := p.probeClass.Probe(ctx) err = p.probeClass.Probe(ctx)
p.recordEnd(start, err) p.recordEnd(err)
if err != nil { if err != nil {
log.Printf("probe %s: %v", p.name, err) log.Printf("probe %s: %v", p.name, err)
} }
pi = p.probeInfoLocked()
return
} }
func (p *Probe) recordStart() time.Time { func (p *Probe) recordStart() {
st := p.prober.now()
p.mu.Lock() p.mu.Lock()
defer p.mu.Unlock() p.start = p.prober.now()
p.start = st p.mu.Unlock()
return st
} }
func (p *Probe) recordEnd(start time.Time, err error) { func (p *Probe) recordEnd(err error) {
end := p.prober.now() end := p.prober.now()
p.mu.Lock() p.mu.Lock()
defer p.mu.Unlock() defer p.mu.Unlock()
@ -327,22 +353,55 @@ func (p *Probe) recordEnd(start time.Time, err error) {
p.latency = latency p.latency = latency
p.mAttempts.WithLabelValues("ok").Inc() p.mAttempts.WithLabelValues("ok").Inc()
p.mSeconds.WithLabelValues("ok").Add(latency.Seconds()) p.mSeconds.WithLabelValues("ok").Add(latency.Seconds())
p.latencyHist.Value = latency
p.latencyHist = p.latencyHist.Next()
} else { } else {
p.latency = 0 p.latency = 0
p.mAttempts.WithLabelValues("fail").Inc() p.mAttempts.WithLabelValues("fail").Inc()
p.mSeconds.WithLabelValues("fail").Add(latency.Seconds()) p.mSeconds.WithLabelValues("fail").Add(latency.Seconds())
} }
p.successHist.Value = p.succeeded
p.successHist = p.successHist.Next()
} }
// ProbeInfo is the state of a Probe. // ProbeInfo is a snapshot of the configuration and state of a Probe.
type ProbeInfo struct { type ProbeInfo struct {
Name string
Class string
Interval time.Duration
Labels map[string]string
Start time.Time Start time.Time
End time.Time End time.Time
Latency string Latency time.Duration
Result bool Result bool
Error string Error string
RecentResults []bool
RecentLatencies []time.Duration
} }
// RecentSuccessRatio returns the success ratio of the probe in the recent history.
func (pb ProbeInfo) RecentSuccessRatio() float64 {
if len(pb.RecentResults) == 0 {
return 0
}
var sum int
for _, r := range pb.RecentResults {
if r {
sum++
}
}
return float64(sum) / float64(len(pb.RecentResults))
}
// RecentMedianLatency returns the median latency of the probe in the recent history.
func (pb ProbeInfo) RecentMedianLatency() time.Duration {
if len(pb.RecentLatencies) == 0 {
return 0
}
return pb.RecentLatencies[len(pb.RecentLatencies)/2]
}
// ProbeInfo returns the state of all probes.
func (p *Prober) ProbeInfo() map[string]ProbeInfo { func (p *Prober) ProbeInfo() map[string]ProbeInfo {
out := map[string]ProbeInfo{} out := map[string]ProbeInfo{}
@ -352,10 +411,21 @@ func (p *Prober) ProbeInfo() map[string]ProbeInfo {
probes = append(probes, probe) probes = append(probes, probe)
} }
p.mu.Unlock() p.mu.Unlock()
for _, probe := range probes { for _, probe := range probes {
probe.mu.Lock() probe.mu.Lock()
out[probe.name] = probe.probeInfoLocked()
probe.mu.Unlock()
}
return out
}
// probeInfoLocked returns the state of the probe.
func (probe *Probe) probeInfoLocked() ProbeInfo {
inf := ProbeInfo{ inf := ProbeInfo{
Name: probe.name,
Class: probe.probeClass.Class,
Interval: probe.interval,
Labels: probe.metricLabels,
Start: probe.start, Start: probe.start,
End: probe.end, End: probe.end,
Result: probe.succeeded, Result: probe.succeeded,
@ -364,12 +434,44 @@ func (p *Prober) ProbeInfo() map[string]ProbeInfo {
inf.Error = probe.lastErr.Error() inf.Error = probe.lastErr.Error()
} }
if probe.latency > 0 { if probe.latency > 0 {
inf.Latency = probe.latency.String() inf.Latency = probe.latency
} }
out[probe.name] = inf probe.latencyHist.Do(func(v any) {
probe.mu.Unlock() if l, ok := v.(time.Duration); ok {
inf.RecentLatencies = append(inf.RecentLatencies, l)
} }
return out })
probe.successHist.Do(func(v any) {
if r, ok := v.(bool); ok {
inf.RecentResults = append(inf.RecentResults, r)
}
})
return inf
}
// RunHandler runs a probe by name and returns the result as an HTTP response.
func (p *Prober) RunHandler(w http.ResponseWriter, r *http.Request) error {
// Look up prober by name.
name := r.FormValue("name")
if name == "" {
return tsweb.Error(http.StatusBadRequest, "missing name parameter", nil)
}
p.mu.Lock()
probe, ok := p.probes[name]
prevInfo := probe.probeInfoLocked()
p.mu.Unlock()
if !ok {
return tsweb.Error(http.StatusNotFound, fmt.Sprintf("unknown probe %q", name), nil)
}
info, err := probe.run()
stats := fmt.Sprintf("Previous runs: success rate %d%%, median latency %v",
int(prevInfo.RecentSuccessRatio()*100), prevInfo.RecentMedianLatency())
if err != nil {
return tsweb.Error(http.StatusFailedDependency, fmt.Sprintf("Probe failed: %s\n%s", err.Error(), stats), err)
}
w.WriteHeader(respStatus)
w.Write([]byte(fmt.Sprintf("Probe succeeded in %v\n%s", info.Latency, stats)))
return nil
} }
// Describe implements prometheus.Collector. // Describe implements prometheus.Collector.

View File

@ -13,6 +13,8 @@ import (
"testing" "testing"
"time" "time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/client_golang/prometheus/testutil"
"tailscale.com/tstest" "tailscale.com/tstest"
) )
@ -292,6 +294,173 @@ func TestOnceMode(t *testing.T) {
} }
} }
func TestProberProbeInfo(t *testing.T) {
clk := newFakeTime()
p := newForTest(clk.Now, clk.NewTicker).WithOnce(true)
p.Run("probe1", probeInterval, nil, FuncProbe(func(context.Context) error {
clk.Advance(500 * time.Millisecond)
return nil
}))
p.Run("probe2", probeInterval, nil, FuncProbe(func(context.Context) error { return fmt.Errorf("error2") }))
p.Wait()
info := p.ProbeInfo()
wantInfo := map[string]ProbeInfo{
"probe1": {
Name: "probe1",
Interval: probeInterval,
Labels: map[string]string{"class": "", "name": "probe1"},
Latency: 500 * time.Millisecond,
Result: true,
RecentResults: []bool{true},
RecentLatencies: []time.Duration{500 * time.Millisecond},
},
"probe2": {
Name: "probe2",
Interval: probeInterval,
Labels: map[string]string{"class": "", "name": "probe2"},
Error: "error2",
RecentResults: []bool{false},
RecentLatencies: nil, // no latency for failed probes
},
}
if diff := cmp.Diff(wantInfo, info, cmpopts.IgnoreFields(ProbeInfo{}, "Start", "End")); diff != "" {
t.Fatalf("unexpected ProbeInfo (-want +got):\n%s", diff)
}
}
func TestProbeInfoRecent(t *testing.T) {
type probeResult struct {
latency time.Duration
err error
}
tests := []struct {
name string
results []probeResult
wantProbeInfo ProbeInfo
wantRecentSuccessRatio float64
wantRecentMedianLatency time.Duration
}{
{
name: "no_runs",
wantProbeInfo: ProbeInfo{},
wantRecentSuccessRatio: 0,
wantRecentMedianLatency: 0,
},
{
name: "single_success",
results: []probeResult{{latency: 100 * time.Millisecond, err: nil}},
wantProbeInfo: ProbeInfo{
Latency: 100 * time.Millisecond,
Result: true,
RecentResults: []bool{true},
RecentLatencies: []time.Duration{100 * time.Millisecond},
},
wantRecentSuccessRatio: 1,
wantRecentMedianLatency: 100 * time.Millisecond,
},
{
name: "single_failure",
results: []probeResult{{latency: 100 * time.Millisecond, err: errors.New("error123")}},
wantProbeInfo: ProbeInfo{
Result: false,
RecentResults: []bool{false},
RecentLatencies: nil,
Error: "error123",
},
wantRecentSuccessRatio: 0,
wantRecentMedianLatency: 0,
},
{
name: "recent_mix",
results: []probeResult{
{latency: 10 * time.Millisecond, err: errors.New("error1")},
{latency: 20 * time.Millisecond, err: nil},
{latency: 30 * time.Millisecond, err: nil},
{latency: 40 * time.Millisecond, err: errors.New("error4")},
{latency: 50 * time.Millisecond, err: nil},
{latency: 60 * time.Millisecond, err: nil},
{latency: 70 * time.Millisecond, err: errors.New("error7")},
{latency: 80 * time.Millisecond, err: nil},
},
wantProbeInfo: ProbeInfo{
Result: true,
Latency: 80 * time.Millisecond,
RecentResults: []bool{false, true, true, false, true, true, false, true},
RecentLatencies: []time.Duration{
20 * time.Millisecond,
30 * time.Millisecond,
50 * time.Millisecond,
60 * time.Millisecond,
80 * time.Millisecond,
},
},
wantRecentSuccessRatio: 0.625,
wantRecentMedianLatency: 50 * time.Millisecond,
},
{
name: "only_last_10",
results: []probeResult{
{latency: 10 * time.Millisecond, err: errors.New("old_error")},
{latency: 20 * time.Millisecond, err: nil},
{latency: 30 * time.Millisecond, err: nil},
{latency: 40 * time.Millisecond, err: nil},
{latency: 50 * time.Millisecond, err: nil},
{latency: 60 * time.Millisecond, err: nil},
{latency: 70 * time.Millisecond, err: nil},
{latency: 80 * time.Millisecond, err: nil},
{latency: 90 * time.Millisecond, err: nil},
{latency: 100 * time.Millisecond, err: nil},
{latency: 110 * time.Millisecond, err: nil},
},
wantProbeInfo: ProbeInfo{
Result: true,
Latency: 110 * time.Millisecond,
RecentResults: []bool{true, true, true, true, true, true, true, true, true, true},
RecentLatencies: []time.Duration{
20 * time.Millisecond,
30 * time.Millisecond,
40 * time.Millisecond,
50 * time.Millisecond,
60 * time.Millisecond,
70 * time.Millisecond,
80 * time.Millisecond,
90 * time.Millisecond,
100 * time.Millisecond,
110 * time.Millisecond,
},
},
wantRecentSuccessRatio: 1,
wantRecentMedianLatency: 70 * time.Millisecond,
},
}
clk := newFakeTime()
p := newForTest(clk.Now, clk.NewTicker).WithOnce(true)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
probe := newProbe(p, "", probeInterval, nil, FuncProbe(func(context.Context) error { return nil }))
for _, r := range tt.results {
probe.recordStart()
clk.Advance(r.latency)
probe.recordEnd(r.err)
}
info := probe.probeInfoLocked()
if diff := cmp.Diff(tt.wantProbeInfo, info, cmpopts.IgnoreFields(ProbeInfo{}, "Start", "End", "Interval")); diff != "" {
t.Fatalf("unexpected ProbeInfo (-want +got):\n%s", diff)
}
if got := info.RecentSuccessRatio(); got != tt.wantRecentSuccessRatio {
t.Errorf("recentSuccessRatio() = %v, want %v", got, tt.wantRecentSuccessRatio)
}
if got := info.RecentMedianLatency(); got != tt.wantRecentMedianLatency {
t.Errorf("recentMedianLatency() = %v, want %v", got, tt.wantRecentMedianLatency)
}
})
}
}
type fakeTicker struct { type fakeTicker struct {
ch chan time.Time ch chan time.Time
interval time.Duration interval time.Duration