594 lines
16 KiB
Go
594 lines
16 KiB
Go
// Copyright (c) Tailscale Inc & AUTHORS
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
// Package monitor provides facilities for monitoring network
|
|
// interface and route changes. It primarily exists to know when
|
|
// portable devices move between different networks.
|
|
package netmon
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"net/netip"
|
|
"runtime"
|
|
"sync"
|
|
"time"
|
|
|
|
"tailscale.com/net/interfaces"
|
|
"tailscale.com/types/logger"
|
|
"tailscale.com/util/clientmetric"
|
|
"tailscale.com/util/set"
|
|
)
|
|
|
|
// pollWallTimeInterval is how often we check the time to check
|
|
// for big jumps in wall (non-monotonic) time as a backup mechanism
|
|
// to get notified of a sleeping device waking back up.
|
|
// Usually there are also minor network change events on wake that let
|
|
// us check the wall time sooner than this.
|
|
const pollWallTimeInterval = 15 * time.Second
|
|
|
|
// message represents a message returned from an osMon.
|
|
type message interface {
|
|
// Ignore is whether we should ignore this message.
|
|
ignore() bool
|
|
}
|
|
|
|
// osMon is the interface that each operating system-specific
|
|
// implementation of the link monitor must implement.
|
|
type osMon interface {
|
|
Close() error
|
|
|
|
// Receive returns a new network interface change message. It
|
|
// should block until there's either something to return, or
|
|
// until the osMon is closed. After a Close, the returned
|
|
// error is ignored.
|
|
Receive() (message, error)
|
|
|
|
// IsInterestingInterface reports whether the provided interface should
|
|
// be considered for network change events.
|
|
IsInterestingInterface(iface string) bool
|
|
}
|
|
|
|
// Monitor represents a monitoring instance.
|
|
type Monitor struct {
|
|
logf logger.Logf
|
|
om osMon // nil means not supported on this platform
|
|
change chan bool // send false to wake poller, true to also force ChangeDeltas be sent
|
|
stop chan struct{} // closed on Stop
|
|
|
|
// Things that must be set early, before use,
|
|
// and not change at runtime.
|
|
tsIfName string // tailscale interface name, if known/set ("tailscale0", "utun3", ...)
|
|
|
|
mu sync.Mutex // guards all following fields
|
|
cbs set.HandleSet[ChangeFunc]
|
|
ruleDelCB set.HandleSet[RuleDeleteCallback]
|
|
ifState *interfaces.State
|
|
gwValid bool // whether gw and gwSelfIP are valid
|
|
gw netip.Addr // our gateway's IP
|
|
gwSelfIP netip.Addr // our own IP address (that corresponds to gw)
|
|
started bool
|
|
closed bool
|
|
goroutines sync.WaitGroup
|
|
wallTimer *time.Timer // nil until Started; re-armed AfterFunc per tick
|
|
lastWall time.Time
|
|
timeJumped bool // whether we need to send a changed=true after a big time jump
|
|
}
|
|
|
|
// ChangeFunc is a callback function registered with Monitor that's called when the
|
|
// network changed.
|
|
type ChangeFunc func(*ChangeDelta)
|
|
|
|
// ChangeDelta describes the difference between two network states.
|
|
type ChangeDelta struct {
|
|
// Monitor is the network monitor that sent this delta.
|
|
Monitor *Monitor
|
|
|
|
// Old is the old interface state, if known.
|
|
// It's nil if the old state is unknown.
|
|
// Do not mutate it.
|
|
Old *interfaces.State
|
|
|
|
// New is the new network state.
|
|
// It is always non-nil.
|
|
// Do not mutate it.
|
|
New *interfaces.State
|
|
|
|
// Major is our legacy boolean of whether the network changed in some major
|
|
// way.
|
|
//
|
|
// Deprecated: do not remove. As of 2023-08-23 we're in a renewed effort to
|
|
// remove it and ask specific qustions of ChangeDelta instead. Look at Old
|
|
// and New (or add methods to ChangeDelta) instead of using Major.
|
|
Major bool
|
|
|
|
// TimeJumped is whether there was a big jump in wall time since the last
|
|
// time we checked. This is a hint that a mobile sleeping device might have
|
|
// come out of sleep.
|
|
TimeJumped bool
|
|
|
|
// TODO(bradfitz): add some lazy cached fields here as needed with methods
|
|
// on *ChangeDelta to let callers ask specific questions
|
|
}
|
|
|
|
// New instantiates and starts a monitoring instance.
|
|
// The returned monitor is inactive until it's started by the Start method.
|
|
// Use RegisterChangeCallback to get notified of network changes.
|
|
func New(logf logger.Logf) (*Monitor, error) {
|
|
logf = logger.WithPrefix(logf, "monitor: ")
|
|
m := &Monitor{
|
|
logf: logf,
|
|
change: make(chan bool, 1),
|
|
stop: make(chan struct{}),
|
|
lastWall: wallTime(),
|
|
}
|
|
st, err := m.interfaceStateUncached()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
m.ifState = st
|
|
|
|
m.om, err = newOSMon(logf, m)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if m.om == nil {
|
|
return nil, errors.New("newOSMon returned nil, nil")
|
|
}
|
|
|
|
return m, nil
|
|
}
|
|
|
|
// InterfaceState returns the latest snapshot of the machine's network
|
|
// interfaces.
|
|
//
|
|
// The returned value is owned by Mon; it must not be modified.
|
|
func (m *Monitor) InterfaceState() *interfaces.State {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
return m.ifState
|
|
}
|
|
|
|
func (m *Monitor) interfaceStateUncached() (*interfaces.State, error) {
|
|
return interfaces.GetState()
|
|
}
|
|
|
|
// SetTailscaleInterfaceName sets the name of the Tailscale interface. For
|
|
// example, "tailscale0", "tun0", "utun3", etc.
|
|
//
|
|
// This must be called only early in tailscaled startup before the monitor is
|
|
// used.
|
|
func (m *Monitor) SetTailscaleInterfaceName(ifName string) {
|
|
m.tsIfName = ifName
|
|
}
|
|
|
|
// GatewayAndSelfIP returns the current network's default gateway, and
|
|
// the machine's default IP for that gateway.
|
|
//
|
|
// It's the same as interfaces.LikelyHomeRouterIP, but it caches the
|
|
// result until the monitor detects a network change.
|
|
func (m *Monitor) GatewayAndSelfIP() (gw, myIP netip.Addr, ok bool) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
if m.gwValid {
|
|
return m.gw, m.gwSelfIP, true
|
|
}
|
|
gw, myIP, ok = interfaces.LikelyHomeRouterIP()
|
|
changed := false
|
|
if ok {
|
|
changed = m.gw != gw || m.gwSelfIP != myIP
|
|
m.gw, m.gwSelfIP = gw, myIP
|
|
m.gwValid = true
|
|
}
|
|
if changed {
|
|
m.logf("gateway and self IP changed: gw=%v self=%v", m.gw, m.gwSelfIP)
|
|
}
|
|
return gw, myIP, ok
|
|
}
|
|
|
|
// RegisterChangeCallback adds callback to the set of parties to be
|
|
// notified (in their own goroutine) when the network state changes.
|
|
// To remove this callback, call unregister (or close the monitor).
|
|
func (m *Monitor) RegisterChangeCallback(callback ChangeFunc) (unregister func()) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
handle := m.cbs.Add(callback)
|
|
return func() {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
delete(m.cbs, handle)
|
|
}
|
|
}
|
|
|
|
// RuleDeleteCallback is a callback when a Linux IP policy routing
|
|
// rule is deleted. The table is the table number (52, 253, 354) and
|
|
// priority is the priority order number (for Tailscale rules
|
|
// currently: 5210, 5230, 5250, 5270)
|
|
type RuleDeleteCallback func(table uint8, priority uint32)
|
|
|
|
// RegisterRuleDeleteCallback adds callback to the set of parties to be
|
|
// notified (in their own goroutine) when a Linux ip rule is deleted.
|
|
// To remove this callback, call unregister (or close the monitor).
|
|
func (m *Monitor) RegisterRuleDeleteCallback(callback RuleDeleteCallback) (unregister func()) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
handle := m.ruleDelCB.Add(callback)
|
|
return func() {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
delete(m.ruleDelCB, handle)
|
|
}
|
|
}
|
|
|
|
// Start starts the monitor.
|
|
// A monitor can only be started & closed once.
|
|
func (m *Monitor) Start() {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
if m.started || m.closed {
|
|
return
|
|
}
|
|
m.started = true
|
|
|
|
if shouldMonitorTimeJump {
|
|
m.wallTimer = time.AfterFunc(pollWallTimeInterval, m.pollWallTime)
|
|
}
|
|
|
|
if m.om == nil {
|
|
return
|
|
}
|
|
m.goroutines.Add(2)
|
|
go m.pump()
|
|
go m.debounce()
|
|
}
|
|
|
|
// Close closes the monitor.
|
|
func (m *Monitor) Close() error {
|
|
m.mu.Lock()
|
|
if m.closed {
|
|
m.mu.Unlock()
|
|
return nil
|
|
}
|
|
m.closed = true
|
|
close(m.stop)
|
|
|
|
if m.wallTimer != nil {
|
|
m.wallTimer.Stop()
|
|
}
|
|
|
|
var err error
|
|
if m.om != nil {
|
|
err = m.om.Close()
|
|
}
|
|
|
|
started := m.started
|
|
m.mu.Unlock()
|
|
|
|
if started {
|
|
m.goroutines.Wait()
|
|
}
|
|
return err
|
|
}
|
|
|
|
// InjectEvent forces the monitor to pretend there was a network
|
|
// change and re-check the state of the network. Any registered
|
|
// ChangeFunc callbacks will be called within the event coalescing
|
|
// period (under a fraction of a second).
|
|
func (m *Monitor) InjectEvent() {
|
|
select {
|
|
case m.change <- true:
|
|
default:
|
|
// Another change signal is already
|
|
// buffered. Debounce will wake up soon
|
|
// enough.
|
|
}
|
|
}
|
|
|
|
// Poll forces the monitor to pretend there was a network
|
|
// change and re-check the state of the network.
|
|
//
|
|
// This is like InjectEvent but only fires ChangeFunc callbacks
|
|
// if the network state differed at all.
|
|
func (m *Monitor) Poll() {
|
|
select {
|
|
case m.change <- false:
|
|
default:
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) stopped() bool {
|
|
select {
|
|
case <-m.stop:
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
// pump continuously retrieves messages from the connection, notifying
|
|
// the change channel of changes, and stopping when a stop is issued.
|
|
func (m *Monitor) pump() {
|
|
defer m.goroutines.Done()
|
|
for !m.stopped() {
|
|
msg, err := m.om.Receive()
|
|
if err != nil {
|
|
if m.stopped() {
|
|
return
|
|
}
|
|
// Keep retrying while we're not closed.
|
|
m.logf("error from link monitor: %v", err)
|
|
time.Sleep(time.Second)
|
|
continue
|
|
}
|
|
if rdm, ok := msg.(ipRuleDeletedMessage); ok {
|
|
m.notifyRuleDeleted(rdm)
|
|
continue
|
|
}
|
|
if msg.ignore() {
|
|
continue
|
|
}
|
|
m.Poll()
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) notifyRuleDeleted(rdm ipRuleDeletedMessage) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
for _, cb := range m.ruleDelCB {
|
|
go cb(rdm.table, rdm.priority)
|
|
}
|
|
}
|
|
|
|
// isInterestingInterface reports whether the provided interface should be
|
|
// considered when checking for network state changes.
|
|
// The ips parameter should be the IPs of the provided interface.
|
|
func (m *Monitor) isInterestingInterface(i interfaces.Interface, ips []netip.Prefix) bool {
|
|
if !m.om.IsInterestingInterface(i.Name) {
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// debounce calls the callback function with a delay between events
|
|
// and exits when a stop is issued.
|
|
func (m *Monitor) debounce() {
|
|
defer m.goroutines.Done()
|
|
for {
|
|
var forceCallbacks bool
|
|
select {
|
|
case <-m.stop:
|
|
return
|
|
case forceCallbacks = <-m.change:
|
|
}
|
|
|
|
if newState, err := m.interfaceStateUncached(); err != nil {
|
|
m.logf("interfaces.State: %v", err)
|
|
} else {
|
|
m.handlePotentialChange(newState, forceCallbacks)
|
|
}
|
|
|
|
select {
|
|
case <-m.stop:
|
|
return
|
|
case <-time.After(250 * time.Millisecond):
|
|
}
|
|
}
|
|
}
|
|
|
|
var (
|
|
metricChangeEq = clientmetric.NewCounter("netmon_link_change_eq")
|
|
metricChange = clientmetric.NewCounter("netmon_link_change")
|
|
metricChangeTimeJump = clientmetric.NewCounter("netmon_link_change_timejump")
|
|
metricChangeMajor = clientmetric.NewCounter("netmon_link_change_major")
|
|
)
|
|
|
|
// handlePotentialChange considers whether newState is different enough to wake
|
|
// up callers and updates the monitor's state if so.
|
|
//
|
|
// If forceCallbacks is true, they're always notified.
|
|
func (m *Monitor) handlePotentialChange(newState *interfaces.State, forceCallbacks bool) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
oldState := m.ifState
|
|
timeJumped := shouldMonitorTimeJump && m.checkWallTimeAdvanceLocked()
|
|
if !timeJumped && !forceCallbacks && oldState.Equal(newState) {
|
|
// Exactly equal. Nothing to do.
|
|
metricChangeEq.Add(1)
|
|
return
|
|
}
|
|
|
|
delta := &ChangeDelta{
|
|
Monitor: m,
|
|
Old: oldState,
|
|
New: newState,
|
|
TimeJumped: timeJumped,
|
|
}
|
|
|
|
delta.Major = m.IsMajorChangeFrom(oldState, newState)
|
|
if delta.Major {
|
|
m.gwValid = false
|
|
m.ifState = newState
|
|
|
|
if s1, s2 := oldState.String(), delta.New.String(); s1 == s2 {
|
|
m.logf("[unexpected] network state changed, but stringification didn't: %v", s1)
|
|
m.logf("[unexpected] old: %s", jsonSummary(oldState))
|
|
m.logf("[unexpected] new: %s", jsonSummary(newState))
|
|
}
|
|
}
|
|
// See if we have a queued or new time jump signal.
|
|
if timeJumped {
|
|
m.resetTimeJumpedLocked()
|
|
if !delta.Major {
|
|
// Only log if it wasn't an interesting change.
|
|
m.logf("time jumped (probably wake from sleep); synthesizing major change event")
|
|
delta.Major = true
|
|
}
|
|
}
|
|
metricChange.Add(1)
|
|
if delta.Major {
|
|
metricChangeMajor.Add(1)
|
|
}
|
|
if delta.TimeJumped {
|
|
metricChangeTimeJump.Add(1)
|
|
}
|
|
for _, cb := range m.cbs {
|
|
go cb(delta)
|
|
}
|
|
}
|
|
|
|
// IsMajorChangeFrom reports whether the transition from s1 to s2 is
|
|
// a "major" change, where major roughly means it's worth tearing down
|
|
// a bunch of connections and rebinding.
|
|
//
|
|
// TODO(bradiftz): tigten this definition.
|
|
func (m *Monitor) IsMajorChangeFrom(s1, s2 *interfaces.State) bool {
|
|
if s1 == nil && s2 == nil {
|
|
return false
|
|
}
|
|
if s1 == nil || s2 == nil {
|
|
return true
|
|
}
|
|
if s1.HaveV6 != s2.HaveV6 ||
|
|
s1.HaveV4 != s2.HaveV4 ||
|
|
s1.IsExpensive != s2.IsExpensive ||
|
|
s1.DefaultRouteInterface != s2.DefaultRouteInterface ||
|
|
s1.HTTPProxy != s2.HTTPProxy ||
|
|
s1.PAC != s2.PAC {
|
|
return true
|
|
}
|
|
for iname, i := range s1.Interface {
|
|
if iname == m.tsIfName {
|
|
// Ignore changes in the Tailscale interface itself.
|
|
continue
|
|
}
|
|
ips := s1.InterfaceIPs[iname]
|
|
if !m.isInterestingInterface(i, ips) {
|
|
continue
|
|
}
|
|
i2, ok := s2.Interface[iname]
|
|
if !ok {
|
|
return true
|
|
}
|
|
ips2, ok := s2.InterfaceIPs[iname]
|
|
if !ok {
|
|
return true
|
|
}
|
|
if !i.Equal(i2) || !prefixesMajorEqual(ips, ips2) {
|
|
return true
|
|
}
|
|
}
|
|
// Iterate over s2 in case there is a field in s2 that doesn't exist in s1
|
|
for iname, i := range s2.Interface {
|
|
if iname == m.tsIfName {
|
|
// Ignore changes in the Tailscale interface itself.
|
|
continue
|
|
}
|
|
ips := s2.InterfaceIPs[iname]
|
|
if !m.isInterestingInterface(i, ips) {
|
|
continue
|
|
}
|
|
i1, ok := s1.Interface[iname]
|
|
if !ok {
|
|
return true
|
|
}
|
|
ips1, ok := s1.InterfaceIPs[iname]
|
|
if !ok {
|
|
return true
|
|
}
|
|
if !i.Equal(i1) || !prefixesMajorEqual(ips, ips1) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// prefixesMajorEqual reports whether a and b are equal after ignoring
|
|
// boring things like link-local, loopback, and multicast addresses.
|
|
func prefixesMajorEqual(a, b []netip.Prefix) bool {
|
|
// trim returns a subslice of p with link local unicast,
|
|
// loopback, and multicast prefixes removed from the front.
|
|
trim := func(p []netip.Prefix) []netip.Prefix {
|
|
for len(p) > 0 {
|
|
a := p[0].Addr()
|
|
if a.IsLinkLocalUnicast() || a.IsLoopback() || a.IsMulticast() {
|
|
p = p[1:]
|
|
continue
|
|
}
|
|
break
|
|
}
|
|
return p
|
|
}
|
|
for {
|
|
a = trim(a)
|
|
b = trim(b)
|
|
if len(a) == 0 || len(b) == 0 {
|
|
return len(a) == 0 && len(b) == 0
|
|
}
|
|
if a[0] != b[0] {
|
|
return false
|
|
}
|
|
a, b = a[1:], b[1:]
|
|
}
|
|
}
|
|
|
|
func jsonSummary(x any) any {
|
|
j, err := json.Marshal(x)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return j
|
|
}
|
|
|
|
func wallTime() time.Time {
|
|
// From time package's docs: "The canonical way to strip a
|
|
// monotonic clock reading is to use t = t.Round(0)."
|
|
return time.Now().Round(0)
|
|
}
|
|
|
|
func (m *Monitor) pollWallTime() {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
if m.closed {
|
|
return
|
|
}
|
|
if m.checkWallTimeAdvanceLocked() {
|
|
m.InjectEvent()
|
|
}
|
|
m.wallTimer.Reset(pollWallTimeInterval)
|
|
}
|
|
|
|
// shouldMonitorTimeJump is whether we keep a regular periodic timer running in
|
|
// the background watching for jumps in wall time.
|
|
//
|
|
// We don't do this on mobile platforms for battery reasons, and because these
|
|
// platforms don't really sleep in the same way.
|
|
const shouldMonitorTimeJump = runtime.GOOS != "android" && runtime.GOOS != "ios"
|
|
|
|
// checkWallTimeAdvanceLocked reports whether wall time jumped more than 150% of
|
|
// pollWallTimeInterval, indicating we probably just came out of sleep. Once a
|
|
// time jump is detected it must be reset by calling resetTimeJumpedLocked.
|
|
func (m *Monitor) checkWallTimeAdvanceLocked() bool {
|
|
if !shouldMonitorTimeJump {
|
|
panic("unreachable") // if callers are correct
|
|
}
|
|
now := wallTime()
|
|
if now.Sub(m.lastWall) > pollWallTimeInterval*3/2 {
|
|
m.timeJumped = true // it is reset by debounce.
|
|
}
|
|
m.lastWall = now
|
|
return m.timeJumped
|
|
}
|
|
|
|
// resetTimeJumpedLocked consumes the signal set by checkWallTimeAdvanceLocked.
|
|
func (m *Monitor) resetTimeJumpedLocked() {
|
|
m.timeJumped = false
|
|
}
|
|
|
|
type ipRuleDeletedMessage struct {
|
|