net/dns: handle D-Bus restarts in resolved manager (#5026)
When dbus restarts it can cause the tailscaled to crash because the nil signal was not handled in resolved.Fixing so the nil signal leads to a connection reset and tailscaled stays connected to systemd when dbus restarted. Fixes #4645 Co-authored-by: James Tucker <james@tailscale.com> Signed-off-by: nyghtowl <warrick@tailscale.com> Co-authored-by: James Tucker <james@tailscale.com>
This commit is contained in:
parent
898695e312
commit
f17873e0f4
|
@ -17,6 +17,7 @@ type OSConfigurator interface {
|
|||
// If cfg is the zero value, all Tailscale-related DNS
|
||||
// configuration is removed.
|
||||
// SetDNS must not be called after Close.
|
||||
// SetDNS takes ownership of cfg.
|
||||
SetDNS(cfg OSConfig) error
|
||||
// SupportsSplitDNS reports whether the configurator is capable of
|
||||
// installing a resolver only for specific DNS suffixes. If false,
|
||||
|
|
|
@ -13,8 +13,9 @@ import (
|
|||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"tailscale.com/logtail/backoff"
|
||||
"github.com/godbus/dbus/v5"
|
||||
"golang.org/x/sys/unix"
|
||||
"inet.af/netaddr"
|
||||
|
@ -74,76 +75,173 @@ type resolvedLinkDomain struct {
|
|||
RoutingOnly bool
|
||||
}
|
||||
|
||||
// changeRequest tracks latest OSConfig and related error responses to update.
|
||||
type changeRequest struct {
|
||||
config OSConfig // configs OSConfigs, one per each SetDNS call
|
||||
res chan <- error // response channel
|
||||
}
|
||||
|
||||
// resolvedManager is an OSConfigurator which uses the systemd-resolved DBus API.
|
||||
type resolvedManager struct {
|
||||
ctx context.Context
|
||||
cancel func() // terminate the context, for close
|
||||
|
||||
logf logger.Logf
|
||||
ifidx int
|
||||
|
||||
cancelSyncer context.CancelFunc // run to shut down syncer goroutine
|
||||
syncerDone chan struct{} // closed when syncer is stopped
|
||||
resolved dbus.BusObject
|
||||
|
||||
mu sync.Mutex // guards RPCs made by syncLocked, and the following
|
||||
config OSConfig // last SetDNS config
|
||||
configCR chan changeRequest // tracks OSConfigs changes and error responses
|
||||
}
|
||||
|
||||
func newResolvedManager(logf logger.Logf, interfaceName string) (*resolvedManager, error) {
|
||||
conn, err := dbus.SystemBus()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
iface, err := net.InterfaceByName(interfaceName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
logf = logger.WithPrefix(logf, "dns: ")
|
||||
|
||||
ret := &resolvedManager{
|
||||
logf: logf,
|
||||
ifidx: iface.Index,
|
||||
cancelSyncer: cancel,
|
||||
syncerDone: make(chan struct{}),
|
||||
resolved: conn.Object(dbusResolvedObject, dbus.ObjectPath(dbusResolvedPath)),
|
||||
mgr := &resolvedManager{
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
|
||||
logf: logf,
|
||||
ifidx: iface.Index,
|
||||
|
||||
configCR: make(chan changeRequest),
|
||||
}
|
||||
signals := make(chan *dbus.Signal, 16)
|
||||
go ret.resync(ctx, signals)
|
||||
// Only receive the DBus signals we need to resync our config on
|
||||
// resolved restart. Failure to set filters isn't a fatal error,
|
||||
// we'll just receive all broadcast signals and have to ignore
|
||||
// them on our end.
|
||||
if err := conn.AddMatchSignal(dbus.WithMatchObjectPath(dbusPath), dbus.WithMatchInterface(dbusInterface), dbus.WithMatchMember(dbusOwnerSignal), dbus.WithMatchArg(0, dbusResolvedObject)); err != nil {
|
||||
logf("[v1] Setting DBus signal filter failed: %v", err)
|
||||
}
|
||||
conn.Signal(signals)
|
||||
return ret, nil
|
||||
|
||||
go mgr.run(ctx)
|
||||
|
||||
return mgr, nil
|
||||
}
|
||||
|
||||
func (m *resolvedManager) SetDNS(config OSConfig) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
errc := make(chan error, 1)
|
||||
defer close(errc)
|
||||
|
||||
m.config = config
|
||||
return m.syncLocked(context.TODO()) // would be nice to plumb context through from SetDNS
|
||||
select {
|
||||
case <-m.ctx.Done():
|
||||
return m.ctx.Err()
|
||||
case m.configCR <- changeRequest{config, errc}:
|
||||
}
|
||||
|
||||
select {
|
||||
case <-m.ctx.Done():
|
||||
return m.ctx.Err()
|
||||
case err := <-errc:
|
||||
if err != nil {
|
||||
m.logf("failed to configure resolved: %v", err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
func (m *resolvedManager) resync(ctx context.Context, signals chan *dbus.Signal) {
|
||||
defer close(m.syncerDone)
|
||||
func (m *resolvedManager) run(ctx context.Context) {
|
||||
var (
|
||||
conn *dbus.Conn
|
||||
signals chan *dbus.Signal
|
||||
rManager dbus.BusObject // rManager is the Resolved DBus connection
|
||||
)
|
||||
bo := backoff.NewBackoff("resolved-dbus", m.logf, 30*time.Second)
|
||||
needsReconnect := make(chan bool, 1)
|
||||
defer func() {
|
||||
if conn != nil {
|
||||
conn.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
// Reconnect the systemBus if disconnected.
|
||||
reconnect := func() error {
|
||||
var err error
|
||||
signals = make(chan *dbus.Signal, 16)
|
||||
conn, err = dbus.SystemBus()
|
||||
if err != nil {
|
||||
m.logf("dbus connection error: %v", err)
|
||||
} else {
|
||||
m.logf("[v1] dbus connected")
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
// Backoff increases time between reconnect attempts.
|
||||
go func() {
|
||||
bo.BackOff(ctx, err)
|
||||
needsReconnect <- true
|
||||
}()
|
||||
return err
|
||||
}
|
||||
|
||||
rManager = conn.Object(dbusResolvedObject, dbus.ObjectPath(dbusResolvedPath))
|
||||
|
||||
// Only receive the DBus signals we need to resync our config on
|
||||
// resolved restart. Failure to set filters isn't a fatal error,
|
||||
// we'll just receive all broadcast signals and have to ignore
|
||||
// them on our end.
|
||||
if err = conn.AddMatchSignal(dbus.WithMatchObjectPath(dbusPath), dbus.WithMatchInterface(dbusInterface), dbus.WithMatchMember(dbusOwnerSignal), dbus.WithMatchArg(0, dbusResolvedObject)); err != nil {
|
||||
m.logf("[v1] Setting DBus signal filter failed: %v", err)
|
||||
}
|
||||
conn.Signal(signals)
|
||||
|
||||
// Reset backoff and SetNSOSHealth after successful on reconnect.
|
||||
bo.BackOff(ctx, nil)
|
||||
health.SetDNSOSHealth(nil)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Create initial systemBus connection.
|
||||
reconnect()
|
||||
|
||||
lastConfig := OSConfig{}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
if rManager == nil {
|
||||
return
|
||||
}
|
||||
// RevertLink resets all per-interface settings on systemd-resolved to defaults.
|
||||
// When ctx goes away systemd-resolved auto reverts.
|
||||
// Keeping for potential use in future refactor.
|
||||
if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".RevertLink", 0, m.ifidx); call.Err != nil {
|
||||
m.logf("[v1] RevertLink: %w", call.Err)
|
||||
return
|
||||
}
|
||||
return
|
||||
case signal := <-signals:
|
||||
case configCR := <-m.configCR:
|
||||
// Track and update sync with latest config change.
|
||||
lastConfig = configCR.config
|
||||
|
||||
if rManager == nil {
|
||||
configCR.res <- fmt.Errorf("resolved DBus does not have a connection")
|
||||
continue
|
||||
}
|
||||
err := m.setConfigOverDBus(ctx, rManager, configCR.config)
|
||||
configCR.res <- err
|
||||
case <-needsReconnect:
|
||||
if err := reconnect(); err != nil {
|
||||
m.logf("[v1] SystemBus reconnect error %T", err)
|
||||
}
|
||||
continue
|
||||
case signal, ok := <-signals:
|
||||
// If signal ends and is nil then program tries to reconnect.
|
||||
if !ok {
|
||||
if err := reconnect(); err != nil {
|
||||
m.logf("[v1] SystemBus reconnect error %T", err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
// In theory the signal was filtered by DBus, but if
|
||||
// AddMatchSignal in the constructor failed, we may be
|
||||
// getting other spam.
|
||||
if signal.Path != dbusPath || signal.Name != dbusInterface+"."+dbusOwnerSignal {
|
||||
continue
|
||||
}
|
||||
if lastConfig.IsZero() {
|
||||
continue
|
||||
}
|
||||
// signal.Body is a []any of 3 strings: bus name, previous owner, new owner.
|
||||
if len(signal.Body) != 3 {
|
||||
m.logf("[unexpectected] DBus NameOwnerChanged len(Body) = %d, want 3")
|
||||
m.logf("[unexpected] DBus NameOwnerChanged len(Body) = %d, want 3")
|
||||
}
|
||||
if name, ok := signal.Body[0].(string); !ok || name != dbusResolvedObject {
|
||||
continue
|
||||
|
@ -160,13 +258,11 @@ func (m *resolvedManager) resync(ctx context.Context, signals chan *dbus.Signal)
|
|||
// The resolved bus name has a new owner, meaning resolved
|
||||
// restarted. Reprogram current config.
|
||||
m.logf("systemd-resolved restarted, syncing DNS config")
|
||||
m.mu.Lock()
|
||||
err := m.syncLocked(ctx)
|
||||
err := m.setConfigOverDBus(ctx, rManager, lastConfig)
|
||||
// Set health while holding the lock, because this will
|
||||
// graciously serialize the resync's health outcome with a
|
||||
// concurrent SetDNS call.
|
||||
health.SetDNSOSHealth(err)
|
||||
m.mu.Unlock()
|
||||
if err != nil {
|
||||
m.logf("failed to configure systemd-resolved: %v", err)
|
||||
}
|
||||
|
@ -174,12 +270,13 @@ func (m *resolvedManager) resync(ctx context.Context, signals chan *dbus.Signal)
|
|||
}
|
||||
}
|
||||
|
||||
func (m *resolvedManager) syncLocked(ctx context.Context) error {
|
||||
// setConfigOverDBus updates resolved DBus config and is only called from the run goroutine.
|
||||
func (m *resolvedManager) setConfigOverDBus(ctx context.Context, rManager dbus.BusObject, config OSConfig) error {
|
||||
ctx, cancel := context.WithTimeout(ctx, reconfigTimeout)
|
||||
defer cancel()
|
||||
|
||||
var linkNameservers = make([]resolvedLinkNameserver, len(m.config.Nameservers))
|
||||
for i, server := range m.config.Nameservers {
|
||||
var linkNameservers = make([]resolvedLinkNameserver, len(config.Nameservers))
|
||||
for i, server := range config.Nameservers {
|
||||
ip := server.As16()
|
||||
if server.Is4() {
|
||||
linkNameservers[i] = resolvedLinkNameserver{
|
||||
|
@ -193,18 +290,16 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
err := m.resolved.CallWithContext(
|
||||
err := rManager.CallWithContext(
|
||||
ctx, dbusResolvedInterface+".SetLinkDNS", 0,
|
||||
m.ifidx, linkNameservers,
|
||||
).Store()
|
||||
if err != nil {
|
||||
return fmt.Errorf("setLinkDNS: %w", err)
|
||||
}
|
||||
|
||||
linkDomains := make([]resolvedLinkDomain, 0, len(m.config.SearchDomains)+len(m.config.MatchDomains))
|
||||
linkDomains := make([]resolvedLinkDomain, 0, len(config.SearchDomains)+len(config.MatchDomains))
|
||||
seenDomains := map[dnsname.FQDN]bool{}
|
||||
for _, domain := range m.config.SearchDomains {
|
||||
for _, domain := range config.SearchDomains {
|
||||
if seenDomains[domain] {
|
||||
continue
|
||||
}
|
||||
|
@ -214,7 +309,7 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error {
|
|||
RoutingOnly: false,
|
||||
})
|
||||
}
|
||||
for _, domain := range m.config.MatchDomains {
|
||||
for _, domain := range config.MatchDomains {
|
||||
if seenDomains[domain] {
|
||||
// Search domains act as both search and match in
|
||||
// resolved, so it's correct to skip.
|
||||
|
@ -226,7 +321,7 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error {
|
|||
RoutingOnly: true,
|
||||
})
|
||||
}
|
||||
if len(m.config.MatchDomains) == 0 && len(m.config.Nameservers) > 0 {
|
||||
if len(config.MatchDomains) == 0 && len(config.Nameservers) > 0 {
|
||||
// Caller requested full DNS interception, install a
|
||||
// routing-only root domain.
|
||||
linkDomains = append(linkDomains, resolvedLinkDomain{
|
||||
|
@ -235,14 +330,14 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error {
|
|||
})
|
||||
}
|
||||
|
||||
err = m.resolved.CallWithContext(
|
||||
err = rManager.CallWithContext(
|
||||
ctx, dbusResolvedInterface+".SetLinkDomains", 0,
|
||||
m.ifidx, linkDomains,
|
||||
).Store()
|
||||
if err != nil && err.Error() == "Argument list too long" { // TODO: better error match
|
||||
// Issue 3188: older systemd-resolved had argument length limits.
|
||||
// Trim out the *.arpa. entries and try again.
|
||||
err = m.resolved.CallWithContext(
|
||||
err = rManager.CallWithContext(
|
||||
ctx, dbusResolvedInterface+".SetLinkDomains", 0,
|
||||
m.ifidx, linkDomainsWithoutReverseDNS(linkDomains),
|
||||
).Store()
|
||||
|
@ -251,7 +346,7 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error {
|
|||
return fmt.Errorf("setLinkDomains: %w", err)
|
||||
}
|
||||
|
||||
if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDefaultRoute", 0, m.ifidx, len(m.config.MatchDomains) == 0); call.Err != nil {
|
||||
if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDefaultRoute", 0, m.ifidx, len(config.MatchDomains) == 0); call.Err != nil {
|
||||
if dbusErr, ok := call.Err.(dbus.Error); ok && dbusErr.Name == dbus.ErrMsgUnknownMethod.Name {
|
||||
// on some older systems like Kubuntu 18.04.6 with systemd 237 method SetLinkDefaultRoute is absent,
|
||||
// but otherwise it's working good
|
||||
|
@ -266,29 +361,28 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error {
|
|||
// or something).
|
||||
|
||||
// Disable LLMNR, we don't do multicast.
|
||||
if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkLLMNR", 0, m.ifidx, "no"); call.Err != nil {
|
||||
if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkLLMNR", 0, m.ifidx, "no"); call.Err != nil {
|
||||
m.logf("[v1] failed to disable LLMNR: %v", call.Err)
|
||||
}
|
||||
|
||||
// Disable mdns.
|
||||
if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkMulticastDNS", 0, m.ifidx, "no"); call.Err != nil {
|
||||
if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkMulticastDNS", 0, m.ifidx, "no"); call.Err != nil {
|
||||
m.logf("[v1] failed to disable mdns: %v", call.Err)
|
||||
}
|
||||
|
||||
// We don't support dnssec consistently right now, force it off to
|
||||
// avoid partial failures when we split DNS internally.
|
||||
if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSSEC", 0, m.ifidx, "no"); call.Err != nil {
|
||||
if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSSEC", 0, m.ifidx, "no"); call.Err != nil {
|
||||
m.logf("[v1] failed to disable DNSSEC: %v", call.Err)
|
||||
}
|
||||
|
||||
if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSOverTLS", 0, m.ifidx, "no"); call.Err != nil {
|
||||
if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSOverTLS", 0, m.ifidx, "no"); call.Err != nil {
|
||||
m.logf("[v1] failed to disable DoT: %v", call.Err)
|
||||
}
|
||||
|
||||
if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".FlushCaches", 0); call.Err != nil {
|
||||
if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".FlushCaches", 0); call.Err != nil {
|
||||
m.logf("failed to flush resolved DNS cache: %v", call.Err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -301,20 +395,7 @@ func (m *resolvedManager) GetBaseConfig() (OSConfig, error) {
|
|||
}
|
||||
|
||||
func (m *resolvedManager) Close() error {
|
||||
m.cancelSyncer()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), reconfigTimeout)
|
||||
defer cancel()
|
||||
if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".RevertLink", 0, m.ifidx); call.Err != nil {
|
||||
return fmt.Errorf("RevertLink: %w", call.Err)
|
||||
}
|
||||
|
||||
select {
|
||||
case <-m.syncerDone:
|
||||
case <-ctx.Done():
|
||||
m.logf("timeout in systemd-resolved syncer shutdown")
|
||||
}
|
||||
|
||||
m.cancel() // stops the 'run' method goroutine
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue