cmd/{containerboot,k8s-operator}: store proxy device ID early to help with cleanup for broken proxies (#12425)
* cmd/containerboot: store device ID before setting up proxy routes. For containerboot instances whose state needs to be stored in a Kubernetes Secret, we additonally store the device's ID, FQDN and IPs. This is used, between other, by the Kubernetes operator, who uses the ID to delete the device when resources need cleaning up and writes the FQDN and IPs on various kube resource statuses for visibility. This change shifts storing device ID earlier in the proxy setup flow, to ensure that if proxy routing setup fails, the device can still be deleted. Updates tailscale/tailscale#12146 Signed-off-by: Irbe Krumina <irbe@tailscale.com> * code review feedback Signed-off-by: Irbe Krumina <irbe@tailscale.com> --------- Signed-off-by: Irbe Krumina <irbe@tailscale.com>
This commit is contained in:
parent
674c998e93
commit
8cc2738609
|
@ -19,22 +19,20 @@ import (
|
|||
"tailscale.com/tailcfg"
|
||||
)
|
||||
|
||||
// storeDeviceInfo writes deviceID into the "device_id" data field of the kube
|
||||
// secret secretName.
|
||||
func storeDeviceInfo(ctx context.Context, secretName string, deviceID tailcfg.StableNodeID, fqdn string, addresses []netip.Prefix) error {
|
||||
// First check if the secret exists at all. Even if running on
|
||||
// kubernetes, we do not necessarily store state in a k8s secret.
|
||||
if _, err := kc.GetSecret(ctx, secretName); err != nil {
|
||||
if s, ok := err.(*kube.Status); ok {
|
||||
if s.Code >= 400 && s.Code <= 499 {
|
||||
// Assume the secret doesn't exist, or we don't have
|
||||
// permission to access it.
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return err
|
||||
// storeDeviceID writes deviceID to 'device_id' data field of the named
|
||||
// Kubernetes Secret.
|
||||
func storeDeviceID(ctx context.Context, secretName string, deviceID tailcfg.StableNodeID) error {
|
||||
s := &kube.Secret{
|
||||
Data: map[string][]byte{
|
||||
"device_id": []byte(deviceID),
|
||||
},
|
||||
}
|
||||
return kc.StrategicMergePatchSecret(ctx, secretName, s, "tailscale-container")
|
||||
}
|
||||
|
||||
// storeDeviceEndpoints writes device's tailnet IPs and MagicDNS name to fields
|
||||
// 'device_ips', 'device_fqdn' of the named Kubernetes Secret.
|
||||
func storeDeviceEndpoints(ctx context.Context, secretName string, fqdn string, addresses []netip.Prefix) error {
|
||||
var ips []string
|
||||
for _, addr := range addresses {
|
||||
ips = append(ips, addr.Addr().String())
|
||||
|
@ -44,14 +42,13 @@ func storeDeviceInfo(ctx context.Context, secretName string, deviceID tailcfg.St
|
|||
return err
|
||||
}
|
||||
|
||||
m := &kube.Secret{
|
||||
s := &kube.Secret{
|
||||
Data: map[string][]byte{
|
||||
"device_id": []byte(deviceID),
|
||||
"device_fqdn": []byte(fqdn),
|
||||
"device_ips": deviceIPs,
|
||||
},
|
||||
}
|
||||
return kc.StrategicMergePatchSecret(ctx, secretName, m, "tailscale-container")
|
||||
return kc.StrategicMergePatchSecret(ctx, secretName, s, "tailscale-container")
|
||||
}
|
||||
|
||||
// deleteAuthKey deletes the 'authkey' field of the given kube
|
||||
|
|
|
@ -321,7 +321,7 @@ authLoop:
|
|||
}
|
||||
}
|
||||
|
||||
if cfg.InKubernetes && cfg.KubeSecret != "" && cfg.KubernetesCanPatch && isTwoStepConfigAuthOnce(cfg) {
|
||||
if hasKubeStateStore(cfg) && isTwoStepConfigAuthOnce(cfg) {
|
||||
// We were told to only auth once, so any secret-bound
|
||||
// authkey is no longer needed. We don't strictly need to
|
||||
// wipe it, but it's good hygiene.
|
||||
|
@ -337,11 +337,10 @@ authLoop:
|
|||
}
|
||||
|
||||
var (
|
||||
wantProxy = cfg.ProxyTargetIP != "" || cfg.ProxyTargetDNSName != "" || cfg.TailnetTargetIP != "" || cfg.TailnetTargetFQDN != "" || cfg.AllowProxyingClusterTrafficViaIngress
|
||||
wantDeviceInfo = cfg.InKubernetes && cfg.KubeSecret != "" && cfg.KubernetesCanPatch
|
||||
startupTasksDone = false
|
||||
currentIPs deephash.Sum // tailscale IPs assigned to device
|
||||
currentDeviceInfo deephash.Sum // device ID and fqdn
|
||||
startupTasksDone = false
|
||||
currentIPs deephash.Sum // tailscale IPs assigned to device
|
||||
currentDeviceID deephash.Sum // device ID
|
||||
currentDeviceEndpoints deephash.Sum // device FQDN and IPs
|
||||
|
||||
currentEgressIPs deephash.Sum
|
||||
|
||||
|
@ -355,7 +354,7 @@ authLoop:
|
|||
go watchServeConfigChanges(ctx, cfg.ServeConfigPath, certDomainChanged, certDomain, client)
|
||||
}
|
||||
var nfr linuxfw.NetfilterRunner
|
||||
if wantProxy {
|
||||
if isL3Proxy(cfg) {
|
||||
nfr, err = newNetfilterRunner(log.Printf)
|
||||
if err != nil {
|
||||
log.Fatalf("error creating new netfilter runner: %v", err)
|
||||
|
@ -440,6 +439,20 @@ runLoop:
|
|||
newCurrentIPs := deephash.Hash(&addrs)
|
||||
ipsHaveChanged := newCurrentIPs != currentIPs
|
||||
|
||||
// Store device ID in a Kubernetes Secret before
|
||||
// setting up any routing rules. This ensures
|
||||
// that, for containerboot instances that are
|
||||
// Kubernetes operator proxies, the operator is
|
||||
// able to retrieve the device ID from the
|
||||
// Kubernetes Secret to clean up tailnet nodes
|
||||
// for proxies whose route setup continuously
|
||||
// fails.
|
||||
deviceID := n.NetMap.SelfNode.StableID()
|
||||
if hasKubeStateStore(cfg) && deephash.Update(¤tDeviceID, &deviceID) {
|
||||
if err := storeDeviceID(ctx, cfg.KubeSecret, n.NetMap.SelfNode.StableID()); err != nil {
|
||||
log.Fatalf("storing device ID in Kubernetes Secret: %v", err)
|
||||
}
|
||||
}
|
||||
if cfg.TailnetTargetFQDN != "" {
|
||||
var (
|
||||
egressAddrs []netip.Prefix
|
||||
|
@ -533,15 +546,36 @@ runLoop:
|
|||
}
|
||||
currentIPs = newCurrentIPs
|
||||
|
||||
deviceInfo := []any{n.NetMap.SelfNode.StableID(), n.NetMap.SelfNode.Name()}
|
||||
if cfg.InKubernetes && cfg.KubernetesCanPatch && cfg.KubeSecret != "" && deephash.Update(¤tDeviceInfo, &deviceInfo) {
|
||||
if err := storeDeviceInfo(ctx, cfg.KubeSecret, n.NetMap.SelfNode.StableID(), n.NetMap.SelfNode.Name(), n.NetMap.SelfNode.Addresses().AsSlice()); err != nil {
|
||||
log.Fatalf("storing device ID in kube secret: %v", err)
|
||||
// Only store device FQDN and IP addresses to
|
||||
// Kubernetes Secret when any required proxy
|
||||
// route setup has succeeded. IPs and FQDN are
|
||||
// read from the Secret by the Tailscale
|
||||
// Kubernetes operator and, for some proxy
|
||||
// types, such as Tailscale Ingress, advertized
|
||||
// on the Ingress status. Writing them to the
|
||||
// Secret only after the proxy routing has been
|
||||
// set up ensures that the operator does not
|
||||
// advertize endpoints of broken proxies.
|
||||
// TODO (irbekrm): instead of using the IP and FQDN, have some other mechanism for the proxy signal that it is 'Ready'.
|
||||
deviceEndpoints := []any{n.NetMap.SelfNode.Name(), n.NetMap.SelfNode.Addresses()}
|
||||
if hasKubeStateStore(cfg) && deephash.Update(¤tDeviceEndpoints, &deviceEndpoints) {
|
||||
if err := storeDeviceEndpoints(ctx, cfg.KubeSecret, n.NetMap.SelfNode.Name(), n.NetMap.SelfNode.Addresses().AsSlice()); err != nil {
|
||||
log.Fatalf("storing device IPs and FQDN in Kubernetes Secret: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
if !startupTasksDone {
|
||||
if (!wantProxy || currentIPs != deephash.Sum{}) && (!wantDeviceInfo || currentDeviceInfo != deephash.Sum{}) {
|
||||
// For containerboot instances that act as TCP
|
||||
// proxies (proxying traffic to an endpoint
|
||||
// passed via one of the env vars that
|
||||
// containerbot reads) and store state in a
|
||||
// Kubernetes Secret, we consider startup tasks
|
||||
// done at the point when device info has been
|
||||
// successfully stored to state Secret.
|
||||
// For all other containerboot instances, if we
|
||||
// just get to this point the startup tasks can
|
||||
// be considered done.
|
||||
if !isL3Proxy(cfg) || !hasKubeStateStore(cfg) || (currentDeviceEndpoints != deephash.Sum{} && currentDeviceID != deephash.Sum{}) {
|
||||
// This log message is used in tests to detect when all
|
||||
// post-auth configuration is done.
|
||||
log.Println("Startup complete, waiting for shutdown signal")
|
||||
|
@ -1287,6 +1321,19 @@ func isOneStepConfig(cfg *settings) bool {
|
|||
return cfg.TailscaledConfigFilePath != ""
|
||||
}
|
||||
|
||||
// isL3Proxy returns true if the Tailscale node needs to be configured to act
|
||||
// as an L3 proxy, proxying to an endpoint provided via one of the config env
|
||||
// vars.
|
||||
func isL3Proxy(cfg *settings) bool {
|
||||
return cfg.ProxyTargetIP != "" || cfg.ProxyTargetDNSName != "" || cfg.TailnetTargetIP != "" || cfg.TailnetTargetFQDN != "" || cfg.AllowProxyingClusterTrafficViaIngress
|
||||
}
|
||||
|
||||
// hasKubeStateStore returns true if the state must be stored in a Kubernetes
|
||||
// Secret.
|
||||
func hasKubeStateStore(cfg *settings) bool {
|
||||
return cfg.InKubernetes && cfg.KubernetesCanPatch && cfg.KubeSecret != ""
|
||||
}
|
||||
|
||||
// tailscaledConfigFilePath returns the path to the tailscaled config file that
|
||||
// should be used for the current capability version. It is determined by the
|
||||
// TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR environment variable and looks for a
|
||||
|
|
|
@ -406,8 +406,10 @@ func sanitizeConfigBytes(c ipn.ConfigVAlpha) string {
|
|||
return string(sanitizedBytes)
|
||||
}
|
||||
|
||||
// DeviceInfo returns the device ID and hostname for the Tailscale device
|
||||
// associated with the given labels.
|
||||
// DeviceInfo returns the device ID, hostname and IPs for the Tailscale device
|
||||
// that acts as an operator proxy. It retrieves info from a Kubernetes Secret
|
||||
// labeled with the provided labels.
|
||||
// Either of device ID, hostname and IPs can be empty string if not found in the Secret.
|
||||
func (a *tailscaleSTSReconciler) DeviceInfo(ctx context.Context, childLabels map[string]string) (id tailcfg.StableNodeID, hostname string, ips []string, err error) {
|
||||
sec, err := getSingleObject[corev1.Secret](ctx, a.Client, a.operatorNamespace, childLabels)
|
||||
if err != nil {
|
||||
|
@ -424,7 +426,12 @@ func (a *tailscaleSTSReconciler) DeviceInfo(ctx context.Context, childLabels map
|
|||
// to remove it.
|
||||
hostname = strings.TrimSuffix(string(sec.Data["device_fqdn"]), ".")
|
||||
if hostname == "" {
|
||||
return "", "", nil, nil
|
||||
// Device ID gets stored and retrieved in a different flow than
|
||||
// FQDN and IPs. A device that acts as Kubernetes operator
|
||||
// proxy, but whose route setup has failed might have an device
|
||||
// ID, but no FQDN/IPs. If so, return the ID, to allow the
|
||||
// operator to clean up such devices.
|
||||
return id, "", nil, nil
|
||||
}
|
||||
if rawDeviceIPs, ok := sec.Data["device_ips"]; ok {
|
||||
if err := json.Unmarshal(rawDeviceIPs, &ips); err != nil {
|
||||
|
|
Loading…
Reference in New Issue