tailscale/wgengine/magicsock/magicsock_linux.go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause

package magicsock

import (
	"bytes"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"net"
	"net/netip"
	"strings"
	"syscall"
	"time"

	"github.com/mdlayher/socket"
	"golang.org/x/net/bpf"
	"golang.org/x/net/ipv4"
	"golang.org/x/net/ipv6"
	"golang.org/x/sys/cpu"
	"golang.org/x/sys/unix"
	"tailscale.com/disco"
	"tailscale.com/envknob"
	"tailscale.com/net/netns"
	"tailscale.com/types/ipproto"
	"tailscale.com/types/key"
	"tailscale.com/types/logger"
	"tailscale.com/types/nettype"
)

const (
	// udpHeaderSize is the size of a UDP header in bytes; the BPF programs
	// and parseUDPPacket use it to find the start of the UDP payload.
	udpHeaderSize = 8

	// discoMinHeaderSize is the minimum size of the disco header in bytes:
	// the magic prefix, followed by a 32-byte key and the nonce.
	discoMinHeaderSize = len(disco.Magic) + 32 /* key length */ + disco.NonceLen
)

var (
	// Opt-in for using raw sockets to receive disco traffic; added for
	// #13140 and replaces the older "TS_DEBUG_DISABLE_RAW_DISCO".
	//
	// When this is not set, listenRawDisco returns an error wrapping
	// errors.ErrUnsupported before creating any sockets.
	envknobEnableRawDisco = envknob.RegisterBool("TS_ENABLE_RAW_DISCO")
)

// debugRawDiscoReads enables logging of raw disco reads. When it reports
// true, receiveDisco logs the result of every read from the raw socket and
// discoLogf writes via c.logf rather than c.dlogf.
var debugRawDiscoReads = envknob.RegisterBool("TS_DEBUG_RAW_DISCO")

// These are the BPF filters that we attach to our AF_PACKET sockets so that
// only packets that look like disco packets are delivered to userspace, plus
// the canned disco packet that listenRawDisco sends to itself as a self-test.
var (
	// magicsockFilterV4 accepts unfragmented IPv4 UDP packets whose UDP
	// payload is at least discoMinHeaderSize bytes long and starts with
	// the disco magic; everything else is skipped.
	//
	// NOTE: the SkipTrue/SkipFalse values below are counted in
	// instructions relative to the instruction following the jump. If
	// you add or remove an instruction, re-count every jump that crosses
	// it.
	magicsockFilterV4 = []bpf.Instruction{
		// For raw sockets (with ETH_P_IP set), the BPF program
		// receives the entire IPv4 packet, but not the Ethernet
		// header.

		// Double-check that this is a UDP packet; we shouldn't be
		// seeing anything else given how we create our AF_PACKET
		// socket, but an extra check here is cheap, and matches the
		// check that we do in the IPv6 path.
		bpf.LoadAbsolute{Off: 9, Size: 1},
		bpf.JumpIf{Cond: bpf.JumpEqual, Val: uint32(ipproto.UDP), SkipTrue: 1, SkipFalse: 0},
		bpf.RetConstant{Val: 0x0},

		// Disco packets are so small they should never get
		// fragmented, and we don't want to handle reassembly.
		bpf.LoadAbsolute{Off: 6, Size: 2},
		// More Fragments bit set means this is part of a fragmented
		// packet. Skipping 8 instructions lands on the final "skip
		// the packet" return (not on the "accept" return just before
		// it).
		bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 0x2000, SkipTrue: 8, SkipFalse: 0},
		// Non-zero fragment offset with MF=0 means this is the last
		// fragment of packet. Skipping 7 instructions lands on the
		// same final "skip the packet" return.
		bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 0x1fff, SkipTrue: 7, SkipFalse: 0},

		// Load IP header length into X register.
		bpf.LoadMemShift{Off: 0},

		// Verify that we have a packet that's big enough to (possibly)
		// contain a disco packet.
		//
		// The length of an IPv4 disco packet is composed of:
		// - 8 bytes for the UDP header
		// - N bytes for the disco packet header
		//
		// bpf will implicitly return 0 ("skip") if attempting an
		// out-of-bounds load, so we can check the length of the packet
		// loading a byte from that offset here. We subtract 1 byte
		// from the offset to ensure that we accept a packet that's
		// exactly the minimum size.
		//
		// We use LoadIndirect; since we loaded the start of the packet's
		// payload into the X register, above, we don't need to add
		// ipv4.HeaderLen to the offset (and this properly handles IPv4
		// options).
		bpf.LoadIndirect{Off: uint32(udpHeaderSize + discoMinHeaderSize - 1), Size: 1},

		// Get the first 4 bytes of the UDP packet, compare with our magic number
		bpf.LoadIndirect{Off: udpHeaderSize, Size: 4},
		bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic1, SkipTrue: 0, SkipFalse: 3},

		// Compare the next 2 bytes
		bpf.LoadIndirect{Off: udpHeaderSize + 4, Size: 2},
		bpf.JumpIf{Cond: bpf.JumpEqual, Val: uint32(discoMagic2), SkipTrue: 0, SkipFalse: 1},

		// Accept the whole packet
		bpf.RetConstant{Val: 0xFFFFFFFF},

		// Skip the packet
		bpf.RetConstant{Val: 0x0},
	}

	// IPv6 is more complicated to filter, since we can have 0-to-N
	// extension headers following the IPv6 header. Since BPF can't
	// loop, we can't really parse these in a general way; instead, we
	// simply handle the case where we have no extension headers; any
	// packets with headers will be skipped. IPv6 extension headers
	// are sufficiently uncommon that we're willing to accept false
	// negatives here.
	//
	// The "proper" way to handle this would be to do minimal parsing in
	// BPF and more in-depth parsing of all IPv6 packets in userspace, but
	// on systems with a high volume of UDP that would be unacceptably slow
	// and thus we'd rather be conservative here and possibly not receive
	// disco packets rather than slow down the system.
	magicsockFilterV6 = []bpf.Instruction{
		// Do a bounds check to ensure we have enough space for a disco
		// packet; see the comment in the IPv4 BPF program for more
		// details.
		bpf.LoadAbsolute{Off: uint32(ipv6.HeaderLen + udpHeaderSize + discoMinHeaderSize - 1), Size: 1},

		// Verify that the 'next header' value of the IPv6 packet is
		// UDP, which is what we're expecting; if it's anything else
		// (including extension headers), we skip the packet.
		bpf.LoadAbsolute{Off: 6, Size: 1},
		bpf.JumpIf{Cond: bpf.JumpEqual, Val: uint32(ipproto.UDP), SkipTrue: 0, SkipFalse: 5},

		// Compare with our magic number. Start by loading and
		// comparing the first 4 bytes of the UDP payload.
		bpf.LoadAbsolute{Off: ipv6.HeaderLen + udpHeaderSize, Size: 4},
		bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic1, SkipTrue: 0, SkipFalse: 3},

		// Compare the next 2 bytes
		bpf.LoadAbsolute{Off: ipv6.HeaderLen + udpHeaderSize + 4, Size: 2},
		bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic2, SkipTrue: 0, SkipFalse: 1},

		// Accept the whole packet
		bpf.RetConstant{Val: 0xFFFFFFFF},

		// Skip the packet
		bpf.RetConstant{Val: 0x0},
	}

	// testDiscoPacket is the UDP payload that listenRawDisco sends to the
	// loopback address to verify that the raw socket and BPF filter
	// deliver disco-shaped packets: the 6-byte disco magic followed by an
	// all-zero 32-byte key and an all-zero 24-byte nonce.
	testDiscoPacket = []byte{
		// Disco magic
		0x54, 0x53, 0xf0, 0x9f, 0x92, 0xac,
		// Sender key
		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
		// Nonce
		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
	}
)

// listenRawDisco starts listening for disco packets on the given
// address family, which must be "ip4" or "ip6", using a raw socket
// and BPF filter.
// https://github.com/tailscale/tailscale/issues/3824
//
// Before returning, it runs a short self-test: it sends testDiscoPacket to
// the loopback address and waits (up to 100ms) for that packet to show up on
// the raw socket. On success it starts a goroutine running c.receiveDisco on
// the socket and returns the socket as an io.Closer; closing it stops the
// goroutine. On any failure the raw socket is closed and a nil Closer is
// returned along with the error.
func (c *Conn) listenRawDisco(family string) (io.Closer, error) {
	if !envknobEnableRawDisco() {
		// Return an 'errors.ErrUnsupported' to prevent the caller from
		// logging; when we switch this to an opt-out (vs. an opt-in),
		// drop the ErrUnsupported so that the caller logs that it was
		// disabled.
		return nil, fmt.Errorf("raw disco not enabled: %w", errors.ErrUnsupported)
	}

	// https://github.com/tailscale/tailscale/issues/5607
	if !netns.UseSocketMark() {
		return nil, errors.New("raw disco listening disabled, SO_MARK unavailable")
	}

	var (
		udpnet   string
		addr     string
		proto    int
		testAddr netip.AddrPort
		prog     []bpf.Instruction
	)
	switch family {
	case "ip4":
		udpnet = "udp4"
		addr = "0.0.0.0"
		proto = ethernetProtoIPv4()
		testAddr = netip.AddrPortFrom(netip.AddrFrom4([4]byte{127, 0, 0, 1}), 1)
		prog = magicsockFilterV4
	case "ip6":
		udpnet = "udp6"
		addr = "::"
		proto = ethernetProtoIPv6()
		testAddr = netip.AddrPortFrom(netip.IPv6Loopback(), 1)
		prog = magicsockFilterV6
	default:
		return nil, fmt.Errorf("unsupported address family %q", family)
	}
	isIPv6 := family == "ip6"

	asm, err := bpf.Assemble(prog)
	if err != nil {
		return nil, fmt.Errorf("assembling filter: %w", err)
	}

	sock, err := socket.Socket(
		unix.AF_PACKET,
		unix.SOCK_DGRAM,
		proto,
		"afpacket",
		nil, // no config
	)
	if err != nil {
		return nil, fmt.Errorf("creating AF_PACKET socket: %w", err)
	}

	// From here on, every failure path must release the raw socket.
	// Ownership passes to the receiveDisco goroutine (and the returned
	// io.Closer) only once 'started' is set.
	started := false
	defer func() {
		if !started {
			sock.Close()
		}
	}()

	if err := sock.SetBPF(asm); err != nil {
		return nil, fmt.Errorf("installing BPF filter: %w", err)
	}

	// If all the above succeeds, we should be ready to receive. Just
	// out of paranoia, check that we do receive a well-formed disco
	// packet.
	tc, err := net.ListenPacket(udpnet, net.JoinHostPort(addr, "0"))
	if err != nil {
		return nil, fmt.Errorf("creating disco test socket: %w", err)
	}
	defer tc.Close()
	if _, err := tc.(*net.UDPConn).WriteToUDPAddrPort(testDiscoPacket, testAddr); err != nil {
		return nil, fmt.Errorf("writing disco test packet: %w", err)
	}

	// Bound the self-test; if the test packet never shows up, the read
	// below fails with a timeout and we report raw disco as unusable.
	const selfTestTimeout = 100 * time.Millisecond
	if err := sock.SetReadDeadline(time.Now().Add(selfTestTimeout)); err != nil {
		return nil, fmt.Errorf("setting socket timeout: %w", err)
	}

	var (
		ctx = context.Background()
		buf [1500]byte
	)
	for {
		n, _, err := sock.Recvfrom(ctx, buf[:], 0)
		if err != nil {
			return nil, fmt.Errorf("reading during raw disco self-test: %w", err)
		}

		// Other disco-shaped traffic may arrive while we wait; keep
		// reading until we see our own packet or hit the deadline.
		_ /* src */, _ /* dst */, payload := parseUDPPacket(buf[:n], isIPv6)
		if payload == nil {
			continue
		}
		if !bytes.Equal(payload, testDiscoPacket) {
			c.discoLogf("listenRawDisco: self-test: received mismatched UDP packet of %d bytes", len(payload))
			continue
		}
		c.logf("[v1] listenRawDisco: self-test passed for %s", family)
		break
	}

	// Clear the self-test deadline. If this fails, the stale deadline
	// would make the receive loop fail shortly after starting while our
	// caller believes raw disco is active, so treat it as a setup error.
	if err := sock.SetReadDeadline(time.Time{}); err != nil {
		return nil, fmt.Errorf("clearing socket timeout: %w", err)
	}

	started = true
	go c.receiveDisco(sock, isIPv6)
	return sock, nil
}

// parseUDPPacket is a basic parser for UDP packets that returns the source and
// destination addresses, and the payload. The returned payload is a sub-slice
// of the input buffer.
//
// It expects to be called with a buffer that contains the entire UDP packet,
// including the IP header, and one that has been filtered with the BPF
// programs above. isIPv6 selects whether buf is parsed as an IPv6 packet
// (fixed 40-byte header, no extension headers) or as an IPv4 packet (header
// length taken from the IHL field).
//
// If the buffer is too short, or the IPv4 header length field is invalid, it
// returns the zero values for all return values; callers detect this by
// checking for a nil payload.
func parseUDPPacket(buf []byte, isIPv6 bool) (src, dst netip.AddrPort, payload []byte) {
	// First, parse the IPv4 or IPv6 header to get to the UDP header. Since
	// we assume this was filtered with BPF, we know that there will be no
	// IPv6 extension headers.
	var (
		srcIP, dstIP netip.Addr
		udp          []byte
	)
	if isIPv6 {
		// Basic length check to ensure that we don't panic
		if len(buf) < ipv6.HeaderLen+udpHeaderSize {
			return netip.AddrPort{}, netip.AddrPort{}, nil
		}

		// Extract the source and destination addresses from the IPv6
		// header.
		srcIP, _ = netip.AddrFromSlice(buf[8:24])
		dstIP, _ = netip.AddrFromSlice(buf[24:40])

		// We know that the UDP packet starts immediately after the IPv6
		// packet.
		udp = buf[ipv6.HeaderLen:]
	} else {
		// This is an IPv4 packet; read the length field from the header.
		if len(buf) < ipv4.HeaderLen {
			return netip.AddrPort{}, netip.AddrPort{}, nil
		}

		// The low nibble of the first byte is the header length in
		// 32-bit words. AF_PACKET sockets see packets before the IP
		// stack has validated them, so reject a header length that is
		// shorter than the fixed IPv4 header; otherwise we'd interpret
		// bytes of the IP header itself as the UDP header.
		udpOffset := int(buf[0]&0x0F) << 2
		if udpOffset < ipv4.HeaderLen || udpOffset+udpHeaderSize > len(buf) {
			return netip.AddrPort{}, netip.AddrPort{}, nil
		}

		// Parse the source and destination IPs.
		srcIP, _ = netip.AddrFromSlice(buf[12:16])
		dstIP, _ = netip.AddrFromSlice(buf[16:20])
		udp = buf[udpOffset:]
	}

	// Parse the ports
	srcPort := binary.BigEndian.Uint16(udp[0:2])
	dstPort := binary.BigEndian.Uint16(udp[2:4])

	// The payload starts after the UDP header.
	payload = udp[udpHeaderSize:]
	return netip.AddrPortFrom(srcIP, srcPort), netip.AddrPortFrom(dstIP, dstPort), payload
}

// ethernetProtoIPv4 returns the value of unix.ETH_P_IP (0x0800) laid out in
// network byte order, as required for the 'protocol' argument when creating
// a packet(7) socket; see:
//
//	https://man7.org/linux/man-pages/man7/packet.7.html
//
// Rather than calling htons at runtime, the two possible results are
// hardcoded and selected by the host's endianness; a test verifies that
// they are correct.
func ethernetProtoIPv4() int {
	if !cpu.IsBigEndian {
		// Little-endian host: the two bytes are swapped.
		return 0x0008
	}
	return 0x0800
}

// ethernetProtoIPv6 is the IPv6 counterpart of ethernetProtoIPv4: it returns
// the value of unix.ETH_P_IPV6 (0x86dd) laid out in network byte order,
// selected by the host's endianness.
func ethernetProtoIPv6() int {
	if !cpu.IsBigEndian {
		// Little-endian host: the two bytes are swapped.
		return 0xdd86
	}
	return 0x86dd
}

// discoLogf logs a message about raw disco handling. By default it goes
// through c.dlogf; when the TS_DEBUG_RAW_DISCO envknob is set, it is sent
// to c.logf instead.
func (c *Conn) discoLogf(format string, args ...any) {
	if !debugRawDiscoReads() {
		c.dlogf(format, args...)
		return
	}
	c.logf(format, args...)
}

// receiveDisco reads packets from the raw socket pc in a loop and passes the
// UDP payload of every packet addressed to our current local UDP port (for
// the address family selected by isIPV6) to c.handleDiscoMessage. It returns
// when a read from pc fails, including when pc is closed.
func (c *Conn) receiveDisco(pc *socket.Conn, isIPV6 bool) {
	// This function is the top of a goroutine and parses raw packets; a
	// panic that unwound past it would kill the whole process. Be
	// defensive and turn any panic into a log line instead.
	defer func() {
		if r := recover(); r != nil {
			c.logf("[unexpected] recovered from panic in receiveDisco(isIPv6=%v): %v", isIPV6, r)
		}
	}()

	// Loggers tagged with the address family we're serving.
	family := "ip4"
	if isIPV6 {
		family = "ip6"
	}
	prefix := "disco raw " + family + ": "
	logf := logger.WithPrefix(c.logf, prefix)
	dlogf := logger.WithPrefix(c.discoLogf, prefix)

	ctx := context.Background()
	var pkt [1500]byte
	for {
		n, from, err := pc.Recvfrom(ctx, pkt[:], 0)
		if debugRawDiscoReads() {
			logf("read from %s = (%v, %v)", printSockaddr(from), n, err)
		}
		if err != nil {
			// A closed socket is the normal way for this loop to
			// end, so only other errors are worth logging.
			closed := errors.Is(err, net.ErrClosed) || err.Error() == "use of closed file"
			if !closed {
				logf("reader failed: %v", err)
			}
			return
		}

		srcAddr, dstAddr, payload := parseUDPPacket(pkt[:n], isIPV6)
		if payload == nil {
			// The packet could not be parsed; ignore it.
			continue
		}

		dstPort := dstAddr.Port()
		if dstPort == 0 {
			logf("[unexpected] received packet for port 0")
		}

		// Look up our local port on every iteration rather than caching
		// it before the loop, because it can change if/when we rebind.
		var acceptPort uint16
		if isIPV6 {
			acceptPort = c.pconn6.Port()
		} else {
			acceptPort = c.pconn4.Port()
		}

		switch {
		case acceptPort == 0:
			// This should only typically happen if the receiving
			// address family was recently disabled.
			dlogf("[v1] dropping packet for port %d as acceptPort=0", dstPort)
			continue
		case dstPort != acceptPort:
			// Not for our local port: it might be for another
			// Tailscale process on the same machine, or NATed to a
			// different machine if this is a router, etc.
			dlogf("[v1] dropping packet for port %d that isn't our local port", dstPort)
			continue
		}

		if isIPV6 {
			metricRecvDiscoPacketIPv6.Add(1)
		} else {
			metricRecvDiscoPacketIPv4.Add(1)
		}

		c.handleDiscoMessage(payload, srcAddr, key.NodePublic{}, discoRXPathRawSocket)
	}
}

// printSockaddr is a helper function to pretty-print various sockaddr types.
//
// IPv4 and IPv6 socket addresses are rendered as "addr:port". Link-layer
// addresses are rendered as "link(ty=0x<protocol>,if=<ifindex>):[aa:bb:...]".
// Any other type (including a nil Sockaddr) is rendered as "unknown(<type>)".
func printSockaddr(sa unix.Sockaddr) string {
	switch sa := sa.(type) {
	case *unix.SockaddrInet4:
		addr := netip.AddrFrom4(sa.Addr)
		return netip.AddrPortFrom(addr, uint16(sa.Port)).String()
	case *unix.SockaddrInet6:
		addr := netip.AddrFrom16(sa.Addr)
		return netip.AddrPortFrom(addr, uint16(sa.Port)).String()
	case *unix.SockaddrLinklayer:
		// Halen is reported by the kernel and can be larger than the
		// fixed-size Addr array for link types with long hardware
		// addresses. Clamp it so that we print what we have instead of
		// panicking on an out-of-range slice.
		halen := int(sa.Halen)
		if halen > len(sa.Addr) {
			halen = len(sa.Addr)
		}
		hwaddr := sa.Addr[:halen]

		var buf strings.Builder
		fmt.Fprintf(&buf, "link(ty=0x%04x,if=%d):[", sa.Protocol, sa.Ifindex)
		for i, b := range hwaddr {
			if i > 0 {
				buf.WriteByte(':')
			}
			fmt.Fprintf(&buf, "%02x", b)
		}
		buf.WriteByte(']')
		return buf.String()
	default:
		return fmt.Sprintf("unknown(%T)", sa)
	}
}

// trySetSocketBuffer attempts to set SO_SNDBUFFORCE and SO_RCVBUFFORCE which
// can overcome the limit of net.core.{r,w}mem_max, but require CAP_NET_ADMIN.
// It falls back to the portable implementation if that fails, which may be
// silently capped to net.core.{r,w}mem_max.
//
// It does nothing if pconn is not a *net.UDPConn. Failures are reported via
// logf and are never fatal.
func trySetSocketBuffer(pconn nettype.PacketConn, logf logger.Logf) {
	c, ok := pconn.(*net.UDPConn)
	if !ok {
		return
	}

	var errRcv, errSnd error
	rc, err := c.SyscallConn()
	if err == nil {
		// Control itself can fail, in which case the callback never
		// ran and neither buffer was force-set. Keep that error so
		// that we still take the portable fallback below.
		err = rc.Control(func(fd uintptr) {
			errRcv = syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, socketBufferSize)
			if errRcv != nil {
				logf("magicsock: [warning] failed to force-set UDP read buffer size to %d: %v; using kernel default values (impacts throughput only)", socketBufferSize, errRcv)
			}
			errSnd = syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, socketBufferSize)
			if errSnd != nil {
				logf("magicsock: [warning] failed to force-set UDP write buffer size to %d: %v; using kernel default values (impacts throughput only)", socketBufferSize, errSnd)
			}
		})
	}

	if err != nil || errRcv != nil || errSnd != nil {
		portableTrySetSocketBuffer(pconn, logf)
	}
}

// controlMessageSize is the size in bytes of a control message buffer; its
// real value is assigned by an init function in this file. It starts out
// negative so that using it as an allocation size before then fails loudly.
var controlMessageSize = -1 // bomb if used for allocation before init

func init() {
	// A UDP_GRO or UDP_SEGMENT control message carries a single uint16
	// of data, so size controlMessageSize to hold one control message
	// with a 2-byte payload.
	const cmsgDataLen = 2 // size of a uint16, in bytes
	controlMessageSize = unix.CmsgSpace(cmsgDataLen)
}