tailscale/util/deephash/deephash.go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause

// Package deephash hashes a Go value recursively, in a predictable order,
// without looping. The hash is only valid within the lifetime of a program.
// Users should not store the hash on disk or send it over the network.
// The hash is sufficiently strong and unique such that
// Hash(&x) == Hash(&y) is an appropriate replacement for x == y.
//
// The definition of equality is identical to reflect.DeepEqual except:
//   - Floating-point values are compared based on the raw bits,
//     which means that NaNs (with the same bit pattern) are treated as equal.
//   - time.Time are compared based on whether they are the same instant in time
//     and also in the same zone offset. Monotonic measurements and zone names
//     are ignored as part of the hash.
//   - netip.Addr are compared based on a shallow comparison of the struct.
//
// WARNING: This package, like most of the tailscale.com Go module,
// should be considered Tailscale-internal; we make no API promises.
//
// # Cycle detection
//
// This package correctly handles cycles in the value graph,
// but in a way that is potentially pathological in some situations.
//
// The algorithm for cycle detection operates by
// pushing a pointer onto a stack whenever deephash is visiting a pointer and
// popping the pointer from the stack after deephash is leaving the pointer.
// Before visiting a new pointer, deephash checks whether it has already been
// visited on the pointer stack. If so, it hashes the index of the pointer
// on the stack and avoids visiting the pointer.
//
// This algorithm is guaranteed to detect cycles, but may expand pointers
// more often than a potential alternate algorithm that remembers all pointers
// ever visited in a map. The current algorithm uses O(D) memory, where D
// is the maximum depth of the recursion, while the alternate algorithm
// would use O(P) memory where P is all pointers ever seen, which can be a lot,
// and most of which may have nothing to do with cycles.
// Also, the alternate algorithm has to deal with challenges of producing
// deterministic results when pointers are visited in non-deterministic ways
// such as when iterating through a Go map. The stack-based algorithm avoids
// this challenge since the stack is always deterministic regardless of
// non-deterministic iteration order of Go maps.
//
// To concretely see how this algorithm can be pathological,
// consider the following data structure:
//
//	var big *Item = ... // some large data structure that is slow to hash
//	var manyBig []*Item
//	for i := 0; i < 1000; i++ {
//		manyBig = append(manyBig, big)
//	}
//	deephash.Hash(manyBig)
//
// Here, the manyBig data structure is not even cyclic.
// We have the same big *Item being stored multiple times in a []*Item.
// When deephash hashes []*Item, it hashes each individual *Item
// not realizing that it had just done the computation earlier.
// To avoid the pathological situation, Item should implement [SelfHasher] and
// memoize attempts to hash itself.
package deephash

// TODO: Add option to teach deephash to memoize the Hash result of particular types?

import (
	"crypto/sha256"
	"encoding/binary"
	"encoding/hex"
	"fmt"
	"reflect"
	"sync"
	"time"

	"tailscale.com/util/hashx"
	"tailscale.com/util/set"
)

// There is much overlap between the theory of serialization and hashing.
// A hash (useful for determining equality) can be produced by printing a value
// and hashing the output. The format must:
//	* be deterministic such that the same value hashes to the same output, and
//	* be parsable such that the same value can be reproduced by the output.
//
// The logic below hashes a value by printing it to a hash.Hash.
// To be parsable, it assumes that we know the Go type of each value:
//	* scalar types (e.g., bool or int32) are directly printed as their
//	  underlying memory representation.
//	* list types (e.g., strings and slices) are prefixed by a
//	  fixed-width length field, followed by the contents of the list.
//	* slices, arrays, and structs print each element/field consecutively.
//	* interfaces print with a 1-byte prefix indicating whether it is nil.
//	  If non-nil, it is followed by a fixed-width field of the type index,
//	  followed by the format of the underlying value.
//	* pointers print with a 1-byte prefix indicating whether the pointer is
//	  1) nil, 2) previously seen, or 3) newly seen. Previously seen pointers are
//	  followed by a fixed-width field with the index of the previous pointer.
//	  Newly seen pointers are followed by the format of the underlying value.
//	* maps print with a 1-byte prefix indicating whether the map pointer is
//	  1) nil, 2) previously seen, or 3) newly seen. Previously seen pointers
//	  are followed by a fixed-width field of the index of the previous pointer.
//	  Newly seen maps are printed with a fixed-width length field, followed by
//	  a fixed-width field with the XOR of the hash of every map entry.
//	  With a sufficiently strong hash, this value is theoretically "parsable"
//	  by looking up the hash in a magical map that returns the set of entries
//	  for that given hash.

// SelfHasher is implemented by types that can compute their own hash
// by writing values through the provided [Hasher] parameter.
// Implementations must not leak the provided [Hasher].
//
// If the implementation of SelfHasher recursively calls [deephash.Hash],
// then infinite recursion is quite likely to occur.
// To avoid this, use a type definition to drop methods before calling [deephash.Hash]:
//
//	func (v *MyType) Hash(h deephash.Hasher) {
//		v.hashMu.Lock()
//		defer v.hashMu.Unlock()
//		if v.dirtyHash {
//			type MyTypeWithoutMethods MyType // type define MyType to drop Hash method
//			v.dirtyHash = false              // clear out dirty bit to avoid hashing over it
//			v.hashSum = deephash.Sum{}       // clear out hashSum to avoid hashing over it
//			v.hashSum = deephash.Hash((*MyTypeWithoutMethods)(v))
//		}
//		h.HashSum(v.hashSum)
//	}
//
// In the above example, we acquire a lock since it is possible that deephash
// is called in a concurrent manner, which implies that MyType.Hash may also
// be called in a concurrent manner. Whether this lock is necessary is
// application-dependent and left as an exercise to the reader.
// Also, the example assumes that dirtyHash is set elsewhere by application
// logic whenever a mutation is made to MyType that would alter the hash.
type SelfHasher interface {
	// Hash writes the receiver's hash contribution through the provided
	// Hasher. The Hasher must not be retained after Hash returns.
	Hash(Hasher)
}

// Hasher is a value passed to [SelfHasher.Hash] that allows implementations
// to hash themselves in a structured manner.
type Hasher struct {
	h *hashx.Block512
}

// HashBytes hashes a sequence of bytes b.
// The length of b is not explicitly hashed.
func (h Hasher) HashBytes(b []byte) {
	h.h.HashBytes(b)
}

// HashString hashes the string data of s.
// The length of s is not explicitly hashed.
func (h Hasher) HashString(s string) {
	h.h.HashString(s)
}

// HashUint8 hashes a uint8.
func (h Hasher) HashUint8(n uint8) {
	h.h.HashUint8(n)
}

// HashUint16 hashes a uint16.
func (h Hasher) HashUint16(n uint16) {
	h.h.HashUint16(n)
}

// HashUint32 hashes a uint32.
func (h Hasher) HashUint32(n uint32) {
	h.h.HashUint32(n)
}

// HashUint64 hashes a uint64.
func (h Hasher) HashUint64(n uint64) {
	h.h.HashUint64(n)
}

// HashSum hashes a [Sum].
func (h Hasher) HashSum(s Sum) {
	// NOTE: Avoid calling h.HashBytes since it escapes b,
	// which would force s to be heap allocated.
	// Instead, feed the checksum as consecutive little-endian 64-bit words.
	const wordSize = 8
	for off := 0; off < len(s.sum); off += wordSize {
		word := binary.LittleEndian.Uint64(s.sum[off : off+wordSize])
		h.h.HashUint64(word)
	}
}

// hasher is reusable state for hashing a value.
// Get one via hasherPool.
type hasher struct {
	// Block512 is the embedded block hasher that receives all hashed data.
	// Its Hash field is installed by reset the first time it is found nil.
	hashx.Block512
	// visitStack records the pointers currently being visited; the slice,
	// map, and pointer hashers consult it to detect cycles.
	visitStack visitStack
}

// hasherPool recycles hasher values between hashing calls.
// Hash and the functions returned by HasherForType take a hasher from the
// pool, call reset on it before use, and put it back when they return.
var hasherPool = &sync.Pool{
	New: func() any { return new(hasher) },
}

// reset prepares h for hashing a new value.
// The SHA-256 state backing the embedded Block512 is created the first
// time reset finds it missing and is kept for later calls.
func (h *hasher) reset() {
	blk := &h.Block512
	if blk.Hash == nil {
		blk.Hash = sha256.New()
	}
	blk.Reset()
}

// hashType hashes a reflect.Type by hashing the address that backs it.
// The hash is only consistent within the lifetime of a program.
func (h *hasher) hashType(t reflect.Type) {
	// This approach relies on reflect.Type always being backed by a unique
	// *reflect.rtype pointer. A safer approach is to use a global sync.Map
	// that maps reflect.Type to some arbitrary and unique index.
	// While safer, it requires global state with memory that can never be GC'd.
	typeValue := reflect.ValueOf(t)
	h.HashUint64(uint64(typeValue.Pointer())) // address of *reflect.rtype
}

// sum finalizes the data hashed into h so far and returns it as a Sum.
func (h *hasher) sum() (s Sum) {
	// Hand Sum a zero-length slice over s.sum as its destination
	// (presumably hash.Hash append semantics, so the digest lands in s.sum
	// without a separate buffer — confirm against hashx.Block512).
	dst := s.sum[:0]
	h.Sum(dst)
	return s
}

// Sum is an opaque checksum type that is comparable.
type Sum struct {
	sum [sha256.Size]byte
}

func (s1 *Sum) xor(s2 Sum) {
	for i := 0; i < sha256.Size; i++ {
		s1.sum[i] ^= s2.sum[i]
	}
}

func (s Sum) String() string {
	// Note: if we change this, keep in sync with AppendTo
	return hex.EncodeToString(s.sum[:])
}

// AppendTo appends the string encoding of this sum (as returned by the String
// method) to the provided byte slice and returns the extended buffer.
func (s Sum) AppendTo(b []byte) []byte {
	// TODO: switch to upstream implementation if accepted:
	// https://github.com/golang/go/issues/53693
	var lb [len(s.sum) * 2]byte
	hex.Encode(lb[:], s.sum[:])
	return append(b, lb[:]...)
}

var (
	// seedOnce guards the one-time call to initSeed.
	seedOnce sync.Once
	// seed is mixed into every hash before any other data.
	// It is set by initSeed.
	seed uint64
)

// initSeed sets seed from the current wall-clock time in nanoseconds.
// It is meant to be run through seedOnce.
func initSeed() {
	now := time.Now()
	seed = uint64(now.UnixNano())
}

// Hash returns the hash of v.
//
// The hash covers, in order: the per-process seed, the type T,
// a one-byte marker telling a nil v apart from a non-nil one,
// and (for non-nil v) the value that v points to.
func Hash[T any](v *T) Sum {
	h := hasherPool.Get().(*hasher)
	defer hasherPool.Put(h)
	h.reset()
	seedOnce.Do(initSeed)
	h.HashUint64(seed)

	// Always treat the Hash input as if it were an interface by including
	// a hash of the type. This ensures that hashing of two different types
	// but with the same value structure produces different hashes.
	typ := reflect.TypeFor[T]()
	h.hashType(typ)

	if v == nil {
		h.HashUint8(0) // indicates nil
		return h.sum()
	}

	h.HashUint8(1) // indicates visiting pointer element
	target := pointerOf(reflect.ValueOf(v))
	hashValue := lookupTypeHasher(typ)
	hashValue(h, target)
	return h.sum()
}

// Option is an optional argument to HasherForType.
//
// Options are created with IncludeFields or ExcludeFields.
type Option interface {
	// isOption is an unexported marker method, so only types declared in
	// this package can satisfy Option.
	isOption()
}

// fieldFilterOpt is the Option produced by IncludeFields and ExcludeFields.
// It selects which fields of struct type t take part in hashing.
type fieldFilterOpt struct {
	t              reflect.Type    // type the filter was created for
	fields         set.Set[string] // field names given when the filter was created
	includeOnMatch bool            // true to include fields, false to exclude them
}

// isOption marks fieldFilterOpt as an Option.
func (fieldFilterOpt) isOption() {}

// filterStructField reports whether sf should be hashed.
// A field whose name is in f.fields is kept exactly when f.includeOnMatch
// is true; any other field is kept exactly when it is false.
func (f fieldFilterOpt) filterStructField(sf reflect.StructField) (include bool) {
	named := f.fields.Contains(sf.Name)
	return named == f.includeOnMatch
}

// IncludeFields returns an option that modifies the hashing for T to only
// include the named struct fields.
//
// T must be a struct type, and must match the type of the value passed to
// HasherForType.
//
// IncludeFields panics if a name in fields is not a field of T.
func IncludeFields[T any](fields ...string) Option {
	const includeOnMatch = true
	return newFieldFilter[T](includeOnMatch, fields)
}

// ExcludeFields returns an option that modifies the hashing for T to include
// all struct fields of T except those provided in fields.
//
// T must be a struct type, and must match the type of the value passed to
// HasherForType.
//
// ExcludeFields panics if a name in fields is not a field of T.
func ExcludeFields[T any](fields ...string) Option {
	const includeOnMatch = false
	return newFieldFilter[T](includeOnMatch, fields)
}

// newFieldFilter builds the fieldFilterOpt behind IncludeFields and
// ExcludeFields. include selects whether the named fields are the ones to
// keep (true) or the ones to drop (false).
//
// It panics if any name in fields is not a field of T.
func newFieldFilter[T any](include bool, fields []string) Option {
	typ := reflect.TypeFor[T]()
	names := set.Set[string]{}
	for _, name := range fields {
		_, found := typ.FieldByName(name)
		if !found {
			panic(fmt.Sprintf("unknown field %q for type %v", name, typ))
		}
		names.Add(name)
	}
	return fieldFilterOpt{
		t:              typ,
		fields:         names,
		includeOnMatch: include,
	}
}

// HasherForType returns a hash that is specialized for the provided type.
//
// HasherForType panics if the opts are invalid for the provided type.
//
// Currently, at most one option can be provided (IncludeFields or
// ExcludeFields) and its type must match the type of T. Those restrictions may
// be removed in the future, along with documentation about their precedence
// when combined.
func HasherForType[T any](opts ...Option) func(*T) Sum {
	seedOnce.Do(initSeed)
	if len(opts) > 1 {
		panic("HasherForType only accepts one optional argument") // for now
	}

	typ := reflect.TypeFor[T]()
	var hashValue typeHasherFunc
	for _, opt := range opts {
		filter, isFilter := opt.(fieldFilterOpt)
		if !isFilter {
			panic(fmt.Sprintf("unknown HasherOpt %T", opt))
		}
		if typ.Kind() != reflect.Struct {
			panic("HasherForStructTypeWithFieldFilter requires T of kind struct")
		}
		if typ != filter.t {
			panic(fmt.Sprintf("field filter for type %v does not match HasherForType type %v", filter.t, typ))
		}
		// A filtered struct hasher is built fresh rather than taken from the
		// shared per-type cache.
		hashValue = makeStructHasher(typ, filter.filterStructField)
	}
	if hashValue == nil {
		hashValue = lookupTypeHasher(typ)
	}

	// The returned function mirrors Hash, with the type lookup and hasher
	// selection already done above.
	return func(v *T) Sum {
		h := hasherPool.Get().(*hasher)
		defer hasherPool.Put(h)
		h.reset()
		h.HashUint64(seed)

		h.hashType(typ)
		if v == nil {
			h.HashUint8(0) // indicates nil
			return h.sum()
		}
		h.HashUint8(1) // indicates visiting pointer element
		target := pointerOf(reflect.ValueOf(v))
		hashValue(h, target)
		return h.sum()
	}
}

// Update sets last to the hash of v and reports whether its value changed.
// When the hash is unchanged, last is left untouched.
func Update[T any](last *Sum, v *T) (changed bool) {
	next := Hash(v)
	if next == *last {
		return false
	}
	*last = next
	return true
}

// typeHasherFunc hashes the value pointed at by p for a given type.
// For example, if t is a bool, then p is a *bool.
// The provided pointer must always be non-nil.
//
// Hashers are built by makeTypeHasher and shared through lookupTypeHasher.
type typeHasherFunc func(h *hasher, p pointer)

// typeHasherCache memoizes the hasher built for each type.
var typeHasherCache sync.Map // map[reflect.Type]typeHasherFunc

// lookupTypeHasher returns the hasher for t, building and caching it on the
// first request.
func lookupTypeHasher(t reflect.Type) typeHasherFunc {
	cached, found := typeHasherCache.Load(t)
	if !found {
		// Concurrent callers may each build a hasher for t; LoadOrStore
		// makes all of them return whichever one was stored first.
		built := makeTypeHasher(t)
		cached, _ = typeHasherCache.LoadOrStore(t, built)
	}
	return cached.(typeHasherFunc)
}

// makeTypeHasher builds the hasher for t. It checks, in order:
// types with dedicated hashers, types that implement SelfHasher,
// types whose memory can be hashed directly, and finally dispatches on
// the kind of t.
func makeTypeHasher(t reflect.Type) typeHasherFunc {
	// Types with specific hashing.
	if t == timeTimeType {
		return hashTime
	}
	if t == netipAddrType {
		return hashAddr
	}

	kind := t.Kind()

	// Types that implement their own hashing.
	// Pointer and interface kinds are skipped here and handled by the
	// kind-based dispatch below.
	if kind != reflect.Pointer && kind != reflect.Interface {
		// A method can be implemented on either the value receiver or pointer receiver.
		if t.Implements(selfHasherType) || reflect.PointerTo(t).Implements(selfHasherType) {
			return makeSelfHasher(t)
		}
	}

	// Types that can have their memory representation directly hashed.
	if typeIsMemHashable(t) {
		return makeMemHasher(t.Size())
	}

	switch kind {
	case reflect.String:
		return hashString
	case reflect.Array:
		return makeArrayHasher(t)
	case reflect.Slice:
		return makeSliceHasher(t)
	case reflect.Struct:
		return makeStructHasher(t, keepAllStructFields)
	case reflect.Map:
		return makeMapHasher(t)
	case reflect.Pointer:
		return makePointerHasher(t)
	case reflect.Interface:
		return makeInterfaceHasher(t)
	}

	// Func, Chan, UnsafePointer: contribute nothing to the hash.
	return func(*hasher, pointer) {}
}

// hashTime hashes the time.Time that p points to as its Unix seconds,
// its nanosecond component, and its zone offset.
func hashTime(h *hasher, p pointer) {
	// Include the zone offset (but not the name) to keep
	// Hash(t1) == Hash(t2) being semantically equivalent to
	// t1.Format(time.RFC3339Nano) == t2.Format(time.RFC3339Nano).
	ts := *p.asTime()
	_, zoneOffset := ts.Zone()
	seconds, nanos := ts.Unix(), ts.Nanosecond()

	h.HashUint64(uint64(seconds))
	h.HashUint32(uint32(nanos))
	h.HashUint32(uint32(zoneOffset))
}

// hashAddr hashes the netip.Addr that p points to.
// Each case starts with a 64-bit tag: 0 for an invalid address, 4 for IPv4,
// and 16 plus the zone length for IPv6.
func hashAddr(h *hasher, p pointer) {
	// The formatting of netip.Addr covers the
	// IP version, the address, and the optional zone name (for v6).
	// This is equivalent to a1.MarshalBinary() == a2.MarshalBinary().
	addr := *p.asAddr()

	if !addr.IsValid() {
		h.HashUint64(0)
		return
	}
	if addr.Is4() {
		octets := addr.As4()
		h.HashUint64(4)
		h.HashUint32(binary.LittleEndian.Uint32(octets[:]))
		return
	}
	if addr.Is6() {
		octets := addr.As16()
		zone := addr.Zone()
		h.HashUint64(16 + uint64(len(zone)))
		h.HashUint64(binary.LittleEndian.Uint64(octets[:8]))
		h.HashUint64(binary.LittleEndian.Uint64(octets[8:]))
		h.HashString(zone)
	}
}

// makeSelfHasher returns a hasher for a type t that implements SelfHasher.
// The returned hasher delegates to the value's own Hash method, giving it a
// Hasher that wraps the hasher's embedded Block512.
func makeSelfHasher(t reflect.Type) typeHasherFunc {
	return func(h *hasher, p pointer) {
		self := p.asValue(t).Interface().(SelfHasher)
		self.Hash(Hasher{h: &h.Block512})
	}
}

// hashString hashes the string that p points to as a fixed-width length
// followed by the string data.
func hashString(h *hasher, p pointer) {
	str := *p.asString()
	size := uint64(len(str))
	h.HashUint64(size)
	h.HashString(str)
}

// makeMemHasher returns a hasher that hashes the n bytes of memory
// that its pointer argument refers to.
func makeMemHasher(n uintptr) typeHasherFunc {
	return func(h *hasher, p pointer) {
		raw := p.asMemory(n)
		h.HashBytes(raw)
	}
}

// makeArrayHasher returns a hasher for array type t that hashes each element
// consecutively. No length is hashed.
//
// The element hasher is looked up on first use rather than at construction
// (presumably so that building hashers for self-referential types does not
// recurse — confirm against typeIsRecursive's callers).
func makeArrayHasher(t reflect.Type) typeHasherFunc {
	elemType := t.Elem()
	count := t.Len()          // number of array elements
	stride := elemType.Size() // byte size of each array element

	var (
		resolveOnce sync.Once
		hashElem    typeHasherFunc
	)
	resolve := func() {
		hashElem = lookupTypeHasher(elemType)
	}

	return func(h *hasher, p pointer) {
		resolveOnce.Do(resolve)
		for i := 0; i < count; i++ {
			elem := p.arrayIndex(i, stride)
			hashElem(h, elem)
		}
	}
}

// makeSliceHasher returns a hasher for slice type t.
//
// A nil slice is hashed as the single byte 0. Otherwise the hasher emits the
// byte 1, the slice length, and then the elements. When the element type is
// directly memory-hashable, the whole backing array is hashed in one call.
// Otherwise each element is hashed individually and, if t is recursive,
// each element is tracked on the visit stack to detect cycles.
func makeSliceHasher(t reflect.Type) typeHasherFunc {
	elemType := t.Elem()
	elemSize := elemType.Size() // byte size of each slice element

	// Fast path: elements have no indirection, so hash the backing memory.
	if typeIsMemHashable(elemType) {
		return func(h *hasher, p pointer) {
			base := p.sliceArray()
			if base.isNil() {
				h.HashUint8(0) // indicates nil
				return
			}
			h.HashUint8(1) // indicates visiting slice
			length := p.sliceLen()
			mem := base.asMemory(uintptr(length) * elemSize)
			h.HashUint64(uint64(length))
			h.HashBytes(mem)
		}
	}

	var (
		resolveOnce sync.Once
		hashElem    typeHasherFunc
	)
	resolve := func() {
		plain := lookupTypeHasher(elemType)
		if !typeIsRecursive(t) {
			hashElem = plain
			return
		}
		// Wrap the element hasher so that an element already on the
		// visit stack is hashed as a back-reference instead of revisited.
		hashElem = func(h *hasher, p pointer) {
			idx, onStack := h.visitStack.seen(p.p)
			if onStack {
				h.HashUint8(2) // indicates cycle
				h.HashUint64(uint64(idx))
				return
			}
			h.HashUint8(1) // indicates visiting slice element
			h.visitStack.push(p.p)
			defer h.visitStack.pop(p.p)
			plain(h, p)
		}
	}

	return func(h *hasher, p pointer) {
		base := p.sliceArray()
		if base.isNil() {
			h.HashUint8(0) // indicates nil
			return
		}
		resolveOnce.Do(resolve)
		h.HashUint8(1) // indicates visiting slice
		length := p.sliceLen()
		h.HashUint64(uint64(length))
		for i := 0; i < length; i++ {
			hashElem(h, base.arrayIndex(i, elemSize))
		}
	}
}

// keepAllStructFields is the struct field filter that keeps every field.
func keepAllStructFields(reflect.StructField) bool {
	return true
}

// makeStructHasher returns a hasher for struct type t that hashes, in
// declaration order, every field for which keepField reports true.
//
// Kept fields whose memory is directly hashable and that sit back to back in
// memory are merged into a single run, which is hashed with one HashBytes
// call. All other kept fields are hashed with the hasher for their type.
// The field analysis runs once, on first use of the returned hasher.
func makeStructHasher(t reflect.Type, keepField func(reflect.StructField) bool) typeHasherFunc {
	type fieldHasher struct {
		idx    int // index of field for reflect.Type.Field(n); negative if memory is directly hashable
		keep   bool
		hash   typeHasherFunc // only valid if idx is not negative
		offset uintptr
		size   uintptr
	}
	var once sync.Once
	var fields []fieldHasher
	init := func() {
		for i, numField := 0, t.NumField(); i < numField; i++ {
			sf := t.Field(i)
			f := fieldHasher{i, keepField(sf), nil, sf.Offset, sf.Type.Size()}
			// Only kept fields are marked as mem-hashable, so a dropped
			// field is never merged into a run below.
			if f.keep && typeIsMemHashable(sf.Type) {
				f.idx = -1
			}

			// Combine with previous field if both contiguous and mem-hashable.
			if f.idx < 0 && len(fields) > 0 {
				if last := &fields[len(fields)-1]; last.idx < 0 && last.offset+last.size == f.offset {
					last.size += f.size
					continue
				}
			}
			fields = append(fields, f)
		}

		// Resolve the per-type hasher for every entry that is not a
		// mem-hashable run.
		for i, f := range fields {
			if f.idx >= 0 {
				fields[i].hash = lookupTypeHasher(t.Field(f.idx).Type)
			}
		}
	}

	return func(h *hasher, p pointer) {
		once.Do(init)
		for _, field := range fields {
			if !field.keep {
				continue
			}
			pf := p.structField(field.idx, field.offset, field.size)
			if field.idx < 0 {
				// Mem-hashable run: hash its bytes directly.
				h.HashBytes(pf.asMemory(field.size))
			} else {
				field.hash(h, pf)
			}
		}
	}
}

// makeMapHasher returns a hasher for map type t.
//
// A nil map is hashed as the single byte 0. If t is recursive and the map is
// already on the visit stack, the hasher emits the byte 2 and the map's index
// on the stack. Otherwise it emits the byte 1, the number of entries, and the
// XOR of the hashes of all entries, where each entry is hashed (key, then
// value) with a separate pooled hasher.
func makeMapHasher(t reflect.Type) typeHasherFunc {
	var once sync.Once
	var hashKey, hashValue typeHasherFunc
	var isRecursive bool
	init := func() {
		hashKey = lookupTypeHasher(t.Key())
		hashValue = lookupTypeHasher(t.Elem())
		isRecursive = typeIsRecursive(t)
	}

	return func(h *hasher, p pointer) {
		v := p.asValue(t).Elem() // reflect.Map kind
		if v.IsNil() {
			h.HashUint8(0) // indicates nil
			return
		}
		once.Do(init)
		if isRecursive {
			pm := v.UnsafePointer() // underlying pointer of map
			if idx, ok := h.visitStack.seen(pm); ok {
				h.HashUint8(2) // indicates cycle
				h.HashUint64(uint64(idx))
				return
			}
			h.visitStack.push(pm)
			defer h.visitStack.pop(pm)
		}
		h.HashUint8(1) // indicates visiting map entries
		h.HashUint64(uint64(v.Len()))

		mh := mapHasherPool.Get().(*mapHasher)
		defer mapHasherPool.Put(mh)

		// Hash a map in a sort-free manner.
		// It relies on a map being an unordered set of KV entries.
		// So long as we hash each KV entry together, we can XOR all the
		// individual hashes to produce a unique hash for the entire map.
		// XOR is commutative, so the result does not depend on the order
		// in which the entries are iterated.
		k := mh.valKey.get(v.Type().Key())
		e := mh.valElem.get(v.Type().Elem())
		mh.sum = Sum{}
		mh.h.visitStack = h.visitStack // always use the parent's visit stack to avoid cycles
		for iter := v.MapRange(); iter.Next(); {
			// Copy the entry into the cached addressable values so that
			// the key and value hashers can be handed pointers to them.
			k.SetIterKey(iter)
			e.SetIterValue(iter)
			mh.h.reset()
			hashKey(&mh.h, pointerOf(k.Addr()))
			hashValue(&mh.h, pointerOf(e.Addr()))
			mh.sum.xor(mh.h.sum())
		}
		h.HashBytes(mh.sum.sum[:])
	}
}

// makePointerHasher returns a hasher for pointer type t.
//
// A nil pointer is hashed as the single byte 0. If t is recursive and the
// pointee is already on the visit stack, the hasher emits the byte 2 and the
// pointee's index on the stack. Otherwise it emits the byte 1 followed by the
// hash of the pointed-to value.
func makePointerHasher(t reflect.Type) typeHasherFunc {
	var (
		resolveOnce sync.Once
		hashElem    typeHasherFunc
		trackCycles bool
	)
	resolve := func() {
		hashElem = lookupTypeHasher(t.Elem())
		trackCycles = typeIsRecursive(t)
	}

	return func(h *hasher, p pointer) {
		target := p.pointerElem()
		if target.isNil() {
			h.HashUint8(0) // indicates nil
			return
		}
		resolveOnce.Do(resolve)
		if trackCycles {
			idx, onStack := h.visitStack.seen(target.p)
			if onStack {
				h.HashUint8(2) // indicates cycle
				h.HashUint64(uint64(idx))
				return
			}
			// Keep the pointee on the stack for as long as it is being hashed.
			h.visitStack.push(target.p)
			defer h.visitStack.pop(target.p)
		}
		h.HashUint8(1) // indicates visiting a pointer element
		hashElem(h, target)
	}
}

// makeInterfaceHasher returns a hasher for interface type t.
//
// A nil interface is hashed as the single byte 0. Otherwise the hasher emits
// the byte 1, the hash of the dynamic type, and the hash of the dynamic value.
func makeInterfaceHasher(t reflect.Type) typeHasherFunc {
	return func(h *hasher, p pointer) {
		iface := p.asValue(t).Elem() // reflect.Interface kind
		if iface.IsNil() {
			h.HashUint8(0) // indicates nil
			return
		}
		h.HashUint8(1) // indicates visiting an interface value

		concrete := iface.Elem()
		dynType := concrete.Type()
		h.hashType(dynType)

		// Copy the dynamic value into a freshly allocated, addressable
		// value so that a pointer to it can be handed to the hasher.
		boxed := reflect.New(dynType).Elem()
		boxed.Set(concrete)

		hashDyn := lookupTypeHasher(dynType)
		hashDyn(h, pointerOf(boxed.Addr()))
	}
}

// mapHasher is reusable scratch state used by the hasher returned from
// makeMapHasher while it hashes the entries of one map.
// Get one via mapHasherPool.
type mapHasher struct {
	// h hashes a single map entry; it is reset before every entry.
	h       hasher
	// valKey and valElem cache addressable values, keyed by type, that
	// receive each entry's key and element during iteration.
	valKey  valueCache
	valElem valueCache
	// sum accumulates the XOR of the per-entry hashes.
	sum     Sum
}

// mapHasherPool recycles mapHasher values between map hashing calls.
var mapHasherPool = &sync.Pool{
	New: func() any { return new(mapHasher) },
}

// valueCache holds one addressable reflect.Value per type so that the value
// can be reused instead of allocated again.
type valueCache map[reflect.Type]reflect.Value

// get returns an addressable reflect.Value for the given type.
// The value is created on the first request for t and the same value is
// returned on later requests. The map itself is allocated lazily, so a nil
// valueCache is ready to use.
func (c *valueCache) get(t reflect.Type) reflect.Value {
	if cached, found := (*c)[t]; found {
		return cached
	}
	fresh := reflect.New(t).Elem()
	if *c == nil {
		*c = make(valueCache)
	}
	(*c)[t] = fresh
	return fresh
}