2023-03-08 00:22:23 +00:00
|
|
|
// Copyright (c) Tailscale Inc & AUTHORS
|
|
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
|
2023-03-29 04:27:52 +01:00
|
|
|
// Package sockstatlog provides a logger for capturing network socket stats for debugging.
|
|
|
|
// Stats are collected at a frequency of 10 Hz and logged to disk.
|
|
|
|
// Stats are only uploaded to the log server on demand.
|
2023-03-08 00:22:23 +00:00
|
|
|
package sockstatlog
|
|
|
|
|
|
|
|
import (
|
2023-03-13 23:27:41 +00:00
|
|
|
"context"
|
2023-03-22 20:51:29 +00:00
|
|
|
"crypto/sha256"
|
2023-03-08 00:22:23 +00:00
|
|
|
"encoding/json"
|
2023-03-10 22:33:26 +00:00
|
|
|
"io"
|
2023-03-22 20:51:29 +00:00
|
|
|
"net/http"
|
2023-03-08 00:22:23 +00:00
|
|
|
"os"
|
|
|
|
"path/filepath"
|
2023-03-27 22:34:56 +01:00
|
|
|
"sync/atomic"
|
2023-03-08 00:22:23 +00:00
|
|
|
"time"
|
|
|
|
|
2023-03-22 20:51:29 +00:00
|
|
|
"tailscale.com/logpolicy"
|
|
|
|
"tailscale.com/logtail"
|
2023-03-08 00:22:23 +00:00
|
|
|
"tailscale.com/logtail/filch"
|
2023-04-18 00:01:41 +01:00
|
|
|
"tailscale.com/net/netmon"
|
2023-03-08 00:22:23 +00:00
|
|
|
"tailscale.com/net/sockstats"
|
2023-03-10 22:33:26 +00:00
|
|
|
"tailscale.com/types/logger"
|
2023-03-22 20:51:29 +00:00
|
|
|
"tailscale.com/types/logid"
|
2023-03-08 00:22:23 +00:00
|
|
|
"tailscale.com/util/mak"
|
|
|
|
)
|
|
|
|
|
2023-03-29 04:27:52 +01:00
|
|
|
// pollInterval specifies how often to poll for socket stats.
|
|
|
|
const pollInterval = time.Second / 10
|
|
|
|
|
|
|
|
// logInterval specifies how often to log sockstat events to disk.
|
|
|
|
// This delay is added to prevent an infinite loop when logs are uploaded,
|
|
|
|
// which itself creates additional sockstat events.
|
2023-04-20 18:53:02 +01:00
|
|
|
const logInterval = 10 * time.Second
|
2023-03-08 00:22:23 +00:00
|
|
|
|
2023-04-12 23:17:37 +01:00
|
|
|
// maxLogFileSize specifies the maximum size of a log file before it is rotated.
|
|
|
|
// Our logs are fairly compact, and we are mostly only looking at a few hours of data.
|
|
|
|
// Combined with the fact that these are often uploaded over cellular connections,
|
|
|
|
// we keep this relatively small.
|
|
|
|
const maxLogFileSize = 5 << 20 // 5 MB
|
|
|
|
|
2023-03-08 00:22:23 +00:00
|
|
|
// Logger logs statistics about network sockets.
|
|
|
|
type Logger struct {
|
2023-03-29 04:27:52 +01:00
|
|
|
// enabled identifies whether the logger is enabled.
|
2023-03-27 22:34:56 +01:00
|
|
|
enabled atomic.Bool
|
|
|
|
|
2023-03-13 23:27:41 +00:00
|
|
|
ctx context.Context
|
|
|
|
cancelFn context.CancelFunc
|
|
|
|
|
2023-03-29 04:27:52 +01:00
|
|
|
// eventCh is used to pass events from the poller to the logger.
|
|
|
|
eventCh chan event
|
|
|
|
|
|
|
|
logf logger.Logf
|
2023-03-22 20:51:29 +00:00
|
|
|
|
2023-03-29 04:27:52 +01:00
|
|
|
// logger is the underlying logtail logger than manages log files on disk
|
|
|
|
// and uploading to the log server.
|
2023-03-22 20:51:29 +00:00
|
|
|
logger *logtail.Logger
|
2023-03-24 01:54:22 +00:00
|
|
|
filch *filch.Filch
|
2023-03-24 04:10:49 +00:00
|
|
|
tr http.RoundTripper
|
2023-03-08 00:22:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// deltaStat represents the bytes transferred during a time period.
|
|
|
|
// The first element is transmitted bytes, the second element is received bytes.
|
|
|
|
type deltaStat [2]uint64
|
|
|
|
|
|
|
|
// event represents the socket stats on a specific interface during a time period.
|
|
|
|
type event struct {
|
|
|
|
// Time is when the event started as a Unix timestamp in milliseconds.
|
|
|
|
Time int64 `json:"t"`
|
|
|
|
|
|
|
|
// Duration is the duration of this event in milliseconds.
|
|
|
|
Duration int64 `json:"d"`
|
|
|
|
|
|
|
|
// IsCellularInterface is set to 1 if the traffic was sent over a cellular interface.
|
|
|
|
IsCellularInterface int `json:"c,omitempty"`
|
|
|
|
|
|
|
|
// Stats records the stats for each Label during the time period.
|
|
|
|
Stats map[sockstats.Label]deltaStat `json:"s"`
|
|
|
|
}
|
|
|
|
|
2023-03-22 20:51:29 +00:00
|
|
|
// SockstatLogID reproducibly derives a new logid.PrivateID for sockstat logging from a node's public backend log ID.
|
2023-03-23 17:49:56 +00:00
|
|
|
// The returned PrivateID is the sha256 sum of logID + "sockstat".
|
2023-03-22 20:51:29 +00:00
|
|
|
// If a node's public log ID becomes known, it is trivial to spoof sockstat logs for that node.
|
2023-04-20 18:53:02 +01:00
|
|
|
// Given that this is just for debugging, we're not too concerned about that.
|
2023-03-23 17:49:56 +00:00
|
|
|
func SockstatLogID(logID logid.PublicID) logid.PrivateID {
|
|
|
|
return logid.PrivateID(sha256.Sum256([]byte(logID.String() + "sockstat")))
|
2023-03-22 20:51:29 +00:00
|
|
|
}
|
|
|
|
|
2023-03-08 00:22:23 +00:00
|
|
|
// NewLogger returns a new Logger that will store stats in logdir.
|
|
|
|
// On platforms that do not support sockstat logging, a nil Logger will be returned.
|
2023-03-27 22:34:56 +01:00
|
|
|
// The returned Logger is not yet enabled, and must be shut down with Shutdown when it is no longer needed.
|
2023-03-29 04:27:52 +01:00
|
|
|
// Logs will be uploaded to the log server using a new log ID derived from the provided backend logID.
|
2023-04-18 00:01:41 +01:00
|
|
|
// The netMon parameter is optional; if non-nil it's used to do faster interface lookups.
|
|
|
|
func NewLogger(logdir string, logf logger.Logf, logID logid.PublicID, netMon *netmon.Monitor) (*Logger, error) {
|
2023-03-08 00:22:23 +00:00
|
|
|
if !sockstats.IsAvailable {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if err := os.MkdirAll(logdir, 0755); err != nil && !os.IsExist(err) {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
filchPrefix := filepath.Join(logdir, "sockstats")
|
2023-04-12 23:17:37 +01:00
|
|
|
filch, err := filch.New(filchPrefix, filch.Options{
|
|
|
|
MaxFileSize: maxLogFileSize,
|
|
|
|
ReplaceStderr: false,
|
|
|
|
})
|
2023-03-08 00:22:23 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2023-03-13 23:50:37 +00:00
|
|
|
logger := &Logger{
|
2023-03-29 04:27:52 +01:00
|
|
|
logf: logf,
|
|
|
|
filch: filch,
|
2023-07-10 20:45:57 +01:00
|
|
|
tr: logpolicy.NewLogtailTransport(logtail.DefaultHost, netMon, logf),
|
2023-03-13 23:50:37 +00:00
|
|
|
}
|
2023-03-22 20:51:29 +00:00
|
|
|
logger.logger = logtail.NewLogger(logtail.Config{
|
2024-03-21 19:20:38 +00:00
|
|
|
BaseURL: logpolicy.LogURL(),
|
|
|
|
PrivateID: SockstatLogID(logID),
|
|
|
|
Collection: "sockstats.log.tailscale.io",
|
|
|
|
Buffer: filch,
|
|
|
|
CompressLogs: true,
|
2023-03-22 20:51:29 +00:00
|
|
|
FlushDelayFn: func() time.Duration {
|
|
|
|
// set flush delay to 100 years so it never flushes automatically
|
|
|
|
return 100 * 365 * 24 * time.Hour
|
|
|
|
},
|
|
|
|
Stderr: io.Discard, // don't log to stderr
|
|
|
|
|
2023-03-24 01:54:22 +00:00
|
|
|
HTTPC: &http.Client{Transport: logger.tr},
|
2023-03-22 20:51:29 +00:00
|
|
|
}, logf)
|
2023-03-29 22:28:30 +01:00
|
|
|
logger.logger.SetSockstatsLabel(sockstats.LabelSockstatlogLogger)
|
2023-03-13 23:50:37 +00:00
|
|
|
|
|
|
|
return logger, nil
|
2023-03-08 00:22:23 +00:00
|
|
|
}
|
|
|
|
|
2023-03-27 22:34:56 +01:00
|
|
|
// SetLoggingEnabled enables or disables logging.
|
|
|
|
// When disabled, socket stats are not polled and no new logs are written to disk.
|
|
|
|
// Existing logs can still be fetched via the C2N API.
|
|
|
|
func (l *Logger) SetLoggingEnabled(v bool) {
|
|
|
|
old := l.enabled.Load()
|
|
|
|
if old != v && l.enabled.CompareAndSwap(old, v) {
|
|
|
|
if v {
|
2023-03-29 04:27:52 +01:00
|
|
|
if l.eventCh == nil {
|
|
|
|
// eventCh should be large enough for the number of events that will occur within logInterval.
|
|
|
|
// Add an extra second's worth of events to ensure we don't drop any.
|
|
|
|
l.eventCh = make(chan event, (logInterval+time.Second)/pollInterval)
|
|
|
|
}
|
2023-03-27 22:34:56 +01:00
|
|
|
l.ctx, l.cancelFn = context.WithCancel(context.Background())
|
|
|
|
go l.poll()
|
2023-03-29 04:27:52 +01:00
|
|
|
go l.logEvents()
|
2023-03-27 22:34:56 +01:00
|
|
|
} else {
|
|
|
|
l.cancelFn()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-24 04:10:49 +00:00
|
|
|
func (l *Logger) Write(p []byte) (int, error) {
|
|
|
|
return l.logger.Write(p)
|
|
|
|
}
|
|
|
|
|
2023-03-08 00:22:23 +00:00
|
|
|
// poll fetches the current socket stats at the configured time interval,
|
2023-03-29 04:27:52 +01:00
|
|
|
// calculates the delta since the last poll,
|
|
|
|
// and writes any non-zero values to the logger event channel.
|
2023-03-08 00:22:23 +00:00
|
|
|
// This method does not return.
|
|
|
|
func (l *Logger) poll() {
|
|
|
|
// last is the last set of socket stats we saw.
|
|
|
|
var lastStats *sockstats.SockStats
|
|
|
|
var lastTime time.Time
|
|
|
|
|
2023-03-29 04:27:52 +01:00
|
|
|
ticker := time.NewTicker(pollInterval)
|
2023-03-13 23:27:41 +00:00
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-l.ctx.Done():
|
2023-03-29 04:27:52 +01:00
|
|
|
ticker.Stop()
|
2023-03-13 23:27:41 +00:00
|
|
|
return
|
2023-03-29 04:27:52 +01:00
|
|
|
case t := <-ticker.C:
|
2023-03-13 23:27:41 +00:00
|
|
|
stats := sockstats.Get()
|
|
|
|
if lastStats != nil {
|
|
|
|
diffstats := delta(lastStats, stats)
|
|
|
|
if len(diffstats) > 0 {
|
|
|
|
e := event{
|
|
|
|
Time: lastTime.UnixMilli(),
|
|
|
|
Duration: t.Sub(lastTime).Milliseconds(),
|
|
|
|
Stats: diffstats,
|
|
|
|
}
|
|
|
|
if stats.CurrentInterfaceCellular {
|
|
|
|
e.IsCellularInterface = 1
|
|
|
|
}
|
2023-03-29 04:27:52 +01:00
|
|
|
l.eventCh <- e
|
2023-03-10 22:33:26 +00:00
|
|
|
}
|
2023-03-08 00:22:23 +00:00
|
|
|
}
|
2023-03-13 23:27:41 +00:00
|
|
|
lastTime = t
|
|
|
|
lastStats = stats
|
2023-03-08 00:22:23 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-29 04:27:52 +01:00
|
|
|
// logEvents reads events from the event channel at logInterval and logs them to disk.
|
|
|
|
// This method does not return.
|
|
|
|
func (l *Logger) logEvents() {
|
|
|
|
enc := json.NewEncoder(l)
|
|
|
|
flush := func() {
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case e := <-l.eventCh:
|
|
|
|
if err := enc.Encode(e); err != nil {
|
|
|
|
l.logf("sockstatlog: error encoding log: %v", err)
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ticker := time.NewTicker(logInterval)
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-l.ctx.Done():
|
|
|
|
ticker.Stop()
|
|
|
|
return
|
|
|
|
case <-ticker.C:
|
|
|
|
flush()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-22 20:51:29 +00:00
|
|
|
func (l *Logger) LogID() string {
|
|
|
|
if l.logger == nil {
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
return l.logger.PrivateID().Public().String()
|
2023-03-08 00:22:23 +00:00
|
|
|
}
|
|
|
|
|
2023-03-22 20:51:29 +00:00
|
|
|
// Flush sends pending logs to the log server and flushes them from the local buffer.
|
|
|
|
func (l *Logger) Flush() {
|
|
|
|
l.logger.StartFlush()
|
|
|
|
}
|
2023-03-10 22:33:26 +00:00
|
|
|
|
2023-03-22 20:51:29 +00:00
|
|
|
func (l *Logger) Shutdown() {
|
2023-03-27 22:34:56 +01:00
|
|
|
if l.cancelFn != nil {
|
|
|
|
l.cancelFn()
|
|
|
|
}
|
2023-03-24 01:54:22 +00:00
|
|
|
l.filch.Close()
|
2023-03-24 04:10:49 +00:00
|
|
|
l.logger.Shutdown(context.Background())
|
|
|
|
|
|
|
|
type closeIdler interface {
|
|
|
|
CloseIdleConnections()
|
|
|
|
}
|
|
|
|
if tr, ok := l.tr.(closeIdler); ok {
|
|
|
|
tr.CloseIdleConnections()
|
|
|
|
}
|
2023-03-10 22:33:26 +00:00
|
|
|
}
|
|
|
|
|
2023-03-08 00:22:23 +00:00
|
|
|
// delta calculates the delta stats between two SockStats snapshots.
|
|
|
|
// b is assumed to have occurred after a.
|
|
|
|
// Zero values are omitted from the returned map, and an empty map is returned if no bytes were transferred.
|
|
|
|
func delta(a, b *sockstats.SockStats) (stats map[sockstats.Label]deltaStat) {
|
|
|
|
if a == nil || b == nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
for label, bs := range b.Stats {
|
|
|
|
as := a.Stats[label]
|
|
|
|
if as.TxBytes == bs.TxBytes && as.RxBytes == bs.RxBytes {
|
|
|
|
// fast path for unchanged stats
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
mak.Set(&stats, label, deltaStat{bs.TxBytes - as.TxBytes, bs.RxBytes - as.RxBytes})
|
|
|
|
}
|
|
|
|
return stats
|
|
|
|
}
|