validation.go

     1  // Copyright 2023 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package testtrace
     6  
     7  import (
     8  	"errors"
     9  	"fmt"
    10  	"internal/trace"
    11  	"internal/trace/version"
    12  	"slices"
    13  	"strings"
    14  )
    15  
    16  // Validator is a type used for validating a stream of trace.Events.
    17  type Validator struct {
    18  	lastTs    trace.Time
    19  	gs        map[trace.GoID]*goState
    20  	ps        map[trace.ProcID]*procState
    21  	ms        map[trace.ThreadID]*schedContext
    22  	ranges    map[trace.ResourceID][]string
    23  	tasks     map[trace.TaskID]string
    24  	lastSync  trace.Sync
    25  	GoVersion version.Version
    26  
    27  	// Flags to modify validation behavior.
    28  	skipClockSnapshotChecks bool // Some platforms can't guarantee a monotonically increasing clock reading.
    29  }
    30  
    31  type schedContext struct {
    32  	M trace.ThreadID
    33  	P trace.ProcID
    34  	G trace.GoID
    35  }
    36  
    37  type goState struct {
    38  	state   trace.GoState
    39  	binding *schedContext
    40  }
    41  
    42  type procState struct {
    43  	state   trace.ProcState
    44  	binding *schedContext
    45  }
    46  
    47  // NewValidator creates a new Validator.
    48  func NewValidator() *Validator {
    49  	return &Validator{
    50  		gs:        make(map[trace.GoID]*goState),
    51  		ps:        make(map[trace.ProcID]*procState),
    52  		ms:        make(map[trace.ThreadID]*schedContext),
    53  		ranges:    make(map[trace.ResourceID][]string),
    54  		tasks:     make(map[trace.TaskID]string),
    55  		GoVersion: version.Current,
    56  	}
    57  }
    58  
    59  // SkipClockSnapshotChecks causes the validator to skip checks on the clock snapshots.
    60  //
    61  // Some platforms like Windows, with a small enough trace period, are unable to produce
    62  // monotonically increasing timestamps due to very coarse clock granularity.
    63  func (v *Validator) SkipClockSnapshotChecks() {
    64  	v.skipClockSnapshotChecks = true
    65  }
    66  
    67  // Event validates ev as the next event in a stream of trace.Events.
    68  //
    69  // Returns an error if validation fails.
    70  func (v *Validator) Event(ev trace.Event) error {
    71  	e := new(errAccumulator)
    72  
    73  	// Validate timestamp order.
    74  	if v.lastTs != 0 {
    75  		if ev.Time() <= v.lastTs {
    76  			e.Errorf("timestamp out-of-order (want > %v) for %+v", v.lastTs, ev)
    77  		} else {
    78  			v.lastTs = ev.Time()
    79  		}
    80  	} else {
    81  		v.lastTs = ev.Time()
    82  	}
    83  
    84  	// Validate event stack.
    85  	checkStack(e, ev.Stack())
    86  
    87  	switch ev.Kind() {
    88  	case trace.EventSync:
    89  		s := ev.Sync()
    90  		if s.N != v.lastSync.N+1 {
    91  			e.Errorf("sync count is not sequential: expected %d, got %d", v.lastSync.N+1, s.N)
    92  		}
    93  		// The trace reader currently emits synthetic sync events at the end of
    94  		// a trace. Those don't contain clock snapshots data, so we don't try
    95  		// to validate them.
    96  		//
    97  		// TODO(felixge): Drop the synthetic syncs as discussed in CL 653576.
    98  		if v.GoVersion >= version.Go125 && !(s.N > 1 && s.ClockSnapshot == nil) {
    99  			if s.ClockSnapshot == nil {
   100  				e.Errorf("sync %d has no clock snapshot", s.N)
   101  			} else {
   102  				if s.ClockSnapshot.Wall.IsZero() {
   103  					e.Errorf("sync %d has zero wall time", s.N)
   104  				}
   105  				if s.ClockSnapshot.Mono == 0 {
   106  					e.Errorf("sync %d has zero mono time", s.N)
   107  				}
   108  				if s.ClockSnapshot.Trace == 0 {
   109  					e.Errorf("sync %d has zero trace time", s.N)
   110  				}
   111  				if !v.skipClockSnapshotChecks {
   112  					if s.N >= 2 && !s.ClockSnapshot.Wall.After(v.lastSync.ClockSnapshot.Wall) {
   113  						e.Errorf("sync %d has non-increasing wall time: %v vs %v", s.N, s.ClockSnapshot.Wall, v.lastSync.ClockSnapshot.Wall)
   114  					}
   115  					if s.N >= 2 && !(s.ClockSnapshot.Mono > v.lastSync.ClockSnapshot.Mono) {
   116  						e.Errorf("sync %d has non-increasing mono time: %v vs %v", s.N, s.ClockSnapshot.Mono, v.lastSync.ClockSnapshot.Mono)
   117  					}
   118  					if s.N >= 2 && !(s.ClockSnapshot.Trace > v.lastSync.ClockSnapshot.Trace) {
   119  						e.Errorf("sync %d has non-increasing trace time: %v vs %v", s.N, s.ClockSnapshot.Trace, v.lastSync.ClockSnapshot.Trace)
   120  					}
   121  				}
   122  			}
   123  		}
   124  		v.lastSync = s
   125  	case trace.EventMetric:
   126  		m := ev.Metric()
   127  		if !strings.Contains(m.Name, ":") {
   128  			// Should have a ":" as per runtime/metrics convention.
   129  			e.Errorf("invalid metric name %q", m.Name)
   130  		}
   131  		// Make sure the value is OK.
   132  		if m.Value.Kind() == trace.ValueBad {
   133  			e.Errorf("invalid value")
   134  		}
   135  		switch m.Value.Kind() {
   136  		case trace.ValueUint64:
   137  			// Just make sure it doesn't panic.
   138  			_ = m.Value.Uint64()
   139  		}
   140  	case trace.EventLabel:
   141  		l := ev.Label()
   142  
   143  		// Check label.
   144  		if l.Label == "" {
   145  			e.Errorf("invalid label %q", l.Label)
   146  		}
   147  
   148  		// Check label resource.
   149  		if l.Resource.Kind == trace.ResourceNone {
   150  			e.Errorf("label resource none")
   151  		}
   152  		switch l.Resource.Kind {
   153  		case trace.ResourceGoroutine:
   154  			id := l.Resource.Goroutine()
   155  			if _, ok := v.gs[id]; !ok {
   156  				e.Errorf("label for invalid goroutine %d", id)
   157  			}
   158  		case trace.ResourceProc:
   159  			id := l.Resource.Proc()
   160  			if _, ok := v.ps[id]; !ok {
   161  				e.Errorf("label for invalid proc %d", id)
   162  			}
   163  		case trace.ResourceThread:
   164  			id := l.Resource.Thread()
   165  			if _, ok := v.ms[id]; !ok {
   166  				e.Errorf("label for invalid thread %d", id)
   167  			}
   168  		}
   169  	case trace.EventStackSample:
   170  		// Not much to check here. It's basically a sched context and a stack.
   171  		// The sched context is also not guaranteed to align with other events.
   172  		// We already checked the stack above.
   173  	case trace.EventStateTransition:
   174  		// Validate state transitions.
   175  		//
   176  		// TODO(mknyszek): A lot of logic is duplicated between goroutines and procs.
   177  		// The two are intentionally handled identically; from the perspective of the
   178  		// API, resources all have the same general properties. Consider making this
   179  		// code generic over resources and implementing validation just once.
   180  		tr := ev.StateTransition()
   181  		checkStack(e, tr.Stack)
   182  		switch tr.Resource.Kind {
   183  		case trace.ResourceGoroutine:
   184  			// Basic state transition validation.
   185  			id := tr.Resource.Goroutine()
   186  			old, new := tr.Goroutine()
   187  			if new == trace.GoUndetermined {
   188  				e.Errorf("transition to undetermined state for goroutine %d", id)
   189  			}
   190  			if v.lastSync.N > 1 && old == trace.GoUndetermined {
   191  				e.Errorf("undetermined goroutine %d after first global sync", id)
   192  			}
   193  			if new == trace.GoNotExist && v.hasAnyRange(trace.MakeResourceID(id)) {
   194  				e.Errorf("goroutine %d died with active ranges", id)
   195  			}
   196  			state, ok := v.gs[id]
   197  			if ok {
   198  				if old != state.state {
   199  					e.Errorf("bad old state for goroutine %d: got %s, want %s", id, old, state.state)
   200  				}
   201  				state.state = new
   202  			} else {
   203  				if old != trace.GoUndetermined && old != trace.GoNotExist {
   204  					e.Errorf("bad old state for unregistered goroutine %d: %s", id, old)
   205  				}
   206  				state = &goState{state: new}
   207  				v.gs[id] = state
   208  			}
   209  			// Validate sched context.
   210  			if new.Executing() {
   211  				ctx := v.getOrCreateThread(e, ev, ev.Thread())
   212  				if ctx != nil {
   213  					if ctx.G != trace.NoGoroutine && ctx.G != id {
   214  						e.Errorf("tried to run goroutine %d when one was already executing (%d) on thread %d", id, ctx.G, ev.Thread())
   215  					}
   216  					ctx.G = id
   217  					state.binding = ctx
   218  				}
   219  			} else if old.Executing() && !new.Executing() {
   220  				if tr.Stack != ev.Stack() {
   221  					// This is a case where the transition is happening to a goroutine that is also executing, so
   222  					// these two stacks should always match.
   223  					e.Errorf("StateTransition.Stack doesn't match Event.Stack")
   224  				}
   225  				ctx := state.binding
   226  				if ctx != nil {
   227  					if ctx.G != id {
   228  						e.Errorf("tried to stop goroutine %d when it wasn't currently executing (currently executing %d) on thread %d", id, ctx.G, ev.Thread())
   229  					}
   230  					ctx.G = trace.NoGoroutine
   231  					state.binding = nil
   232  				} else {
   233  					e.Errorf("stopping goroutine %d not bound to any active context", id)
   234  				}
   235  			}
   236  		case trace.ResourceProc:
   237  			// Basic state transition validation.
   238  			id := tr.Resource.Proc()
   239  			old, new := tr.Proc()
   240  			if new == trace.ProcUndetermined {
   241  				e.Errorf("transition to undetermined state for proc %d", id)
   242  			}
   243  			if v.lastSync.N > 1 && old == trace.ProcUndetermined {
   244  				e.Errorf("undetermined proc %d after first global sync", id)
   245  			}
   246  			if new == trace.ProcNotExist && v.hasAnyRange(trace.MakeResourceID(id)) {
   247  				e.Errorf("proc %d died with active ranges", id)
   248  			}
   249  			state, ok := v.ps[id]
   250  			if ok {
   251  				if old != state.state {
   252  					e.Errorf("bad old state for proc %d: got %s, want %s", id, old, state.state)
   253  				}
   254  				state.state = new
   255  			} else {
   256  				if old != trace.ProcUndetermined && old != trace.ProcNotExist {
   257  					e.Errorf("bad old state for unregistered proc %d: %s", id, old)
   258  				}
   259  				state = &procState{state: new}
   260  				v.ps[id] = state
   261  			}
   262  			// Validate sched context.
   263  			if new.Executing() {
   264  				ctx := v.getOrCreateThread(e, ev, ev.Thread())
   265  				if ctx != nil {
   266  					if ctx.P != trace.NoProc && ctx.P != id {
   267  						e.Errorf("tried to run proc %d when one was already executing (%d) on thread %d", id, ctx.P, ev.Thread())
   268  					}
   269  					ctx.P = id
   270  					state.binding = ctx
   271  				}
   272  			} else if old.Executing() && !new.Executing() {
   273  				ctx := state.binding
   274  				if ctx != nil {
   275  					if ctx.P != id {
   276  						e.Errorf("tried to stop proc %d when it wasn't currently executing (currently executing %d) on thread %d", id, ctx.P, ctx.M)
   277  					}
   278  					ctx.P = trace.NoProc
   279  					state.binding = nil
   280  				} else {
   281  					e.Errorf("stopping proc %d not bound to any active context", id)
   282  				}
   283  			}
   284  		}
   285  	case trace.EventRangeBegin, trace.EventRangeActive, trace.EventRangeEnd:
   286  		// Validate ranges.
   287  		r := ev.Range()
   288  		switch ev.Kind() {
   289  		case trace.EventRangeBegin:
   290  			if v.hasRange(r.Scope, r.Name) {
   291  				e.Errorf("already active range %q on %v begun again", r.Name, r.Scope)
   292  			}
   293  			v.addRange(r.Scope, r.Name)
   294  		case trace.EventRangeActive:
   295  			if !v.hasRange(r.Scope, r.Name) {
   296  				v.addRange(r.Scope, r.Name)
   297  			}
   298  		case trace.EventRangeEnd:
   299  			if !v.hasRange(r.Scope, r.Name) {
   300  				e.Errorf("inactive range %q on %v ended", r.Name, r.Scope)
   301  			}
   302  			v.deleteRange(r.Scope, r.Name)
   303  		}
   304  	case trace.EventTaskBegin:
   305  		// Validate task begin.
   306  		t := ev.Task()
   307  		if t.ID == trace.NoTask || t.ID == trace.BackgroundTask {
   308  			// The background task should never have an event emitted for it.
   309  			e.Errorf("found invalid task ID for task of type %s", t.Type)
   310  		}
   311  		if t.Parent == trace.BackgroundTask {
   312  			// It's not possible for a task to be a subtask of the background task.
   313  			e.Errorf("found background task as the parent for task of type %s", t.Type)
   314  		}
   315  		// N.B. Don't check the task type. Empty string is a valid task type.
   316  		v.tasks[t.ID] = t.Type
   317  	case trace.EventTaskEnd:
   318  		// Validate task end.
   319  		// We can see a task end without a begin, so ignore a task without information.
   320  		// Instead, if we've seen the task begin, just make sure the task end lines up.
   321  		t := ev.Task()
   322  		if typ, ok := v.tasks[t.ID]; ok {
   323  			if t.Type != typ {
   324  				e.Errorf("task end type %q doesn't match task start type %q for task %d", t.Type, typ, t.ID)
   325  			}
   326  			delete(v.tasks, t.ID)
   327  		}
   328  	case trace.EventLog:
   329  		// There's really not much here to check, except that we can
   330  		// generate a Log. The category and message are entirely user-created,
   331  		// so we can't make any assumptions as to what they are. We also
   332  		// can't validate the task, because proving the task's existence is very
   333  		// much best-effort.
   334  		_ = ev.Log()
   335  	}
   336  	return e.Errors()
   337  }
   338  
   339  func (v *Validator) hasRange(r trace.ResourceID, name string) bool {
   340  	ranges, ok := v.ranges[r]
   341  	return ok && slices.Contains(ranges, name)
   342  }
   343  
   344  func (v *Validator) addRange(r trace.ResourceID, name string) {
   345  	ranges, _ := v.ranges[r]
   346  	ranges = append(ranges, name)
   347  	v.ranges[r] = ranges
   348  }
   349  
   350  func (v *Validator) hasAnyRange(r trace.ResourceID) bool {
   351  	ranges, ok := v.ranges[r]
   352  	return ok && len(ranges) != 0
   353  }
   354  
   355  func (v *Validator) deleteRange(r trace.ResourceID, name string) {
   356  	ranges, ok := v.ranges[r]
   357  	if !ok {
   358  		return
   359  	}
   360  	i := slices.Index(ranges, name)
   361  	if i < 0 {
   362  		return
   363  	}
   364  	v.ranges[r] = slices.Delete(ranges, i, i+1)
   365  }
   366  
   367  func (v *Validator) getOrCreateThread(e *errAccumulator, ev trace.Event, m trace.ThreadID) *schedContext {
   368  	lenient := func() bool {
   369  		// Be lenient about GoUndetermined -> GoSyscall transitions if they
   370  		// originate from an old trace. These transitions lack thread
   371  		// information in trace formats older than 1.22.
   372  		if v.GoVersion >= version.Go122 {
   373  			return false
   374  		}
   375  		if ev.Kind() != trace.EventStateTransition {
   376  			return false
   377  		}
   378  		tr := ev.StateTransition()
   379  		if tr.Resource.Kind != trace.ResourceGoroutine {
   380  			return false
   381  		}
   382  		from, to := tr.Goroutine()
   383  		return from == trace.GoUndetermined && to == trace.GoSyscall
   384  	}
   385  	if m == trace.NoThread && !lenient() {
   386  		e.Errorf("must have thread, but thread ID is none")
   387  		return nil
   388  	}
   389  	s, ok := v.ms[m]
   390  	if !ok {
   391  		s = &schedContext{M: m, P: trace.NoProc, G: trace.NoGoroutine}
   392  		v.ms[m] = s
   393  		return s
   394  	}
   395  	return s
   396  }
   397  
   398  func checkStack(e *errAccumulator, stk trace.Stack) {
   399  	// Check for non-empty values, but we also check for crashes due to incorrect validation.
   400  	for i, f := range slices.Collect(stk.Frames()) {
   401  		if i == 0 {
   402  			// Allow for one fully zero stack.
   403  			//
   404  			// TODO(mknyszek): Investigate why that happens.
   405  			continue
   406  		}
   407  		if f.Func == "" || f.File == "" || f.PC == 0 || f.Line == 0 {
   408  			e.Errorf("invalid stack frame %#v: missing information", f)
   409  		}
   410  	}
   411  }
   412  
   413  type errAccumulator struct {
   414  	errs []error
   415  }
   416  
   417  func (e *errAccumulator) Errorf(f string, args ...any) {
   418  	e.errs = append(e.errs, fmt.Errorf(f, args...))
   419  }
   420  
   421  func (e *errAccumulator) Errors() error {
   422  	return errors.Join(e.errs...)
   423  }
   424
View as plain text