Source file src/runtime/trace2.go

// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.exectracer2

// Go execution tracer.
// The tracer captures a wide range of execution events like goroutine
// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
// changes of heap size, processor start/stop, etc., and writes them to a buffer
// in a compact form. A nanosecond-precision timestamp and a stack trace are
// captured for most events.
//
// Tracer invariants (which keep the synchronization sound):
// - An m that has a trace buffer must be on either the allm or sched.freem lists.
// - Any trace buffer mutation must either be happening in traceAdvance or between
//   a traceAcquire and a subsequent traceRelease.
// - traceAdvance cannot return until the previous generation's buffers are all flushed.
//
// See https://go.dev/issue/60773 for a link to the full design.

package runtime

import (
	"runtime/internal/atomic"
	"unsafe"
)

// Trace state.

// trace is global tracing context.
var trace struct {
	// trace.lock must only be acquired on the system stack where
	// stack splits cannot happen while it is held.
	lock mutex

	// Trace buffer management.
	//
	// First we check the empty list for any free buffers. If there are none,
	// buffers are allocated directly from the OS. Once they're filled up and/or
	// flushed, they end up on the full queue for trace.gen%2.
	//
	// The trace reader takes buffers off the full list one-by-one and
	// places them into reading until they're finished being read from.
	// Then they're placed onto the empty list.
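	//
	// In short (a sketch of the flow described above, not a formal state
	// machine), each buffer cycles through:
	//
	//	empty -> owned by an M -> full[gen%2] -> reading -> empty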
	//
	// Protected by trace.lock.
	reading       *traceBuf // buffer currently handed off to user
	empty         *traceBuf // stack of empty buffers
	full          [2]traceBufQueue
	workAvailable atomic.Bool

	// State for the trace reader goroutine.
	//
	// Protected by trace.lock.
	readerGen     atomic.Uintptr // the generation the reader is currently reading for
	flushedGen    atomic.Uintptr // the last completed generation
	headerWritten bool           // whether ReadTrace has emitted the trace header

	// doneSema is used to synchronize the reader and traceAdvance. Specifically,
	// it notifies traceAdvance that the reader is done with a generation.
	// Both semaphores are 0 by default (so, acquires block). traceAdvance
	// attempts to acquire for gen%2 after flushing the last buffers for gen.
	// Meanwhile the reader releases the sema for gen%2 when it has finished
	// processing gen.
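	//
	// Concretely, mirroring the calls made later in this file: traceAdvance
	// does semacquire(&trace.doneSema[gen%2]) once gen's buffers are flushed,
	// and the reader does semrelease(&trace.doneSema[gen%2]) when it has
	// finished reading everything for gen.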
	doneSema [2]uint32

	// Trace data tables for deduplicating data going into the trace.
	// There are 2 of each: one for gen%2, one for 1-gen%2.
	stackTab  [2]traceStackTable  // maps stack traces to unique ids
	stringTab [2]traceStringTable // maps strings to unique ids

	// cpuLogRead accepts CPU profile samples from the signal handler where
	// they're generated. There are two profBufs here: one for gen%2, one for
	// 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G,
	// and M (respectively) that were active at the time of the sample. Because
	// profBuf uses a record with all zeros in its header to indicate overflow,
	// we ensure that the first header field is always non-zero: the ID of a real
	// P will start at bit 1, and bit 0 will be set. Samples that arrive while no
	// P is running (such as near syscalls) will set the first header field to 0b10.
	// This careful handling of the first header field allows us to store the ID of
	// the active G directly in the second field, even though that will be 0
	// when sampling g0.
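	//
	// For example (illustrative values only): a sample taken while P 5 is
	// running encodes 5<<1|1 = 0b1011 in the first header field, while a
	// sample taken with no P running encodes 0b10.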
	//
	// Initialization and teardown of these fields is protected by traceAdvanceSema.
	cpuLogRead  [2]*profBuf
	signalLock  atomic.Uint32              // protects use of the following member, only usable in signal handlers
	cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock
	cpuSleep    *wakeableSleep
	cpuLogDone  <-chan struct{}
	cpuBuf      [2]*traceBuf

	reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil

	// Fast mappings from enumerations to string IDs that are prepopulated
	// in the trace.
	markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg
	goStopReasons    [2][len(traceGoStopReasonStrings)]traceArg
	goBlockReasons   [2][len(traceBlockReasonStrings)]traceArg

	// Trace generation counter.
	gen            atomic.Uintptr
	lastNonZeroGen uintptr // last non-zero value of gen

	// shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
	//
	// Writes protected by trace.lock.
	shutdown atomic.Bool

	// Number of goroutines in the syscall-exit slow path.
	exitingSyscall atomic.Int32

	// seqGC is the sequence counter for GC begin/end.
	//
	// Mutated only during stop-the-world.
	seqGC uint64
}

// Trace public API.

var (
	traceAdvanceSema  uint32 = 1
	traceShutdownSema uint32 = 1
)

// StartTrace enables tracing for the current process.
// While tracing, the data will be buffered and available via [ReadTrace].
// StartTrace returns an error if tracing is already enabled.
// Most clients should use the [runtime/trace] package or the [testing] package's
// -test.trace flag instead of calling StartTrace directly.
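//
// A minimal sketch of the preferred runtime/trace route (the file name and
// error handling are illustrative):
//
//	f, err := os.Create("out.trace")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//	if err := trace.Start(f); err != nil {
//		log.Fatal(err)
//	}
//	defer trace.Stop()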
func StartTrace() error {
	if traceEnabled() || traceShuttingDown() {
		return errorString("tracing is already enabled")
	}
	// Block until cleanup of the last trace is done.
	semacquire(&traceShutdownSema)
	semrelease(&traceShutdownSema)

	// Hold traceAdvanceSema across trace start, since we'll want it on
	// the other side of tracing being enabled globally.
	semacquire(&traceAdvanceSema)

	// Initialize CPU profile -> trace ingestion.
	traceInitReadCPU()

	// Compute the first generation for this StartTrace.
	//
	// Note: we start from the last non-zero generation rather than 1 so we
	// can avoid resetting all the arrays indexed by gen%2 or gen%3. There's
	// more than one of each per m, p, and goroutine.
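	// For example, if the previous trace ended at generation 7, this trace
	// starts at generation 8, so the gen%2 and gen%3 indices continue from
	// where the old bookkeeping left off instead of colliding with it.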
	firstGen := traceNextGen(trace.lastNonZeroGen)

	// Reset GC sequencer.
	trace.seqGC = 1

	// Reset trace reader state.
	trace.headerWritten = false
	trace.readerGen.Store(firstGen)
	trace.flushedGen.Store(0)

	// Register some basic strings in the string tables.
	traceRegisterLabelsAndReasons(firstGen)

	// Stop the world.
	//
	// The purpose of stopping the world is to make sure that no goroutine is in a
	// context where it could emit an event by bringing all goroutines to a safe point
	// with no opportunity to transition.
	//
	// The exception to this rule is goroutines that are concurrently exiting a syscall.
	// Those will all be forced into the syscall-exit slow path, and we'll just make sure
	// that we don't observe any goroutines in that critical section before starting
	// the world again.
	//
	// A good follow-up question to this is why stopping the world is necessary at all
	// given that we have traceAcquire and traceRelease. Unfortunately, those only help
	// us when tracing is already active (for performance, so when tracing is off the
	// tracing seqlock is left untouched). The main issue here is subtle: we're going to
	// want to obtain a correct starting status for each goroutine, but there are windows
	// of time in which we could read and emit an incorrect status. Specifically:
	//
	//	trace := traceAcquire()
	//	// <----> problem window
	//	casgstatus(gp, _Gwaiting, _Grunnable)
	//	if trace.ok() {
	//		trace.GoUnpark(gp, 2)
	//		traceRelease(trace)
	//	}
	//
	// More precisely, if we readgstatus for a gp while another goroutine is in the problem
	// window and that goroutine didn't observe that tracing had begun, then we might write
	// a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking
	// the transition from GoWaiting to GoRunnable. The trace will then be broken, because
	// future events will be emitted assuming the tracer sees GoRunnable.
	//
	// In short, what we really need here is to make sure that the next time *any goroutine*
	// hits a traceAcquire, it sees that the trace is enabled.
	//
	// Note also that stopping the world is necessary to make sure sweep-related events are
	// coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
	// the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this.
	stw := stopTheWorld(stwStartTrace)

	// Prevent sysmon from running any code that could generate events.
	lock(&sched.sysmonlock)

	// Reset mSyscallID on all Ps while we have them stationary and the trace is disabled.
	for _, pp := range allp {
		pp.trace.mSyscallID = -1
	}

	// Start tracing.
	//
	// After this executes, other Ms may start creating trace buffers and emitting
	// data into them.
	trace.gen.Store(firstGen)

	// Wait for exitingSyscall to drain.
	//
	// It may not monotonically decrease to zero, but in the limit it will always become
	// zero because the world is stopped and there are no available Ps for syscall-exited
	// goroutines to run on.
	//
	// Because we set gen before checking this, and because exitingSyscall is always incremented
	// *after* traceAcquire (which checks gen), we can be certain that when exitingSyscall is
	// zero, any goroutine that goes to exit a syscall from then on *must* observe the new gen.
	//
	// The critical section on each goroutine here is going to be quite short, so the likelihood
	// that we observe a zero value is high.
	for trace.exitingSyscall.Load() != 0 {
		osyield()
	}

	// Record some initial pieces of information.
	//
	// N.B. This will also emit a status event for this goroutine.
	tl := traceAcquire()
	tl.Gomaxprocs(gomaxprocs)  // Get this as early in the trace as possible. See comment in traceAdvance.
	tl.STWStart(stwStartTrace) // We didn't trace this above, so trace it now.

	// Record the fact that a GC is active, if applicable.
	if gcphase == _GCmark || gcphase == _GCmarktermination {
		tl.GCActive()
	}

	// Record the heap goal so we have it at the very beginning of the trace.
	tl.HeapGoal()

	// Make sure a ProcStatus is emitted for every P, while we're here.
	for _, pp := range allp {
		tl.writer().writeProcStatusForP(pp, pp == tl.mp.p.ptr()).end()
	}
	traceRelease(tl)

	unlock(&sched.sysmonlock)
	startTheWorld(stw)

	traceStartReadCPU()
	traceAdvancer.start()

	semrelease(&traceAdvanceSema)
	return nil
}

// StopTrace stops tracing, if it was previously enabled.
// StopTrace only returns after all the reads for the trace have completed.
func StopTrace() {
	traceAdvance(true)
}

// traceAdvance moves tracing to the next generation, and cleans up the current generation,
// ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing
// altogether instead of advancing to the next generation.
//
// traceAdvanceSema must not be held.
func traceAdvance(stopTrace bool) {
	semacquire(&traceAdvanceSema)

	// Get the gen that we're advancing from. In this function we don't really care much
	// about the generation we're advancing _into_ since we'll do all the cleanup in this
	// generation for the next advancement.
	gen := trace.gen.Load()
	if gen == 0 {
		// We may end up here if traceAdvance is called concurrently with StopTrace.
		semrelease(&traceAdvanceSema)
		return
	}

	// Write an EvFrequency event for this generation.
	//
	// N.B. This may block for quite a while to get a good frequency estimate, so make sure we do
	// this here and not e.g. on the trace reader.
	traceFrequency(gen)

	// Collect all the untraced Gs.
	type untracedG struct {
		gp           *g
		goid         uint64
		mid          int64
		status       uint32
		waitreason   waitReason
		inMarkAssist bool
	}
	var untracedGs []untracedG
	forEachGRace(func(gp *g) {
		// Make absolutely sure all Gs are ready for the next
		// generation. We need to do this even for dead Gs because
		// they may come alive with a new identity, and their status
		// bookkeeping for the trace might end up being stale.
		// We may miss totally new goroutines, but they'll always
		// have clean bookkeeping.
		gp.trace.readyNextGen(gen)
		// If the status was traced, nothing else to do.
		if gp.trace.statusWasTraced(gen) {
			return
		}
		// Scribble down information about this goroutine.
		ug := untracedG{gp: gp, mid: -1}
		systemstack(func() {
			me := getg().m.curg
			// We don't have to handle this G status transition because we
			// already eliminated ourselves from consideration above.
			casGToWaiting(me, _Grunning, waitReasonTraceGoroutineStatus)
			// We need to suspend and take ownership of the G to safely read its
			// goid. Note that we can't actually emit the event at this point
			// because we might stop the G in a window where it's unsafe to write
			// events based on the G's status. We need the global trace buffer flush
			// coming up to make sure we're not racing with the G.
			//
			// It should be very unlikely that we try to preempt a running G here.
			// The only situation in which we might is when we're racing with a G
			// that's running for the first time in this generation. Therefore,
			// this should be relatively fast.
			s := suspendG(gp)
			if !s.dead {
				ug.goid = s.g.goid
				if s.g.m != nil {
					ug.mid = int64(s.g.m.procid)
				}
				ug.status = readgstatus(s.g) &^ _Gscan
				ug.waitreason = s.g.waitreason
				ug.inMarkAssist = s.g.inMarkAssist
			}
			resumeG(s)
			casgstatus(me, _Gwaiting, _Grunning)
		})
		if ug.goid != 0 {
			untracedGs = append(untracedGs, ug)
		}
	})

	if !stopTrace {
		// Re-register runtime goroutine labels and stop/block reasons.
		traceRegisterLabelsAndReasons(traceNextGen(gen))
	}

	// Now that we've done some of the heavy stuff, prevent the world from stopping.
	// This is necessary to ensure the consistency of the STW events. If we're feeling
	// adventurous we could lift this restriction and add a STWActive event, but the
	// cost of maintaining this consistency is low. We're not going to hold this semaphore
	// for very long and most STW periods are very short.
	// Once we hold worldsema, prevent preemption as well so we're not interrupted partway
	// through this. We want to get this done as soon as possible.
	semacquire(&worldsema)
	mp := acquirem()

	// Advance the generation or stop the trace.
	trace.lastNonZeroGen = gen
	if stopTrace {
		systemstack(func() {
			// Ordering is important here. Set shutdown first, then disable tracing,
			// so that conditions like (traceEnabled() || traceShuttingDown()) have
			// no opportunity to be false. Hold the trace lock so this update appears
			// atomic to the trace reader.
			lock(&trace.lock)
			trace.shutdown.Store(true)
			trace.gen.Store(0)
			unlock(&trace.lock)
		})
	} else {
		trace.gen.Store(traceNextGen(gen))
	}

	// Emit a ProcsChange event so we have one on record for each generation.
	// Let's emit it as soon as possible so that downstream tools can rely on the value
	// being there fairly soon in a generation.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because the procs count could change.
	if !stopTrace {
		tl := traceAcquire()
		tl.Gomaxprocs(gomaxprocs)
		traceRelease(tl)
	}

	// Emit a GCActive event in the new generation if necessary.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because that could emit global GC-related events.
	if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) {
		tl := traceAcquire()
		tl.GCActive()
		traceRelease(tl)
	}

	// Preemption is OK again after this. If the world stops after this point,
	// that's fine: we're just cleaning up the last generation.
	//
	// We also don't care if the GC starts again after this, for the same reasons.
	releasem(mp)
	semrelease(&worldsema)

	// Snapshot allm and freem.
	//
	// Snapshotting after the generation counter update is sufficient.
	// Because an m must be on either allm or sched.freem if it has an active trace
	// buffer, new threads added to allm after this point must necessarily observe
	// the new generation number (sched.lock acts as a barrier).
	//
	// Threads that exit before this point and are on neither list explicitly
	// flush their own buffers in traceThreadDestroy.
	//
	// Snapshotting freem is necessary because Ms can continue to emit events
	// while they're still on that list. Removal from sched.freem is serialized with
	// this snapshot, so either we'll capture an m on sched.freem and race with
	// the removal to flush its buffers (resolved by traceThreadDestroy acquiring
	// the thread's seqlock, which one of us must win, so at least its old gen buffer
	// will be flushed in time for the new generation) or it will have flushed its
	// buffers before we snapshotted it to begin with.
	lock(&sched.lock)
	mToFlush := allm
	for mp := mToFlush; mp != nil; mp = mp.alllink {
		mp.trace.link = mp.alllink
	}
	for mp := sched.freem; mp != nil; mp = mp.freelink {
		mp.trace.link = mToFlush
		mToFlush = mp
	}
	unlock(&sched.lock)

	// Iterate over our snapshot, flushing every buffer until we're done.
	//
	// Because trace writers read the generation while the seqlock is
	// held, we can be certain that when there are no writers there are
	// also no stale generation values left. Therefore, it's safe to flush
	// any buffers that remain in that generation's slot.
	const debugDeadlock = false
	systemstack(func() {
		// Track iterations for some rudimentary deadlock detection.
		i := 0
		detectedDeadlock := false

		for mToFlush != nil {
			prev := &mToFlush
			for mp := *prev; mp != nil; {
				if mp.trace.seqlock.Load()%2 != 0 {
					// The M is writing. Come back to it later.
					prev = &mp.trace.link
					mp = mp.trace.link
					continue
				}
				// Flush the trace buffer.
				//
				// trace.lock needed for traceBufFlush, but also to synchronize
				// with traceThreadDestroy, which flushes both buffers unconditionally.
				lock(&trace.lock)
				bufp := &mp.trace.buf[gen%2]
				if *bufp != nil {
					traceBufFlush(*bufp, gen)
					*bufp = nil
				}
				unlock(&trace.lock)

				// Remove the m from the flush list.
				*prev = mp.trace.link
				mp.trace.link = nil
				mp = *prev
			}
			// Yield only if we're going to be going around the loop again.
			if mToFlush != nil {
				osyield()
			}

			if debugDeadlock {
				// Try to detect a deadlock. We probably shouldn't loop here
				// this many times.
				if i > 100000 && !detectedDeadlock {
					detectedDeadlock = true
					println("runtime: failing to flush")
					for mp := mToFlush; mp != nil; mp = mp.trace.link {
						print("runtime: m=", mp.id, "\n")
					}
				}
				i++
			}
		}
	})

	// At this point, the old generation is fully flushed minus stack and string
	// tables, CPU samples, and goroutines that haven't run at all during the last
	// generation.

	// Check to see if any Gs still haven't had events written out for them.
	statusWriter := unsafeTraceWriter(gen, nil)
	for _, ug := range untracedGs {
		if ug.gp.trace.statusWasTraced(gen) {
			// It was traced, we don't need to do anything.
			continue
		}
		// It still wasn't traced. Because we ensured all Ms stopped writing trace
		// events to the last generation, that must mean the G never had its status
		// traced in gen between when we recorded it and now. If that's true, the goid
		// and status we recorded then are exactly what we want right now.
		status := goStatusToTraceGoStatus(ug.status, ug.waitreason)
		statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist)
	}
	statusWriter.flush().end()

	// Read everything out of the last gen's CPU profile buffer.
	traceReadCPU(gen)

	systemstack(func() {
		// Flush CPU samples, stacks, and strings for the last generation. This is safe,
		// because we're now certain no M is writing to the last generation.
		//
		// Ordering is important here. traceCPUFlush may generate new stacks and dumping
		// stacks may generate new strings.
		traceCPUFlush(gen)
		trace.stackTab[gen%2].dump(gen)
		trace.stringTab[gen%2].reset(gen)

		// That's it. This generation is done producing buffers.
		lock(&trace.lock)
		trace.flushedGen.Store(gen)
		unlock(&trace.lock)
	})

	if stopTrace {
		semacquire(&traceShutdownSema)

		// Finish off CPU profile reading.
		traceStopReadCPU()
	} else {
		// Go over each P and emit a status event for it if necessary.
		//
		// We do this at the beginning of the new generation instead of the
		// end like we do for goroutines because forEachP doesn't give us a
		// hook to skip Ps that have already been traced. Since we have to
		// preempt all Ps anyway, might as well stay consistent with StartTrace
		// which does this during the STW.
		semacquire(&worldsema)
		forEachP(waitReasonTraceProcStatus, func(pp *p) {
			tl := traceAcquire()
			if !pp.trace.statusWasTraced(tl.gen) {
				tl.writer().writeProcStatusForP(pp, false).end()
			}
			traceRelease(tl)
		})
		// Perform status reset on dead Ps because they just appear as idle.
		//
		// Holding worldsema prevents allp from changing.
		//
		// TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy
		// events to indicate whether a P exists, rather than just making its
		// existence implicit.
		for _, pp := range allp[len(allp):cap(allp)] {
			pp.trace.readyNextGen(traceNextGen(gen))
		}
		semrelease(&worldsema)
	}

	// Block until the trace reader has finished processing the last generation.
	semacquire(&trace.doneSema[gen%2])
	if raceenabled {
		raceacquire(unsafe.Pointer(&trace.doneSema[gen%2]))
	}

	// Double-check that things look as we expect after advancing and perform some
	// final cleanup if the trace has fully stopped.
	systemstack(func() {
		lock(&trace.lock)
		if !trace.full[gen%2].empty() {
			throw("trace: non-empty full trace buffer for done generation")
		}
		if stopTrace {
			if !trace.full[1-(gen%2)].empty() {
				throw("trace: non-empty full trace buffer for next generation")
			}
			if trace.reading != nil || trace.reader.Load() != nil {
				throw("trace: reading after shutdown")
			}
			// Free all the empty buffers.
			for trace.empty != nil {
				buf := trace.empty
				trace.empty = buf.link
				sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
			}
			// Clear trace.shutdown and other flags.
			trace.headerWritten = false
			trace.shutdown.Store(false)
		}
		unlock(&trace.lock)
	})

	if stopTrace {
		// Clear the sweep state on every P for the next time tracing is enabled.
		//
		// It may be stale in the next trace because we may have ended tracing in
		// the middle of a sweep on a P.
		//
		// It's fine not to call forEachP here because tracing is disabled and we
		// know at this point that nothing is calling into the tracer, but we do
		// need to look at dead Ps too just because GOMAXPROCS could have been called
		// at any point since we stopped tracing, and we have to ensure there's no
		// bad state on dead Ps too. Prevent a STW and a concurrent GOMAXPROCS that
		// might mutate allp by making ourselves briefly non-preemptible.
		mp := acquirem()
		for _, pp := range allp[:cap(allp)] {
			pp.trace.inSweep = false
			pp.trace.maySweep = false
			pp.trace.swept = 0
			pp.trace.reclaimed = 0
		}
		releasem(mp)
	}

	// Release the advance semaphore. If stopTrace is true we're still holding onto
	// traceShutdownSema.
	//
	// Do a direct handoff. Don't let one caller of traceAdvance starve
	// other calls to traceAdvance.
	semrelease1(&traceAdvanceSema, true, 0)

	if stopTrace {
		// Stop the traceAdvancer. We can't be holding traceAdvanceSema here because
		// we'll deadlock (we're blocked on the advancer goroutine exiting, but it
		// may be currently trying to acquire traceAdvanceSema).
		traceAdvancer.stop()
		semrelease(&traceShutdownSema)
	}
}

func traceNextGen(gen uintptr) uintptr {
	if gen == ^uintptr(0) {
		// gen is used both mod 2 and mod 3, and we want both patterns to continue
		// when we loop around. ^uint32(0) and ^uint64(0) are both odd and multiples
		// of 3. Therefore the next generation we want is even and one more than a
		// multiple of 3. The smallest such number is 4.
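		// (Check: 4%2 == 0 and 4%3 == 1, so both the mod-2 and mod-3 cycles
		// continue exactly as if gen had incremented without wrapping.)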
		return 4
	}
	return gen + 1
}

// traceRegisterLabelsAndReasons re-registers mark worker labels and
// goroutine stop/block reasons in the string table for the provided
// generation. Note: the provided generation must not have started yet.
func traceRegisterLabelsAndReasons(gen uintptr) {
	for i, label := range gcMarkWorkerModeStrings[:] {
		trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label))
	}
	for i, str := range traceBlockReasonStrings[:] {
		trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
	for i, str := range traceGoStopReasonStrings[:] {
		trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
}

// ReadTrace returns the next chunk of binary tracing data, blocking until data
// is available. If tracing is turned off and all the data accumulated while it
// was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time.
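//
// A minimal consumer loop might look like this (sketch only; w is an assumed
// io.Writer, and Write copies the data before the next call):
//
//	for {
//		data := ReadTrace()
//		if data == nil {
//			break
//		}
//		w.Write(data)
//	}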
func ReadTrace() []byte {
top:
	var buf []byte
	var park bool
	systemstack(func() {
		buf, park = readTrace0()
	})
	if park {
		gopark(func(gp *g, _ unsafe.Pointer) bool {
			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
				// We're racing with another reader.
				// Wake up and handle this case.
				return false
			}

			if g2 := traceReader(); gp == g2 {
				// New data arrived between unlocking
				// and the CAS and we won the wake-up
				// race, so wake up directly.
				return false
			} else if g2 != nil {
				printlock()
				println("runtime: got trace reader", g2, g2.goid)
				throw("unexpected trace reader")
			}

			return true
		}, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2)
		goto top
	}

	return buf
}

// readTrace0 is ReadTrace's continuation on g0. This must run on the
// system stack because it acquires trace.lock.
//
//go:systemstack
func readTrace0() (buf []byte, park bool) {
	if raceenabled {
		// g0 doesn't have a race context. Borrow the user G's.
		if getg().racectx != 0 {
			throw("expected racectx == 0")
		}
		getg().racectx = getg().m.curg.racectx
		// (This defer should get open-coded, which is safe on
		// the system stack.)
		defer func() { getg().racectx = 0 }()
	}

	// This function must not allocate while holding trace.lock:
	// allocation can call into the heap allocator, which will try to emit a
	// trace event while holding the heap lock.
	lock(&trace.lock)

	if trace.reader.Load() != nil {
		// More than one goroutine is reading the trace. This is bad.
		// But we'd rather not crash the program because of tracing,
		// since tracing can be enabled at runtime on production servers.
		unlock(&trace.lock)
		println("runtime: ReadTrace called from multiple goroutines simultaneously")
		return nil, false
	}
	// Recycle the old buffer.
	if buf := trace.reading; buf != nil {
		buf.link = trace.empty
		trace.empty = buf
		trace.reading = nil
	}
	// Write trace header.
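	//
	// The header is 16 bytes: the string "go 1.22 trace" followed by three
	// NUL bytes.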
	if !trace.headerWritten {
		trace.headerWritten = true
		unlock(&trace.lock)
		return []byte("go 1.22 trace\x00\x00\x00"), false
	}

	// Read the next buffer.

	if trace.readerGen.Load() == 0 {
		trace.readerGen.Store(1)
	}
	var gen uintptr
	for {
		assertLockHeld(&trace.lock)
		gen = trace.readerGen.Load()

		// Check to see if we need to block for more data in this generation
		// or if we need to move our generation forward.
		if !trace.full[gen%2].empty() {
			break
		}
		// Most of the time readerGen is one generation ahead of flushedGen, as the
		// current generation is being read from. Then, once the last buffer is flushed
		// into readerGen, flushedGen will rise to meet it. At this point, the tracer
		// is waiting on the reader to finish flushing the last generation so that it
		// can continue to advance.
		if trace.flushedGen.Load() == gen {
			if trace.shutdown.Load() {
				unlock(&trace.lock)

				// Wake up anyone waiting for us to be done with this generation.
				//
				// Do this after reading trace.shutdown, because the thread we're
				// waking up is going to clear trace.shutdown.
				if raceenabled {
					// Model synchronization on trace.doneSema, which the race
					// detector does not see. This is required to avoid false
					// race reports on the writer passed to trace.Start.
					racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
				}
				semrelease(&trace.doneSema[gen%2])

				// We're shutting down, and the last generation is fully
				// read. We're done.
				return nil, false
			}
			// The previous gen has had all of its buffers flushed, and
			// there's nothing else for us to read. Advance the generation
			// we're reading from and try again.
			trace.readerGen.Store(trace.gen.Load())
			unlock(&trace.lock)

			// Wake up anyone waiting for us to be done with this generation.
			//
			// Do this after reading gen to make sure we can't have the trace
			// advance until we've read it.
			if raceenabled {
				// See comment above in the shutdown case.
				racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
			}
			semrelease(&trace.doneSema[gen%2])

			// Reacquire the lock and go back to the top of the loop.
			lock(&trace.lock)
			continue
		}
		// Wait for new data.
		//
		// We don't simply use a note because the scheduler
		// executes this goroutine directly when it wakes up
		// (also a note would consume an M).
		//
		// Before we drop the lock, clear the workAvailable flag. Work can
		// only be queued with trace.lock held, so this is at least true until
		// we drop the lock.
		trace.workAvailable.Store(false)
		unlock(&trace.lock)
		return nil, true
	}
	// Pull a buffer.
	tbuf := trace.full[gen%2].pop()
	trace.reading = tbuf
	unlock(&trace.lock)
	return tbuf.arr[:tbuf.pos], false
}

// traceReader returns the trace reader that should be woken up, if any.
// Callers should first check (traceEnabled() || traceShuttingDown()).
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceReader() *g {
	gp := traceReaderAvailable()
	if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
		return nil
	}
	return gp
}

// traceReaderAvailable returns the trace reader if it is not currently
// scheduled and should be. Callers should first check that
// (traceEnabled() || traceShuttingDown()) is true.
func traceReaderAvailable() *g {
	// There are three conditions under which we definitely want to schedule
	// the reader, and we want to be careful not to schedule it otherwise:
	// - The reader is lagging behind in finishing off the last generation.
	//   In this case, trace buffers could even be empty, but the trace
	//   advancer will be waiting on the reader, so we have to make sure
	//   to schedule the reader ASAP.
	// - The reader has pending work to process for its reader generation
	//   (assuming readerGen is not lagging behind). Note that we also want
	//   to be careful *not* to schedule the reader if there's no work to do.
	// - The trace is shutting down. The trace stopper blocks on the reader
	//   to finish, much like trace advancement.
	if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() || trace.shutdown.Load() {
		return trace.reader.Load()
	}
	return nil
}

// Trace advancer goroutine.
var traceAdvancer traceAdvancerState

type traceAdvancerState struct {
	timer *wakeableSleep
	done  chan struct{}
}

// start starts a new traceAdvancer.
func (s *traceAdvancerState) start() {
	// Start a goroutine to periodically advance the trace generation.
	s.done = make(chan struct{})
	s.timer = newWakeableSleep()
	go func() {
		for traceEnabled() {
			// Set a timer to wake us up.
			s.timer.sleep(int64(debug.traceadvanceperiod))

			// Try to advance the trace.
			traceAdvance(false)
		}
		s.done <- struct{}{}
	}()
}

// stop stops a traceAdvancer and blocks until it exits.
func (s *traceAdvancerState) stop() {
	s.timer.wake()
	<-s.done
	close(s.done)
	s.timer.close()
}

// defaultTraceAdvancePeriod is the approximate period between
// new generations.
const defaultTraceAdvancePeriod = 1e9 // 1 second.

// wakeableSleep manages a wakeable goroutine sleep.
//
// Users of this type must call newWakeableSleep to initialize it and
// close to free up resources. Once close is called, a new wakeableSleep
// must be created before another use.
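//
// A typical lifecycle, sketched from the methods below (the 1e9 duration is
// illustrative):
//
//	s := newWakeableSleep()
//	s.sleep(1e9)  // sleeps up to 1s, or until another goroutine calls wake
//	s.close()     // once nothing is sleeping on s or waking it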
type wakeableSleep struct {
	timer *timer

	// lock protects access to wakeup, but not send/recv on it.
	lock   mutex
	wakeup chan struct{}
}

// newWakeableSleep initializes a new wakeableSleep and returns it.
func newWakeableSleep() *wakeableSleep {
	s := new(wakeableSleep)
	lockInit(&s.lock, lockRankWakeableSleep)
	s.wakeup = make(chan struct{}, 1)
	s.timer = new(timer)
	s.timer.arg = s
	s.timer.f = func(s any, _ uintptr) {
		s.(*wakeableSleep).wake()
	}
	return s
}

// sleep sleeps for the provided duration in nanoseconds or until
// another goroutine calls wake.
//
// Must not be called by more than one goroutine at a time and
// must not be called concurrently with close.
func (s *wakeableSleep) sleep(ns int64) {
	resetTimer(s.timer, nanotime()+ns)
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	<-wakeup
	stopTimer(s.timer)
}

// wake awakens any goroutine sleeping on the timer.
//
// Safe for concurrent use with all other methods.
func (s *wakeableSleep) wake() {
	// Grab the wakeup channel, which may be nil if we're
	// racing with close.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	if s.wakeup != nil {
		// Non-blocking send.
		//
		// Others may also write to this channel and we don't
		// want to block on the receiver waking up. This also
		// effectively batches together wakeup notifications.
		select {
		case s.wakeup <- struct{}{}:
		default:
		}
	}
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}

// close wakes any goroutine sleeping on the timer and prevents
// further sleeping on it.
//
// Once close is called, the wakeableSleep must no longer be used.
//
// It must only be called once no goroutine is sleeping on the
// timer *and* nothing else will call wake concurrently.
func (s *wakeableSleep) close() {
	// Set wakeup to nil so that a late timer ends up being a no-op.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	s.wakeup = nil

	// Close the channel.
	close(wakeup)

	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}
