// Copyright 2019 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package starlark

// This file defines a simple execution-time profiler for Starlark.
// It measures the wall time spent executing Starlark code, and emits a
// gzipped protocol message in pprof format (github.com/google/pprof).
//
// When profiling is enabled, the interpreter calls the profiler to
// indicate the start and end of each "span" or time interval. A leaf
// function (whether Go or Starlark) has a single span. A function that
// calls another function has spans for each interval in which it is the
// top of the stack. (A LOAD instruction also ends a span.)
//
// At the start of a span, the interpreter records the current time in
// the thread's topmost frame. At the end of the span, it obtains the
// time again and subtracts the span start time. The difference is added
// to an accumulator variable in the thread. If the accumulator exceeds
// some fixed quantum (10ms, say), the profiler records the current call
// stack and sends it to the profiler goroutine, along with the number
// of quanta, which are subtracted. For example, if the accumulator
// holds 3ms and then a completed span adds 25ms to it, its value is 28ms,
// which exceeds 10ms. The profiler records a stack with the value 20ms
// (2 quanta), and the accumulator is left with 8ms.
//
// The profiler goroutine converts the stacks into the pprof format and
// emits a gzip-compressed protocol message to the designated output
// file. We use a hand-written streaming proto encoder to avoid
// dependencies on pprof and proto, and to avoid the need to
// materialize the profile data structure in memory.
//
// A limitation of this profiler is that it measures wall time, which
// does not necessarily correspond to CPU time. A CPU profiler requires
// that only running (not runnable) threads are sampled; this is
// commonly achieved by having the kernel deliver a SIGPROF signal to an
// arbitrary running thread, through setitimer(2). The CPU profiler in the
// Go runtime uses this mechanism, but it is not possible for a Go
// application to register a SIGPROF handler, nor is it possible for a
// Go handler for some other signal to read the stack pointer of
// the interrupted thread.
//
// Two caveats:
// (1) it is tempting to send the leaf Frame directly to the profiler
// goroutine instead of making a copy of the stack, since a Frame is a
// spaghetti stack--a linked list. However, as soon as execution
// resumes, the stack's Frame.pc values may be mutated, so Frames are
// not safe to share with the asynchronous profiler goroutine.
// (2) it is tempting to use Callables as keys in a map when tabulating
// the pprof protocol's Function entities. However, we cannot assume
// that Callables are valid map keys, and furthermore we must not
// pin function values in memory indefinitely as this may cause lambda
// values to keep their free variables live much longer than necessary.
//
// TODO(adonovan):
// - make Start/Stop fully thread-safe.
// - fix the pc hack.
// - experiment with other values of quantum.

import (
	"bufio"
	"bytes"
	"compress/gzip"
	"encoding/binary"
	"fmt"
	"io"
	"log"
	"reflect"
	"sync/atomic"
	"time"
	"unsafe"

	"go.starlark.net/syntax"
)
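
// A typical client brackets Starlark execution with StartProfile and
// StopProfile. The sketch below is illustrative only: the output file
// name, the use of ExecFile, and the minimal error handling are
// assumptions of the example, not requirements of the API.
//
//	f, err := os.Create("starlark.pprof") // hypothetical output file
//	if err != nil {
//		log.Fatal(err)
//	}
//	if err := starlark.StartProfile(f); err != nil {
//		log.Fatal(err)
//	}
//	_, evalErr := starlark.ExecFile(thread, "script.star", nil, nil)
//	if err := starlark.StopProfile(); err != nil {
//		log.Fatal(err)
//	}
//	if err := f.Close(); err != nil {
//		log.Fatal(err)
//	}
//	// evalErr, if non-nil, reports an error from the Starlark program.
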
// StartProfile enables time profiling of all Starlark threads,
// and writes a profile in pprof format to w.
// It must be followed by a call to StopProfile to stop
// the profiler and finalize the profile.
//
// StartProfile returns an error if profiling was already enabled.
//
// StartProfile must not be called concurrently with Starlark execution.
func StartProfile(w io.Writer) error {
	if !atomic.CompareAndSwapUint32(&profiler.on, 0, 1) {
		return fmt.Errorf("profiler already running")
	}

	// TODO(adonovan): make the API fully concurrency-safe.
	// The main challenge is racy reads/writes of profiler.events,
	// and of send/close races on the channel it refers to.
	// It's easy to solve them with a mutex but harder to do
	// it efficiently.

	profiler.events = make(chan *profEvent, 1)
	profiler.done = make(chan error)

	go profile(w)

	return nil
}

// StopProfile stops the profiler started by a prior call to
// StartProfile and finalizes the profile. It returns an error if the
// profile could not be completed.
//
// StopProfile must not be called concurrently with Starlark execution.
func StopProfile() error {
	// Terminate the profiler goroutine and get its result.
	close(profiler.events)
	err := <-profiler.done

	profiler.done = nil
	profiler.events = nil
	atomic.StoreUint32(&profiler.on, 0)

	return err
}

// globals
var profiler struct {
	on     uint32          // nonzero => profiler running
	events chan *profEvent // profile events from interpreter threads
	done   chan error      // indicates profiler goroutine is ready
}

func (thread *Thread) beginProfSpan() {
	if profiler.events == nil {
		return // profiling not enabled
	}
	thread.frameAt(0).spanStart = nanotime()
}

// TODO(adonovan): experiment with smaller values,
// which trade space and time for greater precision.
const quantum = 10 * time.Millisecond

func (thread *Thread) endProfSpan() {
	if profiler.events == nil {
		return // profiling not enabled
	}

	// Add the span to the thread's accumulator.
	thread.proftime += time.Duration(nanotime() - thread.frameAt(0).spanStart)
	if thread.proftime < quantum {
		return
	}

	// Only record complete quanta.
	n := thread.proftime / quantum
	thread.proftime -= n * quantum

	// Copy the stack.
	// (We can't save thread.frame because its pc will change.)
	ev := &profEvent{
		thread: thread,
		time:   n * quantum,
	}
	ev.stack = ev.stackSpace[:0]
	for i := range thread.stack {
		fr := thread.frameAt(i)
		ev.stack = append(ev.stack, profFrame{
			pos: fr.Position(),
			fn:  fr.Callable(),
			pc:  fr.pc,
		})
	}

	profiler.events <- ev
}
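
// The quantization performed by endProfSpan can be pictured as the
// following hypothetical helper (a sketch for illustration, not part
// of the implementation):
//
//	// quantize splits an accumulated duration into the whole quanta
//	// to report now and the remainder to carry forward.
//	func quantize(acc time.Duration) (report, carry time.Duration) {
//		n := acc / quantum                  // e.g. 28ms / 10ms = 2
//		return n * quantum, acc - n*quantum // report 20ms, carry 8ms
//	}
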
type profEvent struct {
	thread     *Thread // currently unused
	time       time.Duration
	stack      []profFrame
	stackSpace [8]profFrame // initial space for stack
}

type profFrame struct {
	fn  Callable        // don't hold this live for too long (prevents GC of lambdas)
	pc  uint32          // program counter (Starlark frames only)
	pos syntax.Position // position of pc within this frame
}

// profile is the profiler goroutine.
// It runs until StopProfile is called.
func profile(w io.Writer) {
	// Field numbers from pprof protocol.
	// See https://github.com/google/pprof/blob/master/proto/profile.proto
	const (
		Profile_sample_type    = 1  // repeated ValueType
		Profile_sample         = 2  // repeated Sample
		Profile_mapping        = 3  // repeated Mapping
		Profile_location       = 4  // repeated Location
		Profile_function       = 5  // repeated Function
		Profile_string_table   = 6  // repeated string
		Profile_time_nanos     = 9  // int64
		Profile_duration_nanos = 10 // int64
		Profile_period_type    = 11 // ValueType
		Profile_period         = 12 // int64

		ValueType_type = 1 // int64
		ValueType_unit = 2 // int64

		Sample_location_id = 1 // repeated uint64
		Sample_value       = 2 // repeated int64
		Sample_label       = 3 // repeated Label

		Label_key      = 1 // int64
		Label_str      = 2 // int64
		Label_num      = 3 // int64
		Label_num_unit = 4 // int64

		Location_id         = 1 // uint64
		Location_mapping_id = 2 // uint64
		Location_address    = 3 // uint64
		Location_line       = 4 // repeated Line

		Line_function_id = 1 // uint64
		Line_line        = 2 // int64

		Function_id          = 1 // uint64
		Function_name        = 2 // int64
		Function_system_name = 3 // int64
		Function_filename    = 4 // int64
		Function_start_line  = 5 // int64
	)

	bufw := bufio.NewWriter(w) // write file in 4KB (not 240B flate-sized) chunks
	gz := gzip.NewWriter(bufw)
	enc := protoEncoder{w: gz}

	// strings
	stringIndex := make(map[string]int64)
	str := func(s string) int64 {
		i, ok := stringIndex[s]
		if !ok {
			i = int64(len(stringIndex))
			enc.string(Profile_string_table, s)
			stringIndex[s] = i
		}
		return i
	}
	str("") // entry 0
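
	// For illustration: the first distinct strings interned below are
	// "wall" and "nanoseconds" (indices 1 and 2, after the mandatory
	// empty string at index 0); repeated calls return the same index
	// without emitting another string_table entry. For example:
	//
	//	str("wall")        // emits the entry and returns 1
	//	str("nanoseconds") // emits the entry and returns 2
	//	str("wall")        // returns 1; nothing is emitted
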
	// functions
	//
	// function returns the ID of a Callable for use in Line.FunctionId.
	// The ID is the same as the function's logical address,
	// which is supplied by the caller to avoid the need to recompute it.
	functionId := make(map[uintptr]uint64)
	function := func(fn Callable, addr uintptr) uint64 {
		id, ok := functionId[addr]
		if !ok {
			id = uint64(addr)

			var pos syntax.Position
			if fn, ok := fn.(callableWithPosition); ok {
				pos = fn.Position()
			}

			name := fn.Name()
			if name == "" {
				name = pos.Filename()
			}

			nameIndex := str(name)

			fun := new(bytes.Buffer)
			funenc := protoEncoder{w: fun}
			funenc.uint(Function_id, id)
			funenc.int(Function_name, nameIndex)
			funenc.int(Function_system_name, nameIndex)
			funenc.int(Function_filename, str(pos.Filename()))
			funenc.int(Function_start_line, int64(pos.Line))
			enc.bytes(Profile_function, fun.Bytes())

			functionId[addr] = id
		}
		return id
	}

	// locations
	//
	// location returns the ID of the location denoted by fr.
	// For Starlark frames, this is the Frame pc.
	locationId := make(map[uintptr]uint64)
	location := func(fr profFrame) uint64 {
		fnAddr := profFuncAddr(fr.fn)

		// For Starlark functions, the frame position
		// represents the current PC value.
		// Mix it into the low bits of the address.
		// This is super hacky and may result in collisions
		// in large functions or if functions are numerous.
		// TODO(adonovan): fix: try making this cleaner by treating
		// each bytecode segment as a Profile.Mapping.
		pcAddr := fnAddr
		if _, ok := fr.fn.(*Function); ok {
			pcAddr = (pcAddr << 16) ^ uintptr(fr.pc)
		}

		id, ok := locationId[pcAddr]
		if !ok {
			id = uint64(pcAddr)

			line := new(bytes.Buffer)
			lineenc := protoEncoder{w: line}
			lineenc.uint(Line_function_id, function(fr.fn, fnAddr))
			lineenc.int(Line_line, int64(fr.pos.Line))

			loc := new(bytes.Buffer)
			locenc := protoEncoder{w: loc}
			locenc.uint(Location_id, id)
			locenc.uint(Location_address, uint64(pcAddr))
			locenc.bytes(Location_line, line.Bytes())
			enc.bytes(Profile_location, loc.Bytes())

			locationId[pcAddr] = id
		}
		return id
	}

	wallNanos := new(bytes.Buffer)
	wnenc := protoEncoder{w: wallNanos}
	wnenc.int(ValueType_type, str("wall"))
	wnenc.int(ValueType_unit, str("nanoseconds"))

	// informational fields of Profile
	enc.bytes(Profile_sample_type, wallNanos.Bytes())
	enc.int(Profile_period, quantum.Nanoseconds())     // magnitude of sampling period
	enc.bytes(Profile_period_type, wallNanos.Bytes())  // dimension and unit of period
	enc.int(Profile_time_nanos, time.Now().UnixNano()) // start (real) time of profile

	startNano := nanotime()

	// Read profile events from the channel
	// until it is closed by StopProfile.
	for e := range profiler.events {
		sample := new(bytes.Buffer)
		sampleenc := protoEncoder{w: sample}
		sampleenc.int(Sample_value, e.time.Nanoseconds()) // wall nanoseconds
		for _, fr := range e.stack {
			sampleenc.uint(Sample_location_id, location(fr))
		}
		enc.bytes(Profile_sample, sample.Bytes())
	}

	endNano := nanotime()
	enc.int(Profile_duration_nanos, endNano-startNano)

	err := gz.Close() // Close reports any prior write error
	if flushErr := bufw.Flush(); err == nil {
		err = flushErr
	}
	profiler.done <- err
}

// nanotime returns the time in nanoseconds since an arbitrary epoch;
// only differences between its results are meaningful.
// It is implemented by runtime.nanotime using the linkname hack;
// runtime.nanotime is defined for all OSs/ARCHs and uses the
// monotonic system clock, for which there is no portable API.
// Should that function ever go away, these alternatives exist:
//
//	// POSIX only. REALTIME not MONOTONIC. 17ns.
//	var tv syscall.Timeval
//	syscall.Gettimeofday(&tv) // can't fail
//	return tv.Nano()
//
//	// Portable. REALTIME not MONOTONIC. 46ns.
//	return time.Now().UnixNano()
//
//	// POSIX only. Adds a dependency.
//	import "golang.org/x/sys/unix"
//	var ts unix.Timespec
//	unix.ClockGettime(CLOCK_MONOTONIC, &ts) // can't fail
//	return unix.TimespecToNsec(ts)
//
//go:linkname nanotime runtime.nanotime
func nanotime() int64

// profFuncAddr returns the canonical "address"
// of a Callable for use by the profiler.
func profFuncAddr(fn Callable) uintptr {
	switch fn := fn.(type) {
	case *Builtin:
		return reflect.ValueOf(fn.fn).Pointer()
	case *Function:
		return uintptr(unsafe.Pointer(fn.funcode))
	}

	// User-defined callable types are typically
	// of kind pointer-to-struct. Handle them specially.
	if v := reflect.ValueOf(fn); v.Type().Kind() == reflect.Ptr {
		return v.Pointer()
	}

	// Address zero is reserved by the protocol.
	// Use 1 for callables we don't recognize.
	log.Printf("Starlark profiler: no address for Callable %T", fn)
	return 1
}

// We encode the protocol message by hand to avoid making
// the interpreter depend on both github.com/google/pprof
// and github.com/golang/protobuf.
//
// This also avoids the need to materialize a protocol message object
// tree of unbounded size and serialize it all at the end.
// The pprof format appears to have been designed to
// permit streaming implementations such as this one.
//
// See https://developers.google.com/protocol-buffers/docs/encoding.
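//
// As a worked example of the encoding (a sketch, assuming a 10ms
// quantum; Profile_period is field 12 of Profile):
//
//	var buf bytes.Buffer
//	enc := protoEncoder{w: &buf}
//	enc.int(12, (10 * time.Millisecond).Nanoseconds())
//	// buf now holds 0x60 0x80 0xad 0xe2 0x04:
//	// 0x60 is the tag (field 12 << 3 | wire type 0, varint);
//	// the rest is the varint encoding of 10,000,000.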
type protoEncoder struct {
	w   io.Writer // *bytes.Buffer or *gzip.Writer
	tmp [binary.MaxVarintLen64]byte
}

func (e *protoEncoder) uvarint(x uint64) {
	n := binary.PutUvarint(e.tmp[:], x)
	e.w.Write(e.tmp[:n])
}

func (e *protoEncoder) tag(field, wire uint) {
	e.uvarint(uint64(field<<3 | wire))
}

func (e *protoEncoder) string(field uint, s string) {
	e.tag(field, 2) // length-delimited
	e.uvarint(uint64(len(s)))
	io.WriteString(e.w, s)
}

func (e *protoEncoder) bytes(field uint, b []byte) {
	e.tag(field, 2) // length-delimited
	e.uvarint(uint64(len(b)))
	e.w.Write(b)
}

func (e *protoEncoder) uint(field uint, x uint64) {
	e.tag(field, 0) // varint
	e.uvarint(x)
}

func (e *protoEncoder) int(field uint, x int64) {
	e.tag(field, 0) // varint
	e.uvarint(uint64(x))
}
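
// The output written by profile is an ordinary gzip-compressed pprof
// profile, so it can be inspected with the standard tooling, for
// example (the file name is illustrative):
//
//	$ go tool pprof -top starlark.pprof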