1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294
|
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package crashmonitor
// This file defines a monitor that reports arbitrary Go runtime
// crashes to telemetry.
import (
"bytes"
"fmt"
"io"
"log"
"os"
"os/exec"
"reflect"
"runtime/debug"
"strconv"
"strings"
"cuelang.org/go/internal/golangorgx/telemetry/internal/counter"
)
// Supported reports whether the runtime supports [runtime.SetCrashOutput].
//
// TODO(adonovan): eliminate once go1.23+ is assured.
func Supported() bool { return setCrashOutput != nil }
var setCrashOutput func(*os.File) error // = runtime.SetCrashOutput on go1.23+
// Start starts the monitor process, which performs automated
// reporting of unexpected crashes via Go telemetry. Call this
// function once immediately after [counter.Open]() within the main
// function of your application, before argument parsing.
//
// This function re-executes the current executable as a child
// process, in a special mode. In that mode, the call to Start will
// never return.
//
// The application should avoid doing expensive work in init functions
// as they will be executed twice. Run with GODEBUG=inittrace=1 to
// display the running time of each package initializer.
//
// Start uses the [debug.SetCrashOutput] mechanism, which is a
// process-wide resource. Do not make other calls to that function
// within your application. Start is a no-op unless the program is
// built with go1.23+.
func Start() {
if !Supported() {
return
}
const crashmonitorVar = "X_TELEMETRY_CRASHMONITOR"
if os.Getenv(crashmonitorVar) != "" {
// This process is the crashmonitor (child).
log.SetFlags(0)
log.SetPrefix("crashmonitor: ")
// Wait for parent process's dying gasp.
// If the parent dies for any reason this read will return.
data, err := io.ReadAll(os.Stdin)
if err != nil {
log.Fatalf("failed to read from input pipe: %v", err)
}
// If the only line is the sentinel, it wasn't a crash.
if bytes.Count(data, []byte("\n")) < 2 {
os.Exit(0) // parent exited without crash report
}
log.Printf("parent reported crash:\n%s", data)
// Parse the stack out of the crash report
// and record a telemetry count for it.
name, err := telemetryCounterName(data)
if err != nil {
// Keep count of how often this happens
// so that we can investigate if necessary.
incrementCounter("crash/malformed")
// Something went wrong.
// Save the crash securely in the file system.
f, err := os.CreateTemp(os.TempDir(), "*.crash")
if err != nil {
log.Fatal(err)
}
if _, err := f.Write(data); err != nil {
log.Fatal(err)
}
if err := f.Close(); err != nil {
log.Fatal(err)
}
log.Printf("failed to report crash to telemetry: %v", err)
log.Fatalf("crash report saved at %s", f.Name())
}
incrementCounter(name)
log.Fatalf("telemetry crash recorded")
}
// This process is the application (parent).
// Fork+exec the crashmonitor (child).
exe, err := os.Executable()
if err != nil {
log.Fatal(err)
}
cmd := exec.Command(exe, "** crashmonitor **") // this unused arg is just for ps(1)
cmd.Env = append(os.Environ(), crashmonitorVar+"=1")
cmd.Stderr = os.Stderr
cmd.Stdout = os.Stderr
pipe, err := cmd.StdinPipe()
if err != nil {
log.Fatalf("StdinPipe: %v", err)
}
writeSentinel(pipe)
// Ensure that we get pc=0x%x values in the traceback.
debug.SetTraceback("system")
setCrashOutput(pipe.(*os.File)) // (this conversion is safe)
if err := cmd.Start(); err != nil {
log.Fatalf("can't start crash monitor: %v", err)
}
// Now return and run the application proper...
}
// (stubbed by test)
var incrementCounter = func(name string) { counter.New(name).Inc() }
// The sentinel function returns its address. The difference between
// this value as observed by calls in two different processes of the
// same executable tells us the relative offset of their text segments.
//
// It would be nice if SetCrashOutput took care of this as it's fiddly
// and likely to confuse every user at first.
func sentinel() uint64 {
return uint64(reflect.ValueOf(sentinel).Pointer())
}
func writeSentinel(out io.Writer) {
fmt.Fprintf(out, "sentinel %x\n", sentinel())
}
// telemetryCounterName parses a crash report produced by the Go
// runtime, extracts the stack of the first runnable goroutine,
// converts each line into telemetry form ("symbol:relative-line"),
// and returns this as the name of a counter.
func telemetryCounterName(crash []byte) (string, error) {
pcs, err := parseStackPCs(string(crash))
if err != nil {
return "", err
}
// Limit the number of frames we request.
pcs = pcs[:min(len(pcs), 16)]
if len(pcs) == 0 {
// This can occur if all goroutines are idle, as when
// caught in a deadlock, or killed by an async signal
// while blocked.
//
// TODO(adonovan): consider how to report such
// situations. Reporting a goroutine in [sleep] or
// [select] state could be quite confusing without
// further information about the nature of the crash,
// as the problem is not local to the code location.
//
// For now, we keep count of this situation so that we
// can access whether it needs a more involved solution.
return "crash/no-running-goroutine", nil
}
// This string appears at the start of all
// crashmonitor-generated counter names.
//
// It is tempting to expose this as a parameter of Start, but
// it is not without risk. What value should most programs
// provide? There's no point giving the name of the executable
// as this is already recorded by telemetry. What if the
// application runs in multiple modes? Then it might be useful
// to record the mode. The problem is that an application with
// multiple modes probably doesn't know its mode by line 1 of
// main.main: it might require flag or argument parsing, or
// even validation of an environment variable, and we really
// want to steer users aware from any logic before Start. The
// flags and arguments will be wrong in the child process, and
// every extra conditional branch creates a risk that the
// recursively executed child program will behave not like the
// monitor but like the application. If the child process
// exits before calling Start, then the parent application
// will not have a monitor, and its crash reports will be
// discarded (written in to a pipe that is never read).
//
// So for now, we use this constant string.
const prefix = "crash/crash"
return counter.EncodeStack(pcs, prefix), nil
}
// parseStackPCs parses the parent process's program counters for the
// first running goroutine out of a GOTRACEBACK=system traceback,
// adjusting them so that they are valid for the child process's text
// segment.
//
// This function returns only program counter values, ensuring that
// there is no possibility of strings from the crash report (which may
// contain PII) leaking into the telemetry system.
func parseStackPCs(crash string) ([]uintptr, error) {
// getPC parses the PC out of a line of the form:
// \tFILE:LINE +0xRELPC sp=... fp=... pc=...
getPC := func(line string) (uint64, error) {
_, pcstr, ok := strings.Cut(line, " pc=") // e.g. pc=0x%x
if !ok {
return 0, fmt.Errorf("no pc= for stack frame: %s", line)
}
return strconv.ParseUint(pcstr, 0, 64) // 0 => allow 0x prefix
}
var (
pcs []uintptr
parentSentinel uint64
childSentinel = sentinel()
on = false // are we in the first running goroutine?
lines = strings.Split(crash, "\n")
)
for i := 0; i < len(lines); i++ {
line := lines[i]
// Read sentinel value.
if parentSentinel == 0 && strings.HasPrefix(line, "sentinel ") {
_, err := fmt.Sscanf(line, "sentinel %x", &parentSentinel)
if err != nil {
return nil, fmt.Errorf("can't read sentinel line")
}
continue
}
// Search for "goroutine GID [STATUS]"
if !on {
if strings.HasPrefix(line, "goroutine ") &&
strings.Contains(line, " [running]:") {
on = true
if parentSentinel == 0 {
return nil, fmt.Errorf("no sentinel value in crash report")
}
}
continue
}
// A blank line marks end of a goroutine stack.
if line == "" {
break
}
// Skip the final "created by SYMBOL in goroutine GID" part.
if strings.HasPrefix(line, "created by ") {
break
}
// Expect a pair of lines:
// SYMBOL(ARGS)
// \tFILE:LINE +0xRELPC sp=0x%x fp=0x%x pc=0x%x
// Note: SYMBOL may contain parens "pkg.(*T).method"
// The RELPC is sometimes missing.
// Skip the symbol(args) line.
i++
if i == len(lines) {
break
}
line = lines[i]
// Parse the PC, and correct for the parent and child's
// different mappings of the text section.
pc, err := getPC(line)
if err != nil {
// Inlined frame, perhaps; skip it.
continue
}
pcs = append(pcs, uintptr(pc-parentSentinel+childSentinel))
}
return pcs, nil
}
func min(x, y int) int {
if x < y {
return x
} else {
return y
}
}
|