package session

import (
	"context"
	"math"
	"net"
	"sync/atomic"
	"time"

	"github.com/containerd/containerd/defaults"
	grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
	"github.com/moby/buildkit/util/bklog"
	"github.com/moby/buildkit/util/grpcerrors"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"go.opentelemetry.io/otel/trace"
	"golang.org/x/net/http2"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/health/grpc_health_v1"
)
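
// serve dispatches HTTP/2 (gRPC) traffic for grpcServer over an
// already-established net.Conn, closing the connection once ctx is canceled.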
func serve(ctx context.Context, grpcServer *grpc.Server, conn net.Conn) {
	go func() {
		<-ctx.Done()
		conn.Close()
	}()
	bklog.G(ctx).Debugf("serving grpc connection")
	(&http2.Server{}).ServeConn(conn, &http2.ServeConnOpts{Handler: grpcServer})
}
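
// grpcClientConn turns an existing net.Conn into a *grpc.ClientConn. The
// connection is single-use: the custom dialer hands conn to gRPC exactly once
// and rejects any redial attempt. OpenTelemetry interceptors are attached only
// when the caller's context carries a valid trace span. The returned context
// is canceled once the connection's health monitor exits.
//
// A rough usage sketch (the net.Pipe pairing is illustrative only, not how
// callers in this package obtain their conns):
//
//	clientConn, serverConn := net.Pipe()
//	go serve(ctx, grpcServer, serverConn)
//	ctx, cc, err := grpcClientConn(ctx, clientConn)
//	if err != nil {
//		return err
//	}
//	defer cc.Close()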
func grpcClientConn(ctx context.Context, conn net.Conn) (context.Context, *grpc.ClientConn, error) {
	var unary []grpc.UnaryClientInterceptor
	var stream []grpc.StreamClientInterceptor

	// The provided net.Conn backs the whole client connection, so refuse any
	// attempt by gRPC to dial a second time.
	var dialCount int64
	dialer := grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
		if c := atomic.AddInt64(&dialCount, 1); c > 1 {
			return nil, errors.Errorf("only one connection allowed")
		}
		return conn, nil
	})

	dialOpts := []grpc.DialOption{
		dialer,
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(defaults.DefaultMaxRecvMsgSize)),
		grpc.WithDefaultCallOptions(grpc.MaxCallSendMsgSize(defaults.DefaultMaxSendMsgSize)),
	}

	if span := trace.SpanFromContext(ctx); span.SpanContext().IsValid() {
		unary = append(unary, filterClient(otelgrpc.UnaryClientInterceptor(otelgrpc.WithTracerProvider(span.TracerProvider()), otelgrpc.WithPropagators(propagators)))) //nolint:staticcheck // TODO(thaJeztah): ignore SA1019 for deprecated options: see https://github.com/moby/buildkit/issues/4681
		stream = append(stream, otelgrpc.StreamClientInterceptor(otelgrpc.WithTracerProvider(span.TracerProvider()), otelgrpc.WithPropagators(propagators))) //nolint:staticcheck // TODO(thaJeztah): ignore SA1019 for deprecated options: see https://github.com/moby/buildkit/issues/4681
	}

	unary = append(unary, grpcerrors.UnaryClientInterceptor)
	stream = append(stream, grpcerrors.StreamClientInterceptor)

	// gRPC accepts only a single interceptor per direction, so chain when
	// more than one has been collected.
	if len(unary) == 1 {
		dialOpts = append(dialOpts, grpc.WithUnaryInterceptor(unary[0]))
	} else if len(unary) > 1 {
		dialOpts = append(dialOpts, grpc.WithUnaryInterceptor(grpc_middleware.ChainUnaryClient(unary...)))
	}
	if len(stream) == 1 {
		dialOpts = append(dialOpts, grpc.WithStreamInterceptor(stream[0]))
	} else if len(stream) > 1 {
		dialOpts = append(dialOpts, grpc.WithStreamInterceptor(grpc_middleware.ChainStreamClient(stream...)))
	}

	cc, err := grpc.DialContext(ctx, "localhost", dialOpts...)
	if err != nil {
		return nil, nil, errors.Wrap(err, "failed to create grpc client")
	}

	ctx, cancel := context.WithCancelCause(ctx)
	go monitorHealth(ctx, cc, cancel)

	return ctx, cc, nil
}
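
// monitorHealth polls the standard gRPC health service over cc every five
// seconds until ctx is canceled or a check fails twice without an intervening
// run of five consecutive successes. The per-check timeout starts at 30s and
// grows to 1.5x the previous check's duration, so a slow but responsive
// connection is not killed prematurely. On exit it closes cc and cancels the
// connection's context.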
func monitorHealth(ctx context.Context, cc *grpc.ClientConn, cancelConn func(error)) {
	defer cancelConn(errors.WithStack(context.Canceled))
	defer cc.Close()

	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	healthClient := grpc_health_v1.NewHealthClient(cc)

	failedBefore := false
	consecutiveSuccessful := 0
	defaultHealthcheckDuration := 30 * time.Second
	lastHealthcheckDuration := time.Duration(0)

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			// This healthcheck can erroneously fail in some instances, such as
			// receiving lots of data in a low-bandwidth scenario or running too
			// many concurrent builds. So the healthcheck timeout is purposely
			// long, and a limited number of failures is tolerated by design.
			healthcheckStart := time.Now()

			// Grow the timeout with the observed duration of the previous
			// check, never dropping below the default.
			timeout := time.Duration(math.Max(float64(defaultHealthcheckDuration), float64(lastHealthcheckDuration)*1.5))

			// Use a distinct variable for the per-check context so the outer
			// ctx is not shadowed; the error path below must inspect the
			// session context, not the already-canceled check context.
			checkCtx, cancel := context.WithCancelCause(ctx)
			checkCtx, _ = context.WithTimeoutCause(checkCtx, timeout, errors.WithStack(context.DeadlineExceeded))
			_, err := healthClient.Check(checkCtx, &grpc_health_v1.HealthCheckRequest{})
			cancel(errors.WithStack(context.Canceled))

			lastHealthcheckDuration = time.Since(healthcheckStart)
			logFields := logrus.Fields{
				"timeout":        timeout,
				"actualDuration": lastHealthcheckDuration,
			}

			if err != nil {
				// If the session context itself is done, exit quietly rather
				// than reporting a failure.
				select {
				case <-ctx.Done():
					return
				default:
				}
				if failedBefore {
					bklog.G(ctx).Error("healthcheck failed fatally")
					return
				}
				failedBefore = true
				consecutiveSuccessful = 0
				bklog.G(ctx).WithFields(logFields).Warn("healthcheck failed")
			} else {
				consecutiveSuccessful++
				// Only clear the failure flag after a sustained run of
				// successful checks.
				if consecutiveSuccessful >= 5 && failedBefore {
					failedBefore = false
					bklog.G(ctx).WithFields(logFields).Debug("reset healthcheck failure")
				}
			}
			bklog.G(ctx).WithFields(logFields).Trace("healthcheck completed")
		}
	}
}