File: metrics.go

package info (click to toggle)
golang-k8s-sigs-apiserver-network-proxy 0.33.0%2Bds1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,068 kB
  • sloc: makefile: 220; sh: 118
file content (343 lines) | stat: -rw-r--r-- 12,300 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"

	commonmetrics "sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/common/metrics"
	"sigs.k8s.io/apiserver-network-proxy/konnectivity-client/proto/client"
)

// Namespace and Subsystem are passed to every metric created in
// newServerMetrics and therefore form the common prefix of their names.
const (
	Namespace = "konnectivity_network_proxy"
	Subsystem = "server"
)

// Names of the gRPC service methods served by the proxy server.
const (
	// Proxy is the ProxyService method used to handle incoming streams.
	Proxy = "Proxy"

	// Connect is the AgentService method used to establish next hop.
	Connect = "Connect"
)

var (
	// Use buckets ranging from 1 microsecond to 12.5 seconds (values are in
	// seconds). Shared by the dial and frontend write latency histograms.
	latencyBuckets = []float64{0.000001, 0.00001, 0.0001, 0.005, 0.025, 0.1, 0.5, 2.5, 12.5}

	// Metrics provides access to all proxy server metrics. It is created by
	// newServerMetrics during package initialization.
	Metrics = newServerMetrics()
)

// ServerMetrics includes all the metrics of the proxy server.
//
// Every field is an unexported Prometheus collector populated by
// newServerMetrics; the methods on *ServerMetrics are the way to record
// values. Fields with a *Vec type are label-partitioned collectors, while
// httpConnections and culledLeases are a single Gauge and Counter.
type ServerMetrics struct {
	endpointLatencies    *prometheus.HistogramVec
	frontendLatencies    *prometheus.HistogramVec
	grpcConnections      *prometheus.GaugeVec
	httpConnections      prometheus.Gauge
	backend              *prometheus.GaugeVec
	pendingDials         *prometheus.GaugeVec
	establishedConns     *prometheus.GaugeVec
	fullRecvChannels     *prometheus.GaugeVec
	dialFailures         *prometheus.CounterVec
	streamPackets        *prometheus.CounterVec
	streamErrors         *prometheus.CounterVec
	culledLeases         prometheus.Counter
	leaseDeleteLatencies *prometheus.HistogramVec
	leaseDeletes         *prometheus.CounterVec
	leaseListLatencies   *prometheus.HistogramVec
	leaseLists           *prometheus.CounterVec
}

// newServerMetrics creates a new ServerMetrics, configured with default metric
// names, and registers every collector with the default Prometheus registry.
//
// Registration uses prometheus.MustRegister, which panics when a collector
// cannot be registered (for example on a duplicate registration), so this
// function is meant to be called once per process.
func newServerMetrics() *ServerMetrics {
	endpointLatencies := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "dial_duration_seconds",
			Help:      "Latency of dial to the remote endpoint in seconds",
			Buckets:   latencyBuckets,
		},
		[]string{},
	)
	frontendLatencies := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "frontend_write_duration_seconds",
			Help:      "Latency of write to the frontend in seconds",
			Buckets:   latencyBuckets,
		},
		[]string{},
	)
	grpcConnections := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "grpc_connections",
			Help:      "Number of current grpc connections, partitioned by service method.",
		},
		[]string{
			"service_method",
		},
	)
	httpConnections := prometheus.NewGauge(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "http_connections",
			Help:      "Number of current HTTP CONNECT connections",
		},
	)
	backend := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "ready_backend_connections",
			Help:      "Number of konnectivity agent connected to the proxy server",
		},
		[]string{},
	)
	pendingDials := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "pending_backend_dials",
			Help:      "Current number of pending backend dial requests",
		},
		[]string{},
	)
	establishedConns := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "established_connections",
			Help:      "Current number of established end-to-end connections (post-dial).",
		},
		[]string{},
	)
	fullRecvChannels := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "full_receive_channels",
			Help:      "Number of current connections blocked by a full receive channel, partitioned by service method.",
		},
		[]string{
			"service_method",
		},
	)
	dialFailures := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "dial_failure_count",
			Help:      "Number of dial failures observed. Multiple failures can occur for a single dial request.",
		},
		[]string{
			"reason",
		},
	)
	culledLeases := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: Namespace,
		Subsystem: Subsystem,
		Name:      "culled_leases_count",
		Help:      "Count of expired leases that the lease garbage collection controller has culled.",
	})
	// The lease latency histograms do not set Buckets, so the client library's
	// default buckets apply rather than latencyBuckets.
	leaseDeleteLatencies := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "lease_delete_latency_seconds",
			Help:      "Latency of lease deletion calls by the garbage collection controller in seconds.",
		},
		[]string{"http_status_code"},
	)
	leaseDeletes := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "lease_delete_total",
			Help:      "Count of lease deletion calls by the garbage collection controller. Labeled by HTTP status code and reason.",
		},
		[]string{"http_status_code", "reason"},
	)
	leaseListLatencies := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "lease_list_latency_seconds",
			Help:      "Latency of lease list calls by the garbage collection controller in seconds.",
		},
		[]string{"http_status_code"},
	)
	leaseLists := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "lease_list_total",
			Help:      "Count of lease list calls by the garbage collection controller. Labeled by HTTP status code and reason.",
		},
		[]string{"http_status_code", "reason"},
	)
	streamPackets := commonmetrics.MakeStreamPacketsTotalMetric(Namespace, Subsystem)
	streamErrors := commonmetrics.MakeStreamErrorsTotalMetric(Namespace, Subsystem)

	// Collectors are registered in the listed order; the first failure panics.
	prometheus.MustRegister(
		endpointLatencies,
		frontendLatencies,
		grpcConnections,
		httpConnections,
		backend,
		pendingDials,
		establishedConns,
		fullRecvChannels,
		dialFailures,
		streamPackets,
		streamErrors,
		culledLeases,
		leaseDeleteLatencies,
		leaseDeletes,
		leaseListLatencies,
		leaseLists,
	)

	return &ServerMetrics{
		endpointLatencies:    endpointLatencies,
		frontendLatencies:    frontendLatencies,
		grpcConnections:      grpcConnections,
		httpConnections:      httpConnections,
		backend:              backend,
		pendingDials:         pendingDials,
		establishedConns:     establishedConns,
		fullRecvChannels:     fullRecvChannels,
		dialFailures:         dialFailures,
		streamPackets:        streamPackets,
		streamErrors:         streamErrors,
		culledLeases:         culledLeases,
		leaseDeleteLatencies: leaseDeleteLatencies,
		leaseDeletes:         leaseDeletes,
		leaseListLatencies:   leaseListLatencies,
		leaseLists:           leaseLists,
	}
}

// Reset resets the label-partitioned (Vec) metrics, deleting all series they
// currently hold, including the lease delete/list counters and histograms.
//
// httpConnections and culledLeases are a plain Gauge and Counter, which offer
// no Reset method, so their values are left untouched.
func (s *ServerMetrics) Reset() {
	s.endpointLatencies.Reset()
	s.frontendLatencies.Reset()
	s.grpcConnections.Reset()
	s.backend.Reset()
	s.pendingDials.Reset()
	s.establishedConns.Reset()
	s.fullRecvChannels.Reset()
	s.dialFailures.Reset()
	s.streamPackets.Reset()
	s.streamErrors.Reset()
	s.leaseDeleteLatencies.Reset()
	s.leaseDeletes.Reset()
	s.leaseListLatencies.Reset()
	s.leaseLists.Reset()
}

// CulledLeasesInc adds one to the counter of leases culled by the lease
// garbage collection controller.
func (s *ServerMetrics) CulledLeasesInc() {
	culled := s.culledLeases
	culled.Inc()
}

// ObserveDialLatency adds one observation, expressed in seconds, of how long a
// dial to the remote endpoint took.
func (s *ServerMetrics) ObserveDialLatency(elapsed time.Duration) {
	dialHistogram := s.endpointLatencies.WithLabelValues()
	dialHistogram.Observe(elapsed.Seconds())
}

// ObserveFrontendWriteLatency adds one observation, expressed in seconds, of
// the time spent blocked on a stream send to the client.
func (s *ServerMetrics) ObserveFrontendWriteLatency(elapsed time.Duration) {
	writeHistogram := s.frontendLatencies.WithLabelValues()
	writeHistogram.Observe(elapsed.Seconds())
}

// ConnectionInc records a new grpc client connection by incrementing the
// connection gauge for the given service method.
func (s *ServerMetrics) ConnectionInc(serviceMethod string) {
	labels := prometheus.Labels{"service_method": serviceMethod}
	s.grpcConnections.With(labels).Inc()
}

// ConnectionDec records a finished grpc client connection by decrementing the
// connection gauge for the given service method.
func (s *ServerMetrics) ConnectionDec(serviceMethod string) {
	labels := prometheus.Labels{"service_method": serviceMethod}
	s.grpcConnections.With(labels).Dec()
}

// HTTPConnectionInc records a new HTTP CONNECT connection by incrementing the
// HTTP connection gauge.
func (s *ServerMetrics) HTTPConnectionInc() {
	s.httpConnections.Inc()
}

// HTTPConnectionDec records a finished HTTP CONNECT connection by decrementing
// the HTTP connection gauge.
func (s *ServerMetrics) HTTPConnectionDec() {
	s.httpConnections.Dec()
}

// SetBackendCount overwrites the ready backend connections gauge with count.
func (s *ServerMetrics) SetBackendCount(count int) {
	backendGauge := s.backend.WithLabelValues()
	backendGauge.Set(float64(count))
}

// SetPendingDialCount overwrites the pending backend dials gauge with count.
func (s *ServerMetrics) SetPendingDialCount(count int) {
	pendingGauge := s.pendingDials.WithLabelValues()
	pendingGauge.Set(float64(count))
}

// SetEstablishedConnCount overwrites the established connections gauge with
// count.
func (s *ServerMetrics) SetEstablishedConnCount(count int) {
	establishedGauge := s.establishedConns.WithLabelValues()
	establishedGauge.Set(float64(count))
}

// FullRecvChannel returns the gauge that counts full receive channels for the
// given service method, so the caller can adjust it directly.
func (s *ServerMetrics) FullRecvChannel(serviceMethod string) prometheus.Gauge {
	labels := prometheus.Labels{"service_method": serviceMethod}
	return s.fullRecvChannels.With(labels)
}

// DialFailureReason is the value of the "reason" label recorded for a dial
// failure.
type DialFailureReason string

const (
	// DialFailureNoAgent: no available agent is connected.
	DialFailureNoAgent DialFailureReason = "no_agent"

	// DialFailureErrorResponse: dial failure reported by the agent back to
	// the server.
	DialFailureErrorResponse DialFailureReason = "error_response"

	// DialFailureUnrecognizedResponse: dial response received for an
	// unrecognized dial ID.
	DialFailureUnrecognizedResponse DialFailureReason = "unrecognized_response"

	// DialFailureSendResponse: successful dial response from the agent, but
	// sending it to the frontend failed.
	DialFailureSendResponse DialFailureReason = "send_rsp"

	// DialFailureBackendClose: received a DIAL_CLS from the backend before
	// the dial completed.
	DialFailureBackendClose DialFailureReason = "backend_close"

	// DialFailureFrontendClose: received a DIAL_CLS from the frontend before
	// the dial completed.
	DialFailureFrontendClose DialFailureReason = "frontend_close"
)

// ObserveDialFailure increments the dial failure counter for the given reason.
func (s *ServerMetrics) ObserveDialFailure(reason DialFailureReason) {
	labels := prometheus.Labels{"reason": string(reason)}
	s.dialFailures.With(labels).Inc()
}

// ObservePacket records a packet of the given type on the given segment in the
// server's stream packets counter, delegating to commonmetrics.ObservePacket.
func (s *ServerMetrics) ObservePacket(segment commonmetrics.Segment, packetType client.PacketType) {
	packets := s.streamPackets
	commonmetrics.ObservePacket(packets, segment, packetType)
}

// ObserveStreamErrorNoPacket records a stream error on the given segment that
// is not tied to a packet type, delegating to
// commonmetrics.ObserveStreamErrorNoPacket with the server's stream errors
// counter.
func (s *ServerMetrics) ObserveStreamErrorNoPacket(segment commonmetrics.Segment, err error) {
	streamErrs := s.streamErrors
	commonmetrics.ObserveStreamErrorNoPacket(streamErrs, segment, err)
}

// ObserveStreamError records a stream error for the given segment and packet
// type, delegating to commonmetrics.ObserveStreamError with the server's
// stream errors counter.
func (s *ServerMetrics) ObserveStreamError(segment commonmetrics.Segment, err error, packetType client.PacketType) {
	streamErrs := s.streamErrors
	commonmetrics.ObserveStreamError(streamErrs, segment, err, packetType)
}

// ObserveLeaseDeleteLatency adds one observation, in seconds, to the lease
// delete latency histogram, labeled with the decimal HTTP status code.
func (s *ServerMetrics) ObserveLeaseDeleteLatency(httpCode int, latency time.Duration) {
	status := strconv.Itoa(httpCode)
	s.leaseDeleteLatencies.WithLabelValues(status).Observe(latency.Seconds())
}

// ObserveLeaseDelete increments the lease delete counter, labeled with the
// decimal HTTP status code and the given reason.
func (s *ServerMetrics) ObserveLeaseDelete(httpCode int, reason string) {
	status := strconv.Itoa(httpCode)
	s.leaseDeletes.WithLabelValues(status, reason).Inc()
}

// ObserveLeaseListLatency adds one observation, in seconds, to the lease list
// latency histogram, labeled with the decimal HTTP status code.
func (s *ServerMetrics) ObserveLeaseListLatency(httpCode int, latency time.Duration) {
	status := strconv.Itoa(httpCode)
	s.leaseListLatencies.WithLabelValues(status).Observe(latency.Seconds())
}

// ObserveLeaseList increments the lease list counter, labeled with the decimal
// HTTP status code and the given reason.
func (s *ServerMetrics) ObserveLeaseList(httpCode int, reason string) {
	status := strconv.Itoa(httpCode)
	s.leaseLists.WithLabelValues(status, reason).Inc()
}