// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package stack
import (
"context"
"time"
"gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/internal/tcp"
"gvisor.dev/gvisor/pkg/tcpip/seqnum"
)
// contextID is this package's type for context.Context.Value keys. Using a
// distinct named type keeps keys declared here from comparing equal to keys
// declared by other packages.
type contextID int
const (
// CtxRestoreStack is the Context.Value key under which the stack to be used
// during restore is stored. RestoreStackFromContext type-asserts the stored
// value to *Stack.
CtxRestoreStack contextID = iota
)
// RestoreStackFromContext returns the stack to be used during restore.
//
// It returns nil when ctx carries no value under CtxRestoreStack, so callers
// can detect a missing stack instead of hitting the panic that a type
// assertion on a nil interface value would cause. A stored value of any type
// other than *Stack is a programming error and still panics.
func RestoreStackFromContext(ctx context.Context) *Stack {
	if st := ctx.Value(CtxRestoreStack); st != nil {
		return st.(*Stack)
	}
	return nil
}
// TCPProbeFunc is the expected function type for a TCP probe function to be
// passed to stack.AddTCPProbe. The probe receives a pointer to a
// TCPEndpointState, which is a copy of the endpoint's internal state.
type TCPProbeFunc func(s *TCPEndpointState)
// TCPCubicState is used to hold a copy of the internal cubic state when the
// TCPProbeFunc is invoked.
//
// WLastMax through WEst describe the CUBIC window computation (RFC 8312);
// EndSeq through RoundStart describe the tracking of HyStart rounds.
//
// +stateify savable
type TCPCubicState struct {
// WLastMax is the previous WMax value.
WLastMax float64
// WMax is the value of the congestion window at the time of the last
// congestion event.
WMax float64
// T is the time when the current congestion avoidance was entered.
T tcpip.MonotonicTime
// TimeSinceLastCongestion denotes the time since the current
// congestion avoidance was entered.
TimeSinceLastCongestion time.Duration
// C is the cubic constant as specified in RFC 8312, page 11.
C float64
// K is the time period (in seconds) that the CUBIC window function (Eq. 1,
// see WC) takes to increase the current window size to WMax if there are
// no further congestion events. It is calculated using the following
// equation:
//
// K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5)
K float64
// Beta is the CUBIC multiplication decrease factor. That is, when a
// congestion event is detected, CUBIC reduces its cwnd to
// WC(0)=WMax*beta_cubic.
Beta float64
// WC is the window computed by CUBIC at time TimeSinceLastCongestion. It's
// calculated using the formula below, where t is TimeSinceLastCongestion:
//
// WC(TimeSinceLastCongestion) = C*(t-K)^3 + WMax (Eq. 1)
WC float64
// WEst is the window computed by CUBIC at time
// TimeSinceLastCongestion+RTT, i.e. WC(TimeSinceLastCongestion+RTT).
WEst float64
// EndSeq is the sequence number that, when cumulatively ACK'd, ends the
// HyStart round.
EndSeq seqnum.Value
// CurrRTT is the minimum round-trip time from the current round.
CurrRTT time.Duration
// LastRTT is the minimum round-trip time from the previous round.
LastRTT time.Duration
// SampleCount is the number of samples from the current round.
SampleCount uint
// LastAck is the time we received the most recent ACK (or the start of the
// round, if that is more recent).
LastAck tcpip.MonotonicTime
// RoundStart is the time we started the most recent HyStart round.
RoundStart tcpip.MonotonicTime
}
// TCPRACKState is used to hold a copy of the internal RACK state when the
// TCPProbeFunc is invoked. RACK is the loss detection algorithm specified in
// RFC 8985.
//
// +stateify savable
type TCPRACKState struct {
// XmitTime is the transmission timestamp of the most recent
// acknowledged segment.
XmitTime tcpip.MonotonicTime
// EndSequence is the ending TCP sequence number of the most recent
// acknowledged segment.
EndSequence seqnum.Value
// FACK is the highest selectively or cumulatively acknowledged
// sequence.
FACK seqnum.Value
// RTT is the round trip time of the most recently delivered packet on
// the connection (either cumulatively acknowledged or selectively
// acknowledged) that was not marked invalid as a possible spurious
// retransmission.
RTT time.Duration
// Reord is true iff reordering has been detected on this connection.
Reord bool
// DSACKSeen is true iff the connection has seen a DSACK.
DSACKSeen bool
// ReoWnd is the reordering window time used for recording packet
// transmission times. It is used to defer the moment at which RACK
// marks a packet lost.
ReoWnd time.Duration
// ReoWndIncr is the multiplier applied to adjust the reorder window.
ReoWndIncr uint8
// ReoWndPersist is the number of loss recoveries before resetting the
// reorder window.
ReoWndPersist int8
// RTTSeq is the value of SND.NXT at the time RTT is updated.
RTTSeq seqnum.Value
}
// TCPEndpointID is the unique 4-tuple (local port, local address, remote
// port, remote address) that identifies a given endpoint.
//
// +stateify savable
type TCPEndpointID struct {
// LocalPort is the local port associated with the endpoint.
LocalPort uint16
// LocalAddress is the local [network layer] address associated with
// the endpoint.
LocalAddress tcpip.Address
// RemotePort is the remote port associated with the endpoint.
RemotePort uint16
// RemoteAddress is the remote [network layer] address associated with
// the endpoint.
RemoteAddress tcpip.Address
}
// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
// TCP endpoint.
//
// +stateify savable
type TCPFastRecoveryState struct {
// Active, if true, indicates that the endpoint is in fast recovery. The
// following fields are only meaningful when Active is true.
Active bool
// First is the first unacknowledged sequence number being recovered.
First seqnum.Value
// Last is the 'recover' sequence number that indicates the point at
// which we should exit recovery, barring any timeouts etc.
Last seqnum.Value
// MaxCwnd is the maximum value to which we are permitted to grow the
// congestion window during recovery. This is set at the time we enter
// recovery. It exists to avoid attacks where the receiver intentionally
// sends duplicate acks to artificially inflate the sender's cwnd.
MaxCwnd int
// HighRxt is the highest sequence number which has been retransmitted
// during the current loss recovery phase. See RFC 6675 Section 2 for
// details.
HighRxt seqnum.Value
// RescueRxt is the highest sequence number which has been
// optimistically retransmitted to prevent stalling of the ACK clock
// when there is loss at the end of the window and no new data is
// available for transmission. See RFC 6675 Section 2 for details.
RescueRxt seqnum.Value
}
// TCPReceiverState holds a copy of the internal state of the receiver for a
// given TCP endpoint.
//
// +stateify savable
type TCPReceiverState struct {
// RcvNxt is the TCP variable RCV.NXT.
RcvNxt seqnum.Value
// RcvAcc is one beyond the last acceptable sequence number. That is,
// the "largest" sequence value that the receiver has announced to its
// peer that it's willing to accept. This may be different from RcvNxt
// + (last advertised receive window) if the receive window is reduced;
// in that case we have to reduce the window as we receive more data
// instead of shrinking it.
RcvAcc seqnum.Value
// RcvWndScale is the window scaling to use for inbound segments.
RcvWndScale uint8
// PendingBufUsed is the number of bytes pending in the receive queue.
PendingBufUsed int
}
// TCPRTTState holds a copy of information about the endpoint's round trip
// time.
//
// +stateify savable
type TCPRTTState struct {
// SRTT is the smoothed round trip time defined in section 2 of RFC
// 6298.
SRTT time.Duration
// RTTVar is the round-trip time variation as defined in section 2 of
// RFC 6298.
RTTVar time.Duration
// SRTTInited, if true, indicates that a valid RTT measurement has been
// completed.
SRTTInited bool
}
// TCPSenderState holds a copy of the internal state of the sender for a given
// TCP endpoint.
//
// +stateify savable
type TCPSenderState struct {
// LastSendTime is the timestamp at which we sent the last segment.
LastSendTime tcpip.MonotonicTime
// DupAckCount is the number of duplicate ACKs received. It is used for
// fast retransmit.
DupAckCount int
// SndCwnd is the size of the sending congestion window in packets.
SndCwnd int
// Ssthresh is the threshold between slow start and congestion
// avoidance.
Ssthresh int
// SndCAAckCount is the number of packets acknowledged during
// congestion avoidance. When enough packets have been ack'd (typically
// cwnd packets), the congestion window is incremented by one.
SndCAAckCount int
// Outstanding is the number of packets that have been sent but not yet
// acknowledged.
Outstanding int
// SackedOut is the number of packets which have been selectively
// acked.
SackedOut int
// SndWnd is the send window size in bytes.
SndWnd seqnum.Size
// SndUna is the next unacknowledged sequence number.
SndUna seqnum.Value
// SndNxt is the sequence number of the next segment to be sent.
SndNxt seqnum.Value
// RTTMeasureSeqNum is the sequence number being used for the latest
// RTT measurement.
RTTMeasureSeqNum seqnum.Value
// RTTMeasureTime is the time when the segment with sequence number
// RTTMeasureSeqNum was sent.
RTTMeasureTime tcpip.MonotonicTime
// Closed indicates that the caller has closed the endpoint for
// sending.
Closed bool
// RTO is the retransmit timeout as defined in section 2 of RFC 6298.
RTO time.Duration
// RTTState holds information about the endpoint's round trip time.
RTTState TCPRTTState
// MaxPayloadSize is the maximum size of the payload of a given
// segment. It is initialized on demand.
MaxPayloadSize int
// SndWndScale is the number of bits to shift left when reading the
// send window size from a segment.
SndWndScale uint8
// MaxSentAck is the highest acknowledgement number sent so far.
MaxSentAck seqnum.Value
// FastRecovery holds the fast recovery state for the endpoint.
FastRecovery TCPFastRecoveryState
// Cubic holds the state related to CUBIC congestion control.
Cubic TCPCubicState
// RACKState holds the state related to the RACK loss detection
// algorithm.
RACKState TCPRACKState
// RetransmitTS records the timestamp used to detect spurious recovery.
RetransmitTS uint32
// SpuriousRecovery indicates if the sender entered recovery spuriously.
SpuriousRecovery bool
}
// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
//
// +stateify savable
type TCPSACKInfo struct {
// Blocks is the list of SACK blocks that identify the out-of-order
// segments held by a given TCP endpoint.
Blocks []header.SACKBlock
// ReceivedBlocks are the SACK blocks received by this endpoint from
// the peer endpoint.
ReceivedBlocks []header.SACKBlock
// MaxSACKED is the highest sequence number that has been SACKED by the
// peer.
MaxSACKED seqnum.Value
}
// RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
//
// +stateify savable
type RcvBufAutoTuneParams struct {
// MeasureTime is the time at which the current measurement was
// started.
MeasureTime tcpip.MonotonicTime
// CopiedBytes is the number of bytes copied to user space since this
// measurement began.
CopiedBytes int
// PrevCopiedBytes is the number of bytes copied to user space in the
// previous RTT period.
PrevCopiedBytes int
// RcvBufSize is the auto-tuned receive buffer size.
RcvBufSize int
// RTT is the smoothed RTT as measured by observing the time between
// when a byte is first acknowledged and the receipt of data that is at
// least one window beyond the sequence number that was acknowledged.
RTT time.Duration
// RTTVar is the "round-trip time variation" as defined in section 2 of
// RFC 6298.
RTTVar time.Duration
// RTTMeasureSeqNumber is the highest acceptable sequence number at the
// time this RTT measurement period began.
RTTMeasureSeqNumber seqnum.Value
// RTTMeasureTime is the absolute time at which the current RTT
// measurement period began.
RTTMeasureTime tcpip.MonotonicTime
// Disabled is true if an explicit receive buffer is set for the
// endpoint.
Disabled bool
}
// TCPRcvBufState contains information about the state of an endpoint's receive
// socket buffer.
//
// +stateify savable
type TCPRcvBufState struct {
// RcvBufUsed is the number of bytes actually held in the receive
// socket buffer for the endpoint.
RcvBufUsed int
// RcvAutoParams is used to hold state variables to compute the
// auto-tuned receive buffer size.
RcvAutoParams RcvBufAutoTuneParams
// RcvClosed, if true, indicates the endpoint has been closed for
// reading.
RcvClosed bool
}
// TCPSndBufState contains information about the state of an endpoint's send
// socket buffer.
//
// +stateify savable
type TCPSndBufState struct {
// SndBufSize is the size of the socket send buffer.
SndBufSize int
// SndBufUsed is the number of bytes held in the socket send buffer.
SndBufUsed int
// SndClosed indicates that the endpoint has been closed for sends.
SndClosed bool
// PacketTooBigCount is used to notify the main protocol routine how
// many times a "packet too big" control packet is received.
PacketTooBigCount int
// SndMTU is the smallest MTU seen in the control packets received.
SndMTU int
// AutoTuneSndBufDisabled indicates that the auto tuning of the send
// buffer is disabled. It is held in an atomicbitops.Uint32 rather than a
// bool.
// NOTE(review): presumably a non-zero value means disabled; confirm
// against the code that sets this field.
AutoTuneSndBufDisabled atomicbitops.Uint32
}
// TCPEndpointStateInner contains the members of TCPEndpointState used directly
// (that is, not within another containing struct) within the endpoint's
// internal implementation.
//
// +stateify savable
type TCPEndpointStateInner struct {
// TSOffset is a randomized offset added to the value of the TSVal
// field in the timestamp option.
TSOffset tcp.TSOffset
// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
// option in the SYN/SYN-ACK.
SACKPermitted bool
// SendTSOk is used to indicate when the TS Option has been negotiated.
// When SendTSOk is true every non-RST segment should carry a TS as per
// RFC 7323 section 1.1.
SendTSOk bool
// RecentTS is the timestamp that should be sent in the TSEcr field of
// the timestamp for future segments sent by the endpoint. This field
// is updated if required when a new segment is received by this
// endpoint.
RecentTS uint32
}
// TCPEndpointState is a copy of the internal state of a TCP endpoint. It is
// the value handed to a TCPProbeFunc.
//
// +stateify savable
type TCPEndpointState struct {
// TCPEndpointStateInner contains the members of TCPEndpointState used
// by the endpoint's internal implementation. It is embedded, so its
// fields are accessible directly on TCPEndpointState.
TCPEndpointStateInner
// ID is a copy of the TransportEndpointID for the endpoint.
ID TCPEndpointID
// SegTime denotes the absolute time when this segment was received.
SegTime tcpip.MonotonicTime
// RcvBufState contains information about the state of the endpoint's
// receive socket buffer.
RcvBufState TCPRcvBufState
// SndBufState contains information about the state of the endpoint's
// send socket buffer.
SndBufState TCPSndBufState
// SACK holds TCP SACK related information for this endpoint.
SACK TCPSACKInfo
// Receiver holds variables related to the TCP receiver for the
// endpoint.
Receiver TCPReceiverState
// Sender holds state related to the TCP sender for the endpoint.
Sender TCPSenderState
}
|