File: snd.go

package info (click to toggle)
golang-gvisor-gvisor 0.0~20221219.0-2
links: PTS, VCS
area: main
in suites: bookworm, bookworm-proposed-updates
size: 17,136 kB
sloc: asm: 2,860; cpp: 348; python: 89; sh: 40; makefile: 34; ansic: 21
file content (1718 lines) | stat: -rw-r--r-- 57,857 bytes
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"fmt"
	"math"
	"sort"
	"time"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
	// MinRTO is the minimum allowed value for the retransmit timeout.
	MinRTO = 200 * time.Millisecond

	// MaxRTO is the maximum allowed value for the retransmit timeout.
	MaxRTO = 120 * time.Second

	// InitialCwnd is the initial congestion window. The sender tracks the
	// congestion window in packets, not bytes.
	InitialCwnd = 10

	// nDupAckThreshold is the number of duplicate ACKs required
	// before fast-retransmit is entered.
	nDupAckThreshold = 3

	// MaxRetries is the maximum number of probe retries the sender does
	// before timing out the connection.
	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
	MaxRetries = 15
)

// congestionControl is an interface that must be implemented by any supported
// congestion control algorithm.
type congestionControl interface {
	// HandleLossDetected is invoked when loss is detected by RACK or when
	// sender.dupAckCount >= nDupAckThreshold, just before entering fast
	// retransmit.
	HandleLossDetected()

	// HandleRTOExpired is invoked when the retransmit timer expires.
	HandleRTOExpired()

	// Update is invoked when processing inbound ACKs. It is passed the
	// number of packets that were acked by the most recent cumulative
	// acknowledgement.
	Update(packetsAcked int)

	// PostRecovery is invoked when the sender is exiting a fast retransmit/
	// recovery phase. This provides congestion control algorithms a way
	// to adjust their state when exiting recovery.
	PostRecovery()
}

// lossRecovery is an interface that must be implemented by any supported
// loss recovery algorithm.
type lossRecovery interface {
	// DoRecovery is invoked when loss is detected and segments need
	// to be retransmitted.
	//
	// rcvdSeg is the segment carrying the cumulative or selective ACK that
	// is being processed. fastRetransmit reports whether the connection
	// entered fast retransmit with this ACK, in which case the first
	// unacknowledged segment is to be retransmitted.
	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
}

// sender holds the state necessary to send TCP segments.
//
// +stateify savable
type sender struct {
	stack.TCPSenderState

	// ep is the endpoint this sender was created for.
	ep *endpoint

	// lr is the loss recovery algorithm used by the sender.
	lr lossRecovery

	// firstRetransmittedSegXmitTime is the original transmit time of
	// the first segment that was retransmitted due to RTO expiration.
	firstRetransmittedSegXmitTime tcpip.MonotonicTime

	// zeroWindowProbing is set if the sender is currently probing
	// for zero receive window.
	zeroWindowProbing bool `state:"nosave"`

	// unackZeroWindowProbes is the number of unacknowledged zero
	// window probes.
	unackZeroWindowProbes uint32 `state:"nosave"`

	// writeList holds the segments queued on this sender; its front is the
	// first unacknowledged segment. writeNext is the segment in writeList
	// from which sendData continues sending. resendTimer is the retransmit
	// timer: it invokes retransmitTimerExpired and is also used to rearm
	// zero window probes.
	writeNext   *segment
	writeList   segmentList
	resendTimer timer `state:"nosave"`

	// rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed
	// round-trip time", and "round-trip time variation", as defined in
	// section 2 of RFC 6298.
	rtt rtt

	// minRTO is the minimum permitted value for sender.rto.
	minRTO time.Duration

	// maxRTO is the maximum permitted value for sender.rto.
	maxRTO time.Duration

	// maxRetries is the maximum permitted retransmissions.
	maxRetries uint32

	// gso is set if generic segmentation offload is enabled.
	gso bool

	// state is the current state of congestion control for this endpoint.
	state tcpip.CongestionControlState

	// cc is the congestion control algorithm in use for this sender.
	cc congestionControl

	// rc has the fields needed for implementing RACK loss detection
	// algorithm.
	rc rackControl

	// reorderTimer is the timer used to retransmit the segments after RACK
	// detects them as lost.
	reorderTimer timer `state:"nosave"`

	// probeTimer is used to schedule PTO for RACK TLP algorithm.
	probeTimer timer `state:"nosave"`

	// spuriousRecovery indicates whether the sender entered recovery
	// spuriously as described in RFC3522 Section 3.2.
	spuriousRecovery bool

	// retransmitTS is the timestamp at which the sender sends retransmitted
	// segment after entering an RTO for the first time as described in
	// RFC3522 Section 3.2.
	retransmitTS uint32
}

// rtt is a synchronization wrapper used to appease stateify. See the comment
// in sender, where it is used.
//
// The embedded mutex is taken by updateRTO while it reads and updates the
// embedded TCPRTTState; the mutex itself is excluded from save/restore.
//
// +stateify savable
type rtt struct {
	sync.Mutex `state:"nosave"`

	stack.TCPRTTState
}

// newSender builds the sender for ep.
//
// iss and irs are the initial send and receive sequence numbers, sndWnd is the
// initial send window, mss is the negotiated maximum segment size and
// sndWndScale is the send window scale (values <= 0 leave scaling at zero).
// It panics if the stack-wide RTO or retry options cannot be read.
//
// +checklocks:ep.mu
func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender {
	// Per https://tools.ietf.org/html/rfc6691#section-2 the sender MUST
	// shrink the TCP data length by the size of the IP/TCP options it puts
	// in the packets it sends.
	payload := int(mss) - ep.maxOptionSize()

	// The first sequence number that carries data.
	firstSeq := iss + 1

	s := &sender{
		ep:  ep,
		gso: ep.gso.Type != stack.GSONone,
		TCPSenderState: stack.TCPSenderState{
			SndWnd:           sndWnd,
			SndUna:           firstSeq,
			SndNxt:           firstSeq,
			RTTMeasureSeqNum: firstSeq,
			LastSendTime:     ep.stack.Clock().NowMonotonic(),
			MaxPayloadSize:   payload,
			MaxSentAck:       irs + 1,
			// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
			FastRecovery: stack.TCPFastRecoveryState{
				Last:      iss,
				HighRxt:   iss,
				RescueRxt: iss,
			},
			RTO: time.Second,
		},
	}
	if s.gso {
		s.ep.gso.MSS = uint16(payload)
	}

	s.cc = s.initCongestionControl(ep.cc)
	s.lr = s.initLossRecovery()
	s.rc.init(s, iss)

	// Only a strictly positive scale is recorded; anything else means the
	// scale stays at its zero value.
	if sndWndScale > 0 {
		s.SndWndScale = uint8(sndWndScale)
	}

	clock := s.ep.stack.Clock
	s.resendTimer.init(clock(), maybeFailTimerHandler(s.ep, s.retransmitTimerExpired))
	s.reorderTimer.init(clock(), timerHandler(s.ep, s.rc.reorderTimerExpired))
	s.probeTimer.init(clock(), timerHandler(s.ep, s.probeTimerExpired))

	s.ep.AssertLockHeld(ep)
	s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
	// The SACK scoreboard uses MaxPayloadSize as its smss when deciding
	// whether a segment is lost, so it can only be created once the max
	// payload size has been updated.
	s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss)

	// Pull in the stack-wide configuration.
	var (
		minRTOOpt     tcpip.TCPMinRTOOption
		maxRTOOpt     tcpip.TCPMaxRTOOption
		maxRetriesOpt tcpip.TCPMaxRetriesOption
	)
	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTOOpt); err != nil {
		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
	}
	s.minRTO = time.Duration(minRTOOpt)

	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTOOpt); err != nil {
		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
	}
	s.maxRTO = time.Duration(maxRTOOpt)

	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetriesOpt); err != nil {
		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
	}
	s.maxRetries = uint32(maxRetriesOpt)

	return s
}

// initCongestionControl resets SndCwnd and Ssthresh to their starting values
// and then returns a new instance of the congestion control algorithm named
// by congestionControlName. Every name other than ccCubic, ccReno included,
// selects Reno.
func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
	// The largest value an int can hold on this platform.
	const maxInt = int(^uint(0) >> 1)

	s.SndCwnd = InitialCwnd
	s.Ssthresh = maxInt

	if congestionControlName == ccCubic {
		return newCubicCC(s)
	}
	return newRenoCC(s)
}

// initLossRecovery picks the loss recovery algorithm for the sender:
// SACK-based recovery when the endpoint has SACK permitted, Reno recovery
// otherwise.
func (s *sender) initLossRecovery() lossRecovery {
	if !s.ep.SACKPermitted {
		return newRenoRecovery(s)
	}
	return newSACKRecovery(s)
}

// updateMaxPayloadSize lowers the maximum payload size to fit the given MTU;
// the payload size is never raised here. A non-zero count means the call is a
// reaction to "packet too big" control packets: the number of outstanding
// packets is reduced by count, writeNext is rewound to the first already
// queued segment that no longer fits, and transmission is retried.
// +checklocks:s.ep.mu
func (s *sender) updateMaxPayloadSize(mtu, count int) {
	newMSS := mtu - header.TCPMinimumSize - s.ep.maxOptionSize()

	// Growing the payload size is not supported for now.
	if newMSS >= s.MaxPayloadSize {
		return
	}

	// Always leave room for at least one byte of payload.
	if newMSS <= 0 {
		newMSS = 1
	}

	prevMSS := s.MaxPayloadSize
	s.MaxPayloadSize = newMSS
	if s.gso {
		s.ep.gso.MSS = uint16(newMSS)
	}

	// A zero count is what newSender passes; nothing has been queued for
	// sending yet in that case, so there is nothing left to adjust.
	if count == 0 {
		return
	}

	// Keep the scoreboard's smss in step with the lowered payload size.
	s.ep.scoreboard.smss = uint16(newMSS)

	if s.Outstanding -= count; s.Outstanding < 0 {
		s.Outstanding = 0
	}

	// Walk the segments ahead of writeNext. The first one larger than the
	// new payload size becomes the new writeNext; if none is found
	// writeNext stays where it is.
	rewindTo := s.writeNext
	for cur := s.writeList.Front(); cur != nil && cur != s.writeNext; cur = cur.Next() {
		if rewindTo == s.writeNext && cur.payloadSize() > newMSS {
			rewindTo = cur
		}

		if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(cur.sackBlock()) {
			// Re-count this SACKed segment with the new payload size.
			s.SackedOut += s.pCount(cur, s.MaxPayloadSize) - s.pCount(cur, prevMSS)
		}
	}

	// Fewer packets are likely outstanding now, so more data may be
	// sendable.
	s.updateWriteNext(rewindTo)
	s.sendData()
}

// sendAck emits an empty segment carrying only the ACK flag, sequenced at
// SndNxt.
// +checklocks:s.ep.mu
func (s *sender) sendAck() {
	seq := s.SndNxt
	s.sendEmptySegment(header.TCPFlagAck, seq)
}

// updateRTO folds a new round-trip time sample into SRTT/RTTVar and
// recomputes the retransmit timeout from them, following section 2 of
// RFC 6298. The resulting RTO is clamped to [minRTO, maxRTO].
//
// When timestamps are in use and nothing is outstanding the sample is
// discarded and the RTO is left untouched.
func (s *sender) updateRTO(rtt time.Duration) {
	s.rtt.Lock()
	st := &s.rtt.TCPRTTState

	if !st.SRTTInited {
		// First sample: seed the estimator.
		st.RTTVar = rtt / 2
		st.SRTT = rtt
		st.SRTTInited = true
	} else {
		// Absolute distance between the smoothed RTT and the sample.
		delta := st.SRTT - rtt
		if delta < 0 {
			delta = -delta
		}

		switch {
		case !s.ep.SendTSOk:
			// No timestamps: plain RFC6298 update of RTTVar and SRTT.
			st.RTTVar = (3*st.RTTVar + delta) / 4
			st.SRTT = (7*st.SRTT + rtt) / 8

		case s.Outstanding == 0:
			s.rtt.Unlock()
			return

		default:
			// With timestamps an RTT sample is taken on every ACK, so
			// the gains are scaled down as described in
			// https://tools.ietf.org/html/rfc7323#appendix-G
			//
			// Netstack, like linux, counts the congestion window and
			// in-flight data in packets rather than bytes. In practice
			// this approximation works as expected.
			samples := math.Ceil(float64(s.Outstanding) / 2)

			// Original gains recommended in
			// https://tools.ietf.org/html/rfc6298#section-2.3.
			const (
				alpha = 0.125
				beta  = 0.25
			)
			scaledAlpha := alpha / samples
			scaledBeta := beta / samples

			newVar := (1-scaledBeta)*st.RTTVar.Seconds() + scaledBeta*delta.Seconds()
			newSRTT := (1-scaledAlpha)*st.SRTT.Seconds() + scaledAlpha*rtt.Seconds()
			st.RTTVar = time.Duration(newVar * float64(time.Second))
			st.SRTT = time.Duration(newSRTT * float64(time.Second))
		}
	}

	s.RTO = st.SRTT + 4*st.RTTVar
	s.rtt.Unlock()

	// Clamp: lower bound first, then upper bound.
	if s.RTO < s.minRTO {
		s.RTO = s.minRTO
	}
	if s.RTO > s.maxRTO {
		s.RTO = s.maxRTO
	}
}

// resendSegment retransmits the segment at the front of the write list, i.e.
// the first unacknowledged one, splitting it first if it is larger than the
// current maximum payload size. It is a no-op (apart from resetting the RTT
// measurement point) when the write list is empty.
// +checklocks:s.ep.mu
func (s *sender) resendSegment() {
	// Segments sent so far may have been affected by loss, so none of them
	// is used for RTT measurement from here on.
	s.RTTMeasureSeqNum = s.SndNxt

	first := s.writeList.Front()
	if first == nil {
		return
	}

	if mss := s.MaxPayloadSize; first.payloadSize() > mss {
		s.splitSeg(first, mss)
	}

	// See: RFC 6675 section 5 Step 4.3
	//
	// HighRXT and RescueRXT are both moved to the highest sequence number
	// of the retransmitted segment so that it is not retransmitted again.
	lastByte := first.sequenceNumber.Add(seqnum.Size(first.payloadSize())) - 1
	s.FastRecovery.HighRxt = lastByte
	s.FastRecovery.RescueRxt = lastByte

	s.sendSegment(first)
	s.ep.stack.Stats().TCP.FastRetransmit.Increment()
	s.ep.stats.SendErrors.FastRetransmit.Increment()

	// RFC 6675 section 5 Step 4.4: run SetPipe().
	s.SetPipe()
}

// retransmitTimerExpired is called when the retransmit timer expires, and
// unacknowledged segments are assumed lost, and thus need to be resent.
//
// It returns nil if the connection is still usable (including the cases where
// the wake-up was spurious or there was nothing to retransmit), or
// *tcpip.ErrTimeout if the connection is deemed lost: the user timeout has
// run out, too many zero window probes are unacknowledged, or the segment to
// retransmit has already been transmitted more than maxRetries times.
// +checklocks:s.ep.mu
func (s *sender) retransmitTimerExpired() tcpip.Error {
	// Check if the timer actually expired or if it's a spurious wake due
	// to a previously orphaned runtime timer.
	if s.resendTimer.isZero() || !s.resendTimer.checkExpiration() {
		return nil
	}

	// Initialize the variables used to detect spurious recovery after
	// entering RTO.
	//
	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
	s.spuriousRecovery = false
	s.retransmitTS = 0

	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
	// when writeList is empty. Remove this once we have a proper fix for this
	// issue.
	if s.writeList.Front() == nil {
		return nil
	}

	s.ep.stack.Stats().TCP.Timeouts.Increment()
	s.ep.stats.SendErrors.Timeouts.Increment()

	// Set TLPRxtOut to false according to
	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
	s.rc.tlpRxtOut = false

	// Work out how much longer we may keep retransmitting. Without a user
	// timeout the budget is maxRTO; with one, it is whatever is left of the
	// user timeout, measured from the original transmit time of the first
	// retransmitted segment.
	uto := s.ep.userTimeout

	if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) {
		// We store the original xmitTime of the segment that we are
		// about to retransmit as the retransmission time. This is
		// required as by the time the retransmitTimer has expired the
		// segment has already been sent and unacked for the RTO at the
		// time the segment was sent.
		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
	}

	elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime)
	remaining := s.maxRTO
	if uto != 0 {
		// Cap to the user specified timeout if one is specified.
		remaining = uto - elapsed
	}

	// Always honor the user-timeout irrespective of whether the zero
	// window probes were acknowledged.
	// net/ipv4/tcp_timer.c::tcp_probe_timer()
	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
		return &tcpip.ErrTimeout{}
	}

	// Set new timeout. The timer will be restarted by the call to sendData
	// below.
	s.RTO *= 2
	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
	if s.RTO > s.maxRTO {
		s.RTO = s.maxRTO
	}

	// Cap RTO to remaining time.
	if s.RTO > remaining {
		s.RTO = remaining
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
	//
	// Retransmit timeouts:
	//     After a retransmit timeout, record the highest sequence number
	//     transmitted in the variable recover, and exit the fast recovery
	//     procedure if applicable.
	s.FastRecovery.Last = s.SndNxt - 1

	if s.FastRecovery.Active {
		// We were attempting fast recovery but were not successful.
		// Leave the state. We don't need to update ssthresh because it
		// has already been updated when entered fast-recovery.
		s.leaveRecovery()
	}

	// Record retransmitTS if the sender is not in recovery as per:
	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	s.recordRetransmitTS()

	s.state = tcpip.RTORecovery
	s.cc.HandleRTOExpired()

	// Mark the next segment to be sent as the first unacknowledged one and
	// start sending again. Set the number of outstanding packets to 0 so
	// that we'll be able to retransmit.
	//
	// We'll keep on transmitting (or retransmitting) as we get acks for
	// the data we transmit.
	s.Outstanding = 0

	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
	//
	//  In order to avoid memory deadlocks, the TCP receiver is allowed to
	//  discard data that has already been selectively acknowledged. As a
	//  result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
	//  information gathered from a receiver upon a retransmission timeout
	//  (RTO) "since the timeout might indicate that the data receiver has
	//  reneged." Additionally, a TCP sender MUST "ignore prior SACK
	//  information in determining which data to retransmit."
	//
	// NOTE: We take the stricter interpretation and just expunge all
	// information as we lack more rigorous checks to validate if the SACK
	// information is usable after an RTO.
	s.ep.scoreboard.Reset()
	s.updateWriteNext(s.writeList.Front())

	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
	// zero receive window after retransmission interval and we have data to
	// send.
	if s.zeroWindowProbing {
		s.sendZeroWindowProbe()
		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
		// indefinitely.  As long as the receiving TCP continues to send
		// acknowledgments in response to the probe segments, the sending TCP
		// MUST allow the connection to stay open.
		return nil
	}

	seg := s.writeNext
	// RFC 1122 4.2.3.5: Close the connection when the number of
	// retransmissions for this segment is beyond a limit.
	if seg != nil && seg.xmitCount > s.maxRetries {
		s.ep.stack.Stats().TCP.EstablishedTimedout.Increment()
		return &tcpip.ErrTimeout{}
	}

	s.sendData()

	return nil
}

// pCount reports how many packets of at most maxPayloadSize bytes the segment
// occupies. With GSO a single segment may span several packets. A segment
// without payload counts as one packet.
func (s *sender) pCount(seg *segment, maxPayloadSize int) int {
	if n := seg.payloadSize(); n != 0 {
		return (n-1)/maxPayloadSize + 1
	}
	return 1
}

// splitSeg splits a given segment at the size specified and inserts the
// remainder as a new segment after the current one in the write list.
//
// If seg's payload is not larger than size, splitSeg does nothing. Otherwise
// seg is capped to its first size bytes, and a clone of seg whose payload and
// sequence number are both advanced by size is inserted right after it.
func (s *sender) splitSeg(seg *segment, size int) {
	if seg.payloadSize() <= size {
		return
	}
	// Split this segment up. nSeg is cloned before seg's flags are
	// modified below.
	nSeg := seg.clone()
	nSeg.pkt.Data().TrimFront(size)
	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
	s.writeList.InsertAfter(seg, nSeg)

	// The segment being split does not carry PUSH flag because it is
	// followed by the newly split segment.
	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
	// segment (i.e., when there is no more queued data to be sent).
	// Linux removes PSH flag only when the segment is being split over MSS
	// and retains it when we are splitting the segment over lack of sender
	// window space.
	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
	if seg.payloadSize() > s.MaxPayloadSize {
		// Clear the bit rather than toggle it: a segment that already
		// lost PSH in an earlier split must not get it back when it is
		// split again (for example after the maximum payload size has
		// been lowered).
		seg.flags &^= header.TCPFlagPsh
	}
	seg.pkt.Data().CapLength(size)
}

// NextSeg implements the RFC6675 NextSeg() operation.
//
// NextSeg starts scanning the writeList starting from nextSegHint and returns
// the hint to be passed on the next call to NextSeg. This is required to avoid
// iterating the write list repeatedly when NextSeg is invoked in a loop during
// recovery. The returned hint will be nil if there are no more segments that
// can match rules defined by NextSeg operation in RFC6675.
//
// nextSeg is the segment selected by the first matching rule, tried in the
// order (1), (2), (3), (4); it is nil when no rule matches.
//
// rescueRtx will be true only if nextSeg is a rescue retransmission as
// described by Step 4) of the NextSeg algorithm. Note that the final return
// also reports rescueRtx as true when no segment was found, in which case
// nextSeg is nil.
//
// As a side effect, already transmitted segments larger than the scoreboard's
// SMSS are split while they are scanned.
func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
	// s3 and s4 remember the candidates for rules (3) and (4) while the
	// scan keeps looking for a rule (1) match.
	var s3 *segment
	var s4 *segment
	// Step 1.
	for seg := nextSegHint; seg != nil; seg = seg.Next() {
		// Stop iteration if we hit a segment that has never been
		// transmitted (i.e. either it has no assigned sequence number
		// or if it does have one, it's >= the next sequence number
		// to be sent [i.e. >= s.sndNxt]).
		if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) {
			hint = nil
			break
		}
		segSeq := seg.sequenceNumber
		if smss := s.ep.scoreboard.SMSS(); seg.payloadSize() > int(smss) {
			s.splitSeg(seg, int(smss))
		}

		// See RFC 6675 Section 4
		//
		//     1. If there exists a smallest unSACKED sequence number
		//     'S2' that meets the following 3 criteria for determining
		//     loss, the sequence range of one segment of up to SMSS
		//     octets starting with S2 MUST be returned.
		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) {
			// NextSeg():
			//
			//    (1.a) S2 is greater than HighRxt
			//    (1.b) S2 is less than highest octet covered by
			//    any received SACK.
			if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
				// NextSeg():
				//     (1.c) IsLost(S2) returns true.
				if s.ep.scoreboard.IsLost(segSeq) {
					return seg, seg.Next(), false
				}

				// NextSeg():
				//
				// (3): If the conditions for rules (1) and (2)
				// fail, but there exists an unSACKed sequence
				// number S3 that meets the criteria for
				// detecting loss given in steps 1.a and 1.b
				// above (specifically excluding (1.c)) then one
				// segment of up to SMSS octets starting with S3
				// SHOULD be returned.
				if s3 == nil {
					s3 = seg
					hint = seg.Next()
				}
			}
			// NextSeg():
			//
			//     (4) If the conditions for (1), (2) and (3) fail,
			//     but there exists outstanding unSACKED data, we
			//     provide the opportunity for a single "rescue"
			//     retransmission per entry into loss recovery. If
			//     HighACK is greater than RescueRxt (or RescueRxt
			//     is undefined), then one segment of up to SMSS
			//     octets that MUST include the highest outstanding
			//     unSACKed sequence number SHOULD be returned, and
			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
			//     be updated.
			if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) {
				// Keep the unSACKed segment with the highest
				// sequence number seen so far.
				if s4 != nil {
					if s4.sequenceNumber.LessThan(segSeq) {
						s4 = seg
					}
				} else {
					s4 = seg
				}
			}
		}
	}

	// If we got here then no segment matched step (1).
	// Step (2): "If no sequence number 'S2' per rule (1)
	// exists but there exists available unsent data and the
	// receiver's advertised window allows, the sequence
	// range of one segment of up to SMSS octets of
	// previously unsent data starting with sequence number
	// HighData+1 MUST be returned."
	for seg := s.writeNext; seg != nil; seg = seg.Next() {
		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) {
			continue
		}
		// We do not split the segment here to <= smss as it has
		// potentially not been assigned a sequence number yet.
		return seg, nil, false
	}

	// Rule (3) candidate, if any.
	if s3 != nil {
		return s3, hint, false
	}

	// Rule (4) candidate; s4 may be nil if nothing qualified.
	return s4, nil, true
}

// maybeSendSegment tries to send the specified segment and either coalesces
// other segments into this one or splits the specified segment based on the
// lower of the specified limit value or the receivers window size specified by
// end.
//
// limit is the maximum number of payload bytes to put in the segment and end
// is the sequence number at which the send window ends. sent is false when
// the segment was held back (Nagle, TCP_CORK, or not enough window) and true
// once the segment has been handed to sendSegment.
//
// As side effects it may assign seg its sequence number and flags, move the
// endpoint to StateLastAck/StateFinWait1 when a FIN is sent, and advance
// SndNxt when new data was sent.
// +checklocks:s.ep.mu
func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
	// We abuse the flags field to determine if we have already
	// assigned a sequence number to this segment.
	if !s.isAssignedSequenceNumber(seg) {
		// Merge segments if allowed.
		if seg.payloadSize() != 0 {
			available := int(s.SndNxt.Size(end))
			if available > limit {
				available = limit
			}

			// nextTooBig indicates that the next segment was too
			// large to entirely fit in the current segment. It
			// would be possible to split the next segment and merge
			// the portion that fits, but unexpectedly splitting
			// segments can have user visible side-effects which can
			// break applications. For example, RFC 7766 section 8
			// says that the length and data of a DNS response
			// should be sent in the same TCP segment to avoid
			// triggering bugs in poorly written DNS
			// implementations.
			var nextTooBig bool
			// Each merged nSeg is removed from the write list, so
			// seg.Next() yields a fresh successor on every iteration.
			for nSeg := seg.Next(); nSeg != nil && nSeg.payloadSize() != 0; nSeg = seg.Next() {
				if seg.payloadSize()+nSeg.payloadSize() > available {
					nextTooBig = true
					break
				}
				seg.merge(nSeg)
				s.writeList.Remove(nSeg)
				nSeg.DecRef()
			}
			if !nextTooBig && seg.payloadSize() < available {
				// Segment is not full.
				if s.Outstanding > 0 && s.ep.ops.GetDelayOption() {
					// Nagle's algorithm. From Wikipedia:
					//   Nagle's algorithm works by
					//   combining a number of small
					//   outgoing messages and sending them
					//   all at once. Specifically, as long
					//   as there is a sent packet for which
					//   the sender has received no
					//   acknowledgment, the sender should
					//   keep buffering its output until it
					//   has a full packet's worth of
					//   output, thus allowing output to be
					//   sent all at once.
					return false
				}
				// With TCP_CORK, hold back until minimum of the available
				// send space and MSS.
				// TODO(gvisor.dev/issue/2833): Drain the held segments after a
				// timeout.
				if seg.payloadSize() < s.MaxPayloadSize && s.ep.ops.GetCorkOption() {
					return false
				}
			}
		}

		// Assign flags. We don't do it above so that we can merge
		// additional data if Nagle holds the segment.
		seg.sequenceNumber = s.SndNxt
		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
	}

	// segEnd is the sequence number just past this segment; a FIN takes up
	// one sequence number.
	var segEnd seqnum.Value
	if seg.payloadSize() == 0 {
		if s.writeList.Back() != seg {
			panic("FIN segments must be the final segment in the write list.")
		}
		seg.flags = header.TCPFlagAck | header.TCPFlagFin
		segEnd = seg.sequenceNumber.Add(1)
		// Update the state to reflect that we have now
		// queued a FIN.
		switch s.ep.EndpointState() {
		case StateCloseWait:
			s.ep.setEndpointState(StateLastAck)
		default:
			s.ep.setEndpointState(StateFinWait1)
		}
	} else {
		// We're sending a non-FIN segment.
		if seg.flags&header.TCPFlagFin != 0 {
			panic("Netstack queues FIN segments without data.")
		}

		if !seg.sequenceNumber.LessThan(end) {
			return false
		}

		available := int(seg.sequenceNumber.Size(end))
		if available == 0 {
			return false
		}

		// If the whole segment or at least 1MSS sized segment cannot
		// be accommodated in the receiver advertised window, skip
		// splitting and sending of the segment. ref:
		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
		//
		// Linux checks this for all segment transmits not triggered by
		// a probe timer. On this condition, it defers the segment split
		// and transmit to a short probe timer.
		//
		// ref: include/net/tcp.h::tcp_check_probe_timer()
		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
		//
		// Instead of defining a new transmit timer, we attempt to split
		// the segment right here if there are no pending segments. If
		// there are pending segments, segment transmits are deferred to
		// the retransmit timer handler.
		if s.SndUna != s.SndNxt {
			switch {
			case available >= seg.payloadSize():
				// OK to send, the whole segments fits in the
				// receiver's advertised window.
			case available >= s.MaxPayloadSize:
				// OK to send, at least 1 MSS sized segment fits
				// in the receiver's advertised window.
			default:
				return false
			}
		}

		// The segment size limit is computed as a function of sender
		// congestion window and MSS. When sender congestion window is >
		// 1, this limit can be larger than MSS. Ensure that the
		// currently available send space is not greater than minimum of
		// this limit and MSS.
		if available > limit {
			available = limit
		}

		// If GSO is not in use then cap available to
		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
		// the host GSO logic will cap the segment to the correct size.
		if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize {
			available = s.MaxPayloadSize
		}

		if seg.payloadSize() > available {
			// Trim the segment down to what may be sent now; the
			// remainder stays in the write list right after seg.
			// NOTE(review): available is not validated to be
			// non-negative before this call.
			s.splitSeg(seg, available)
		}

		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize()))
	}

	s.sendSegment(seg)

	// Update sndNxt if we actually sent new data (as opposed to
	// retransmitting some previously sent data).
	if s.SndNxt.LessThan(segEnd) {
		s.SndNxt = segEnd
	}

	return true
}

// sendZeroWindowProbe counts and sends one zero window probe, then rearms the
// retransmit timer with the current RTO so that probing continues.
// +checklocks:s.ep.mu
func (s *sender) sendZeroWindowProbe() {
	s.unackZeroWindowProbes++

	// The probe is sequenced at the last acknowledged byte.
	probeSeq := s.SndUna - 1
	s.sendEmptySegment(header.TCPFlagAck, probeSeq)

	s.resendTimer.enable(s.RTO)
}

// enableZeroWindowProbing switches the sender into zero window probing mode
// and arms the retransmit timer, which drives the probes, with the current
// RTO. If no first-retransmission time has been recorded yet, the current
// time is recorded.
func (s *sender) enableZeroWindowProbing() {
	s.zeroWindowProbing = true

	// Probing shares the retransmit timer and the current retransmission
	// interval, because probing may begin while segments are being
	// retransmitted.
	var unset tcpip.MonotonicTime
	if s.firstRetransmittedSegXmitTime == unset {
		s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic()
	}
	s.resendTimer.enable(s.RTO)
}

// disableZeroWindowProbing leaves zero window probing mode: it clears the
// probe counter and the recorded first-retransmission time and stops the
// retransmit timer.
func (s *sender) disableZeroWindowProbing() {
	var unset tcpip.MonotonicTime

	s.zeroWindowProbing = false
	s.unackZeroWindowProbes = 0
	s.firstRetransmittedSegXmitTime = unset
	s.resendTimer.disable()
}

// postXmit adjusts the keepalive, zero window probe, PTO and retransmit
// timers after a transmission attempt.
//
// dataSent reports whether the attempt actually sent data.
// shouldScheduleProbe allows a PTO to be scheduled; it is only honoured when
// shouldSchedulePTO agrees.
func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) {
	// No keepalives while data is pending.
	if dataSent {
		s.ep.disableKeepaliveTimer()
	}

	// The peer advertised a zero window and there is still data waiting:
	// start probing for its receive window size.
	if s.SndWnd == 0 && s.writeNext != nil {
		s.enableZeroWindowProbing()
	}

	// Nothing pending: only the keepalive timer is needed.
	if s.SndUna == s.SndNxt {
		s.ep.resetKeepaliveTimer(false)
		return
	}

	// Data is pending. Prefer a PTO after sending new data that was not
	// itself a TLP probe.
	if shouldScheduleProbe && s.shouldSchedulePTO() {
		s.schedulePTO()
		return
	}

	if s.resendTimer.enabled() {
		return
	}

	s.probeTimer.disable()
	// Arm the resend timer only if something is actually outstanding.
	if s.Outstanding > 0 {
		s.resendTimer.enable(s.RTO)
	}
}

// sendData transmits as many segments as the congestion and send windows
// allow, starting at writeNext. It is called when data becomes available or
// when the send window opens up.
// +checklocks:s.ep.mu
func (s *sender) sendData() {
	// Per-segment payload ceiling: the max payload size, or the GSO
	// maximum when GSO is enabled.
	limit := s.MaxPayloadSize
	if s.gso {
		limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize)
	}
	end := s.SndUna.Add(s.SndWnd)

	// RFC 5681, page 10: "A TCP SHOULD set cwnd to no more than RW before
	// beginning transmission if the TCP has not sent data in the interval
	// exceeding the retrasmission timeout." So after an idle period longer
	// than the RTO, outside of any recovery, cwnd becomes min(IW, cwnd).
	idle := !s.FastRecovery.Active && s.state != tcpip.RTORecovery &&
		s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO
	if idle && s.SndCwnd > InitialCwnd {
		s.SndCwnd = InitialCwnd
	}

	dataSent := false
	for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() {
		// Never exceed what is left of the congestion window.
		if cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize; cwndLimit < limit {
			limit = cwndLimit
		}

		if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
			// Already SACKed: step writeNext past it so it is not
			// scanned again.
			s.updateWriteNext(seg.Next())
			continue
		}

		if !s.maybeSendSegment(seg, limit, end) {
			break
		}

		dataSent = true
		s.Outstanding += s.pCount(seg, s.MaxPayloadSize)
		s.updateWriteNext(seg.Next())
	}

	s.postXmit(dataSent, true /* shouldScheduleProbe */)
}

// enterRecovery puts the sender into loss recovery. It resets the spurious
// recovery detection state, inflates cwnd from ssthresh, records the recovery
// boundaries in s.FastRecovery and moves s.state to SACKRecovery or
// FastRecovery depending on whether SACK is permitted.
func (s *sender) enterRecovery() {
	// Reset the variables used to detect a spurious recovery.
	//
	// See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1.
	s.retransmitTS = 0
	s.spuriousRecovery = false

	s.FastRecovery.Active = true

	// See: https://tools.ietf.org/html/rfc5681#section-3.2 Step 3.
	// cwnd is ssthresh inflated by 3 to account for the 3 packets which
	// triggered the 3 duplicate ACKs and are no longer in flight.
	inflated := s.Ssthresh + 3
	s.SndCwnd = inflated
	s.SackedOut = 0
	s.DupAckCount = 0

	// Remember the sequence range covered by this recovery episode.
	s.FastRecovery.First = s.SndUna
	s.FastRecovery.Last = s.SndNxt - 1
	s.FastRecovery.MaxCwnd = inflated + s.Outstanding
	s.FastRecovery.HighRxt = s.SndUna
	s.FastRecovery.RescueRxt = s.SndUna

	// Record retransmitTS if the sender is not in recovery as per:
	// https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
	//
	// This has to run before s.state is changed below, because
	// recordRetransmitTS does nothing once the state is a recovery state.
	s.recordRetransmitTS()

	if !s.ep.SACKPermitted {
		s.state = tcpip.FastRecovery
		s.ep.stack.Stats().TCP.FastRecovery.Increment()
		return
	}

	s.state = tcpip.SACKRecovery
	s.ep.stack.Stats().TCP.SACKRecovery.Increment()
	// Set TLPRxtOut to false according to
	// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
	// If it was set, the tail loss probe is what triggered this recovery.
	if s.rc.tlpRxtOut {
		s.ep.stack.Stats().TCP.TLPRecovery.Increment()
	}
	s.rc.tlpRxtOut = false
}

// leaveRecovery ends the current fast recovery episode: it clears the
// recovery bookkeeping, deflates cwnd to ssthresh and then notifies the
// congestion control algorithm.
func (s *sender) leaveRecovery() {
	s.DupAckCount = 0
	s.FastRecovery.MaxCwnd = 0
	s.FastRecovery.Active = false

	// cwnd had been artificially inflated while duplicate ACKs arrived;
	// bring it back to ssthresh before calling PostRecovery.
	s.SndCwnd = s.Ssthresh
	s.cc.PostRecovery()
}

// isAssignedSequenceNumber reports whether seg has been given a sequence
// number. Flags are only set once a sequence number is assigned, which happens
// right before the segment is sent, so a segment with any flag set has a valid
// sequence number.
func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
	const noFlags = 0
	return seg.flags != noFlags
}

// SetPipe implements the SetPipe() function described in RFC6675 and stores
// the result in s.Outstanding. Netstack maintains the congestion window in
// number of packets and not bytes, so the value computed here is a count of
// outstanding SMSS-sized packets rather than of outstanding bytes.
//
// It is a no-op unless SACK is permitted and recovery is active.
func (s *sender) SetPipe() {
	if !(s.ep.SACKPermitted && s.FastRecovery.Active) {
		return
	}

	smss := seqnum.Size(s.ep.scoreboard.SMSS())
	inFlight := 0
	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
		// Stop at the first segment that carries no payload or that has
		// not been assigned a sequence number yet.
		if seg.payloadSize() == 0 || !s.isAssignedSequenceNumber(seg) {
			break
		}

		// SetPipe():
		//
		// After initializing pipe to zero, the following steps are
		// taken for each octet 'S1' in the sequence space between
		// HighACK and HighData that has not been SACKed.
		//
		// Segments starting at or beyond SndNxt contribute nothing.
		first := seg.sequenceNumber
		if !first.LessThan(s.SndNxt) {
			continue
		}
		// Rule (b) below is decided by the start of the whole segment, so
		// it is evaluated once per segment.
		retransmitted := first.LessThanEq(s.FastRecovery.HighRxt)

		// With GSO each segment can be much larger than SMSS, so the
		// segment is examined in SMSS sized ranges.
		last := first.Add(seqnum.Size(seg.payloadSize()))
		for lo := first; lo.LessThan(last); lo = lo.Add(smss) {
			hi := lo.Add(smss)
			if last.LessThan(hi) {
				hi = last
			}
			chunk := header.SACKBlock{Start: lo, End: hi}
			if s.ep.scoreboard.IsSACKED(chunk) {
				continue
			}

			// (a) If IsLost(S1) returns false, Pipe is incremented by 1.
			//
			// NOTE: the whole range is tested at once rather than every
			// byte, because pipe is kept in packets and not bytes.
			if !s.ep.scoreboard.IsRangeLost(chunk) {
				inFlight++
			}
			// (b) If S1 <= HighRxt, Pipe is incremented by 1.
			if retransmitted {
				inFlight++
			}
		}
	}
	s.Outstanding = inFlight
}

// shouldEnterRecovery reports whether the sender should enter fast recovery,
// based on the duplicate ACK count and the SACK scoreboard.
// See RFC 6675 section 5.
func (s *sender) shouldEnterRecovery() bool {
	// Enough duplicate ACKs always qualify.
	if s.DupAckCount >= nDupAckThreshold {
		return true
	}
	// Otherwise the scoreboard is only consulted when SACK is permitted and
	// RACK loss detection is not enabled.
	if !s.ep.SACKPermitted {
		return false
	}
	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		return false
	}
	return s.ep.scoreboard.IsLost(s.SndUna)
}

// detectLoss is called for an incoming ACK while the sender is not yet in
// fast recovery. It maintains the duplicate ACK counter and decides, following
// the rules in RFC 6582 (NewReno), whether recovery has to start. When it
// does, the congestion control algorithm is notified, enterRecovery is invoked
// and true is returned so that the caller performs a fast retransmit.
func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
	// With RACK loss detection in use, the three duplicate ACK rule is only
	// honored while no reordering has been observed.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4
	rackEnabled := s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0
	if rackEnabled && s.rc.Reord {
		return false
	}

	// Anything other than a duplicate ACK restarts the count.
	if !s.isDupAck(seg) {
		s.DupAckCount = 0
		return false
	}
	s.DupAckCount++

	// Do not enter fast recovery until we reach nDupAckThreshold or the
	// first unacknowledged byte is considered lost as per SACK scoreboard.
	if !s.shouldEnterRecovery() {
		// RFC 6675 Step 3.
		s.FastRecovery.HighRxt = s.SndUna - 1
		// Run SetPipe() to recalculate the outstanding segments.
		s.SetPipe()
		s.state = tcpip.Disorder
		return false
	}

	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
	//
	// Only the check is done here; FastRecovery.Last is advanced to the
	// highest sequence number transmitted so far by enterRecovery.
	//
	// Recovery is entered only when at least one more byte of data beyond
	// FastRecovery.Last (the highest byte that was outstanding when fast
	// retransmit was last entered) is acked.
	if highestAcked := seg.ackNumber - 1; !s.FastRecovery.Last.LessThan(highestAcked) {
		s.DupAckCount = 0
		return false
	}

	s.cc.HandleLossDetected()
	s.enterRecovery()
	return true
}

// isDupAck reports whether seg is a duplicate ACK as defined in
// https://tools.ietf.org/html/rfc5681#section-2.
func (s *sender) isDupAck(seg *segment) bool {
	// A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883]
	// can leverage the SACK information to determine when an incoming ACK is a
	// "duplicate" (e.g., if the ACK contains previously unknown SACK
	// information). With SACK permitted, an ACK without new SACK information
	// is therefore never a duplicate.
	if s.ep.SACKPermitted && !seg.hasNewSACKInfo {
		return false
	}

	switch {
	case s.SndUna == s.SndNxt:
		// (a) The receiver of the ACK must have outstanding data.
		return false
	case seg.logicalLen() != 0:
		// (b) The incoming acknowledgment must carry no data.
		return false
	case seg.flags.Intersects(header.TCPFlagFin | header.TCPFlagSyn):
		// (c) The SYN and FIN bits must both be off.
		return false
	case seg.ackNumber != s.SndUna:
		// (d) The ACK number must equal the greatest acknowledgment
		// received on the given connection (TCP.UNA from RFC793).
		return false
	}

	// (e) The advertised window in the incoming acknowledgment must equal
	// the advertised window in the last incoming acknowledgment.
	return s.SndWnd == seg.window
}

// walkSACK iterates the writeList and updates RACK for each segment which is
// newly selectively acked by rcvdSeg. For every such segment it updates the
// RACK related variables, checks for reordering, marks the segment as acked
// and adds its packet count to s.SackedOut.
//
// It returns true when a DSACK block has been detected in the received ACK.
//
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
// steps 2 and 3.
func (s *sender) walkSACK(rcvdSeg *segment) bool {
	s.rc.setDSACKSeen(false)

	// Look for DSACK block. When present it is always the first block and
	// is excluded from the walk below.
	hasDSACK := false
	idx := 0
	n := len(rcvdSeg.parsedOptions.SACKBlocks)
	if checkDSACK(rcvdSeg) {
		dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0]
		numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize)
		// numDSACK can be zero when DSACK is sent for subsegments.
		if numDSACK < 1 {
			numDSACK = 1
		}
		s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK)
		s.rc.setDSACKSeen(true)
		idx = 1
		n--
		hasDSACK = true
	}

	if n == 0 {
		return hasDSACK
	}

	// Sort a copy of the SACK blocks by ascending start sequence number. The
	// first block on the wire is the most recently reported one and the
	// following blocks can be in arbitrary order, whereas the walk below is
	// a single forward pass over the write list: the blocks must be visited
	// from lowest to highest, otherwise the cursor has already moved past
	// every block that starts below the first one visited and the segments
	// they cover are never examined.
	sackBlocks := make([]header.SACKBlock, n)
	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:])
	sort.Slice(sackBlocks, func(i, j int) bool {
		return sackBlocks[i].Start.LessThan(sackBlocks[j].Start)
	})

	// seg is shared across blocks on purpose: each block resumes where the
	// previous one stopped. Only segments that were transmitted at least
	// once are considered.
	seg := s.writeList.Front()
	for _, sb := range sackBlocks {
		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
			// seg.acked guards against counting a segment twice when the
			// same block is reported again in later ACKs.
			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
				s.rc.update(seg, rcvdSeg)
				s.rc.detectReorder(seg)
				seg.acked = true
				s.SackedOut += s.pCount(seg, s.MaxPayloadSize)
			}
			seg = seg.Next()
		}
	}
	return hasDSACK
}

// checkDSACK reports whether the first SACK block of rcvdSeg is a DSACK
// block.
//
// See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in at most
// one SACK block, which is always the first one. It is detected in two cases:
//   - If the SACK sequence space is less than this cumulative ACK, it is an
//     indication that the segment identified by the SACK block has been
//     received more than once by the receiver.
//   - If the sequence space in the first SACK block is greater than the
//     cumulative ACK, the first SACK block is compared with the second SACK
//     block, if there is one. This comparison can determine if the first SACK
//     block is reporting duplicate data that lies above the cumulative ACK.
func checkDSACK(rcvdSeg *segment) bool {
	blocks := rcvdSeg.parsedOptions.SACKBlocks
	if len(blocks) == 0 {
		return false
	}

	// An invalid (inverted) first block is never a DSACK.
	first := blocks[0]
	if first.End.LessThan(first.Start) {
		return false
	}

	// First case: the block starts below the cumulative ACK.
	if first.Start.LessThan(rcvdSeg.ackNumber) {
		return true
	}

	// Second case: needs a valid second block to compare against.
	if len(blocks) < 2 {
		return false
	}
	second := blocks[1]
	if second.End.LessThan(second.Start) {
		return false
	}

	// The first block is a DSACK block when it is fully covered by the
	// second block.
	return second.Start.LessThanEq(first.Start) && first.End.LessThanEq(second.End)
}

// recordRetransmitTS stores the current timestamp value in s.retransmitTS,
// unless the sender is already in a recovery state.
//
// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2
//
// The Eifel detection algorithm is used only upon initiation of loss
// recovery, i.e., when either the timeout-based retransmit or the fast
// retransmit is sent. It MUST NOT be reinitiated after loss recovery has
// already started. In particular, it must not be reinitiated upon subsequent
// timeouts for the same segment, and not upon retransmitting segments other
// than the oldest outstanding segment, e.g., during selective loss recovery.
func (s *sender) recordRetransmitTS() {
	if !s.inRecovery() {
		// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2
		//
		// Set a "RetransmitTS" variable to the value of the Timestamp
		// Value field of the Timestamps option included in the retransmit
		// sent when loss recovery is initiated. A TCP sender must ensure
		// that RetransmitTS does not get overwritten as loss recovery
		// progresses, e.g., in case of a second timeout and subsequent
		// second retransmit of the same octet.
		s.retransmitTS = s.ep.tsValNow()
	}
}

// detectSpuriousRecovery applies the Eifel detection steps of RFC 3522
// section 3.2 to an acceptable ACK. When the recovery is found to be spurious
// it sets s.spuriousRecovery and increments the matching counters.
//
// hasDSACK reports whether the ACK carried a DSACK option and tsEchoReply is
// the Timestamp Echo Reply value of the ACK.
func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) {
	switch {
	case s.spuriousRecovery:
		// Spurious recovery was already detected.
		return
	case tsEchoReply >= s.retransmitTS:
		// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4
		//
		// Continue only if the value of the Timestamp Echo Reply field of
		// the acceptable ACK's Timestamps option is smaller than the value
		// of RetransmitTS.
		return
	case hasDSACK:
		// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
		//
		// If the acceptable ACK carries a DSACK option [RFC2883], then
		// return.
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5
	//
	// If during the lifetime of the TCP connection the TCP sender has
	// previously received an ACK with a DSACK option, or the acceptable ACK
	// does not acknowledge all outstanding data, then proceed to next step,
	// else return.
	neverSawDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value() == 0
	allDataAcked := s.SndUna == s.SndNxt
	if neverSawDSACK && allDataAcked {
		return
	}

	// See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6
	//
	// If the loss recovery has been initiated with a timeout-based
	// retransmit, then set
	//    SpuriousRecovery <- SPUR_TO (equal 1),
	// else set
	//    SpuriousRecovery <- dupacks+1
	// A plain boolean is used here because fast, SACK and RTO recovery are
	// not differentiated.
	s.spuriousRecovery = true
	s.ep.stack.Stats().TCP.SpuriousRecovery.Increment()

	// RFC 3522 detects all kinds of spurious recoveries (fast, SACK and
	// timeout). The RTO specific metric is incremented only for timeout
	// recoveries.
	if s.state == tcpip.RTORecovery {
		s.ep.stack.Stats().TCP.SpuriousRTORecovery.Increment()
	}
}

// inRecovery reports whether the sender is in the RTORecovery, FastRecovery or
// SACKRecovery state.
func (s *sender) inRecovery() bool {
	switch s.state {
	case tcpip.RTORecovery, tcpip.FastRecovery, tcpip.SACKRecovery:
		return true
	default:
		return false
	}
}

// handleRcvdSegment is called when a segment is received; it is responsible for
// updating the send-related state.
//
// In order, it:
//   - samples the RTT and updates the recent timestamp,
//   - feeds new SACK blocks into the scoreboard and, with RACK enabled, walks
//     them to update the RACK state,
//   - leaves fast recovery or runs duplicate ACK based loss detection,
//   - records the advertised window and maintains zero window probing,
//   - when the ACK acknowledges new data, pops the acknowledged segments off
//     the write list and updates the timers, congestion window, send buffer
//     usage and pipe,
//   - runs RACK loss detection and the active recovery procedure, and
//   - finally tries to send more data.
//
// +checklocks:s.ep.mu
// +checklocksalias:s.rc.snd.ep.mu=s.ep.mu
func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
	// Check if we can extract an RTT measurement from this ack. This path is
	// only used when the segment carries no timestamp option: the sample is
	// the time elapsed since RTTMeasureTime, taken once the ACK passes
	// RTTMeasureSeqNum.
	if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
		s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime))
		s.RTTMeasureSeqNum = s.SndNxt
	}

	// Update Timestamp if required. See RFC7323, section-4.3.
	if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS {
		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber)
	}

	// Insert SACKBlock information into our scoreboard.
	hasDSACK := false
	if s.ep.SACKPermitted {
		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
			// Only insert the SACK block if the following holds
			// true:
			//  * SACK block acks data after the ack number in the
			//    current segment.
			//  * SACK block represents a sequence
			//    between sndUna and sndNxt (i.e. data that is
			//    currently unacked and in-flight).
			//  * SACK block that has not been SACKed already.
			//
			// NOTE: This check specifically excludes DSACK blocks
			// which have start/end before sndUna and are used to
			// indicate spurious retransmissions.
			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
				s.ep.scoreboard.Insert(sb)
				rcvdSeg.hasNewSACKInfo = true
			}
		}

		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
		// section-7.2
		//	* Step 2: Update RACK stats.
		//		If the ACK is not ignored as invalid, update the RACK.rtt
		//		to be the RTT sample calculated using this ACK, and
		//		continue.  If this ACK or SACK was for the most recently
		//		sent packet, then record the RACK.xmit_ts timestamp and
		//		RACK.end_seq sequence implied by this ACK.
		//	* Step 3: Detect packet reordering.
		//		If the ACK selectively or cumulatively acknowledges an
		//		unacknowledged and also never retransmitted sequence below
		//		RACK.fack, then the corresponding packet has been
		//		reordered and RACK.reord is set to TRUE.
		if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
			hasDSACK = s.walkSACK(rcvdSeg)
		}
		// Recompute the pipe now that the scoreboard may have changed.
		s.SetPipe()
	}

	ack := rcvdSeg.ackNumber
	fastRetransmit := false
	// Do not leave fast recovery, if the ACK is out of range.
	if s.FastRecovery.Active {
		// Leave fast recovery if it acknowledges all the data covered by
		// this fast recovery session.
		if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) {
			s.leaveRecovery()
		}
	} else {
		// Detect loss by counting the duplicates and enter recovery.
		fastRetransmit = s.detectLoss(rcvdSeg)
	}

	// See if TLP based recovery was successful.
	if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		s.detectTLPRecovery(ack, rcvdSeg)
	}

	// Stash away the current window size. This is done after detectLoss,
	// which compares the previous window against the one in this segment.
	s.SndWnd = rcvdSeg.window

	// Disable zero window probing if remote advertises a non-zero receive
	// window. This can be with an ACK to the zero window probe (where the
	// acknumber refers to the already acknowledged byte) OR to any previously
	// unacknowledged segment.
	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
		(ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) {
		s.disableZeroWindowProbing()
	}

	// On receiving the ACK for the zero window probe, account for it and
	// skip trying to send any segment as we are still probing for
	// receive window to become non-zero.
	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna {
		s.unackZeroWindowProbes--
		return
	}

	// Ignore ack if it doesn't acknowledge any new data.
	if (ack - 1).InRange(s.SndUna, s.SndNxt) {
		s.DupAckCount = 0

		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
		// Specifically we should only update the RTO using TSEcr if the
		// following condition holds:
		//
		//    A TSecr value received in a segment is used to update the
		//    averaged RTT measurement only if the segment acknowledges
		//    some new data, i.e., only if it advances the left edge of
		//    the send window.
		if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
			s.updateRTO(s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr))
		}

		if s.shouldSchedulePTO() {
			// Schedule PTO upon receiving an ACK that cumulatively acknowledges data.
			// See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1.
			s.schedulePTO()
		} else {
			// When an ack is received we must rearm the timer.
			// RFC 6298 5.3
			s.probeTimer.disable()
			s.resendTimer.enable(s.RTO)
		}

		// Remove all acknowledged data from the write list.
		acked := s.SndUna.Size(ack)
		s.SndUna = ack
		ackLeft := acked
		originalOutstanding := s.Outstanding
		for ackLeft > 0 {
			// We use logicalLen here because we can have FIN
			// segments (which are always at the end of list) that
			// have no data, but do consume a sequence number.
			//
			// NOTE(review): Front() is not nil-checked; this assumes the
			// write list always covers the acknowledged range -- confirm.
			seg := s.writeList.Front()
			datalen := seg.logicalLen()

			// The ACK covers only part of this segment: trim the
			// acknowledged prefix, adjust the outstanding packet count by
			// the difference and stop.
			if datalen > ackLeft {
				prevCount := s.pCount(seg, s.MaxPayloadSize)
				seg.TrimFront(ackLeft)
				seg.sequenceNumber.UpdateForward(ackLeft)
				s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize)
				break
			}

			// The segment is fully acknowledged and is about to be
			// removed; do not leave writeNext pointing at it.
			if s.writeNext == seg {
				s.updateWriteNext(seg.Next())
			}

			// Update the RACK fields if SACK is enabled.
			if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
				s.rc.update(seg, rcvdSeg)
				s.rc.detectReorder(seg)
			}

			s.writeList.Remove(seg)

			// If SACK is enabled then only reduce outstanding if
			// the segment was not previously SACKED as these have
			// already been accounted for in SetPipe().
			if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
				s.Outstanding -= s.pCount(seg, s.MaxPayloadSize)
			} else {
				s.SackedOut -= s.pCount(seg, s.MaxPayloadSize)
			}
			seg.DecRef()
			ackLeft -= datalen
		}

		// Clear SACK information for all acked data.
		s.ep.scoreboard.Delete(s.SndUna)

		// Detect if the sender entered recovery spuriously.
		if s.inRecovery() {
			s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr)
		}

		// If we are not in fast recovery then update the congestion
		// window based on the number of acknowledged packets.
		if !s.FastRecovery.Active {
			s.cc.Update(originalOutstanding - s.Outstanding)
			if s.FastRecovery.Last.LessThan(s.SndUna) {
				s.state = tcpip.Open
				// Update RACK when we are exiting fast or RTO
				// recovery as described in the RFC
				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
				if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
					s.rc.exitRecovery()
				}
				s.reorderTimer.disable()
			}
		}

		// Update the send buffer usage and notify potential waiters.
		s.ep.updateSndBufferUsage(int(acked))

		// It is possible for s.Outstanding to drop below zero if we get
		// a retransmit timeout, reset outstanding to zero but later
		// get an ack that covers previously sent data.
		if s.Outstanding < 0 {
			s.Outstanding = 0
		}

		s.SetPipe()

		// If all outstanding data was acknowledged then disable the timers.
		// RFC 6298 Rule 5.3
		if s.SndUna == s.SndNxt {
			s.Outstanding = 0
			// Reset firstRetransmittedSegXmitTime to the zero value.
			s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{}
			s.resendTimer.disable()
			s.probeTimer.disable()
		}
	}

	if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 {
		// Update RACK reorder window.
		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
		//	* Upon receiving an ACK:
		//	* Step 4: Update RACK reordering window
		s.rc.updateRACKReorderWindow()

		// After the reorder window is calculated, detect any loss by checking
		// if the time elapsed after the segments are sent is greater than the
		// reorder window.
		if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active {
			// If any segment is marked as lost by
			// RACK, enter recovery and retransmit
			// the lost segments.
			s.cc.HandleLossDetected()
			s.enterRecovery()
			fastRetransmit = true
		}

		if s.FastRecovery.Active {
			s.rc.DoRecovery(nil, fastRetransmit)
		}
	}

	// Now that we've popped all acknowledged data from the retransmit
	// queue, retransmit if needed. This path is only taken when RACK loss
	// detection is not enabled; the RACK case was handled above.
	if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 {
		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
		// When SACK is enabled data sending is governed by steps in
		// RFC 6675 Section 5 recovery steps  A-C.
		// See: https://tools.ietf.org/html/rfc6675#section-5.
		if s.ep.SACKPermitted {
			return
		}
	}

	// Send more data now that some of the pending data has been ack'd, or
	// that the window opened up, or the congestion window was inflated due
	// to a duplicate ack during fast recovery. This will also re-enable
	// the retransmit timer if needed.
	s.sendData()
}

// sendSegment transmits seg, updating its transmission bookkeeping
// (xmitTime, xmitCount, lost) and the retransmission counters when the segment
// had already been transmitted before. It returns the error reported by the
// underlying send.
// +checklocks:s.ep.mu
func (s *sender) sendSegment(seg *segment) tcpip.Error {
	// Any transmission after the first one is a retransmission.
	if seg.xmitCount > 0 {
		s.ep.stack.Stats().TCP.Retransmits.Increment()
		s.ep.stats.SendErrors.Retransmits.Increment()
		if s.SndCwnd < s.Ssthresh {
			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
		}
	}
	seg.xmitTime = s.ep.stack.Clock().NowMonotonic()
	seg.xmitCount++
	seg.lost = false

	err := s.sendSegmentFromPacketBuffer(seg.pkt, seg.flags, seg.sequenceNumber)

	// RFC 6298 Section 5.1 starts the retransmit timer, if it is not
	// already running, every time a packet containing data is sent
	// (including a retransmission). When SACK is enabled and data is being
	// retransmitted during recovery, the conservative variant described in
	// RFC 6675 Section 6 is used instead and the timer is re-armed
	// unconditionally.
	//
	// NOTE(review): this block is entered only when the send returned an
	// error, which reads oddly next to the rule above -- confirm that
	// `err != nil` (rather than `err == nil`) is intended. postXmit
	// separately arms the resend timer when data is outstanding.
	if err != nil && seg.payloadSize() != 0 {
		conservative := s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted
		if conservative || !s.resendTimer.enabled() {
			s.resendTimer.enable(s.RTO)
		}
	}

	return err
}

// sendSegmentFromPacketBuffer sends a new segment built from the payload in
// pkt with the given flags and sequence number. It records the send time, and
// the RTT measurement start time when seq is the sequence number being
// measured, and remembers the highest ACK sent.
// +checklocks:s.ep.mu
// +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
func (s *sender) sendSegmentFromPacketBuffer(pkt stack.PacketBufferPtr, flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
	now := s.ep.stack.Clock().NowMonotonic()
	s.LastSendTime = now
	if seq == s.RTTMeasureSeqNum {
		s.RTTMeasureTime = now
	}

	// Remember the max sent ack.
	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
	s.MaxSentAck = rcvNxt

	// sendRaw takes ownership of the packet it is given, while pkt may have
	// to be processed again later (i.e. retransmission), so a clone is sent
	// instead.
	clone := pkt.Clone()
	defer clone.DecRef()

	return s.ep.sendRaw(clone, flags, seq, rcvNxt, rcvWnd)
}

// sendEmptySegment sends a new segment without payload, carrying the given
// flags and sequence number. It records the send time, and the RTT measurement
// start time when seq is the sequence number being measured, and remembers the
// highest ACK sent.
// +checklocks:s.ep.mu
// +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu
func (s *sender) sendEmptySegment(flags header.TCPFlags, seq seqnum.Value) tcpip.Error {
	now := s.ep.stack.Clock().NowMonotonic()
	s.LastSendTime = now
	if seq == s.RTTMeasureSeqNum {
		s.RTTMeasureTime = now
	}

	// Remember the max sent ack.
	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
	s.MaxSentAck = rcvNxt

	return s.ep.sendEmptyRaw(flags, seq, rcvNxt, rcvWnd)
}

// maybeSendOutOfWindowAck sends an ACK in response to seg unless out of window
// ACKs are currently being rate limited.
// +checklocks:s.ep.mu
func (s *sender) maybeSendOutOfWindowAck(seg *segment) {
	// Data packets are unlikely to be part of an ACK loop, so a packet with
	// data is always ACKed; the rate limit is only consulted for packets
	// without data.
	if seg.payloadSize() <= 0 && !s.ep.allowOutOfWindowAck() {
		return
	}
	s.sendAck()
}

// updateWriteNext makes seg the next segment to be written. The reference held
// on the previous writeNext, if any, is dropped and a reference is taken on
// seg when it is not nil.
func (s *sender) updateWriteNext(seg *segment) {
	if prev := s.writeNext; prev != nil {
		prev.DecRef()
	}
	if seg != nil {
		seg.IncRef()
	}
	s.writeNext = seg
}