// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64 || arm64
// +build amd64 arm64

// Package xdp provides tools for working with AF_XDP sockets.
//
// AF_XDP shares a memory area (UMEM) with the kernel to pass packets
// back and forth. Communication is done via a number of queues.
// Briefly, the queues work as follows:
//
// - Receive: Userspace adds a descriptor to the fill queue. The
// descriptor points to an area of the UMEM that the kernel should fill
// with an incoming packet. The packet is filled by the kernel, which
// places a descriptor to the same UMEM area in the RX queue, signifying
// that userspace may read the packet.
// - Transmit: Userspace adds a descriptor to TX queue. The kernel
// sends the packet (stored in UMEM) pointed to by the descriptor.
// Upon completion, the kernel places a descriptor in the completion
// queue to notify userspace that the packet is sent and the UMEM
// area can be reused.
//
// So in short: RX packets move from the fill to RX queue, and TX
// packets move from the TX to completion queue.
//
// Note that the shared UMEM for RX and TX means that packet forwarding
// can be done without copying; only the queues need to be updated to point to
// the packet in UMEM.
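//
// To illustrate, a zero-copy forwarding loop conceptually looks like the
// following pseudocode, where cb is the ControlBlock defined below and
// enqueue/dequeue are for exposition only, not methods of this package:
//
//	cb.Fill.enqueue(frameAddr)          // Offer a free UMEM frame to the kernel.
//	desc := cb.RX.dequeue()             // The kernel filled it with a packet.
//	cb.TX.enqueue(desc)                 // Transmit the same UMEM bytes; no copy.
//	frameAddr = cb.Completion.dequeue() // The kernel sent it; the frame is free.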
package xdp

import (
"fmt"
"math/bits"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/memutil"
)

// A ControlBlock contains all the control structures necessary to use an
// AF_XDP socket.
//
// The ControlBlock and the structures it contains are meant to be used with a
// single RX goroutine and a single TX goroutine.
type ControlBlock struct {
UMEM UMEM
Fill FillQueue
RX RXQueue
TX TXQueue
Completion CompletionQueue
}

// Opts configure an AF_XDP socket.
type Opts struct {
	// NFrames is the number of frames in the UMEM area.
	NFrames uint32
	// FrameSize is the size of each frame; it must be 2048 or 4096.
	FrameSize uint32
	// NDescriptors is the per-queue descriptor count; a power of 2.
	NDescriptors uint32
	// Bind, when true, binds the socket during initialization.
	Bind bool
	// UseNeedWakeup sets XDP_USE_NEED_WAKEUP when binding.
	UseNeedWakeup bool
}

// DefaultOpts provides recommended default options for initializing an AF_XDP
// socket. AF_XDP setup is extremely finicky and can fail if incorrect values
// are used.
func DefaultOpts() Opts {
return Opts{
NFrames: 4096,
// Frames must be 2048 or 4096 bytes, although not all drivers support
// both.
FrameSize: 4096,
NDescriptors: 2048,
}
}

// New returns an initialized AF_XDP socket bound to a particular interface and
// queue.
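//
// A typical call, sketched (error handling elided; binding requires running
// in the device's network namespace):
//
//	opts := DefaultOpts()
//	opts.Bind = true
//	cb, err := New(ifaceIdx, queueID, opts)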
func New(ifaceIdx, queueID uint32, opts Opts) (*ControlBlock, error) {
sockfd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0)
if err != nil {
return nil, fmt.Errorf("failed to create AF_XDP socket: %v", err)
}
return NewFromSocket(sockfd, ifaceIdx, queueID, opts)
}

// NewFromSocket takes an AF_XDP socket, initializes it, and binds it to a
// particular interface and queue.
func NewFromSocket(sockfd int, ifaceIdx, queueID uint32, opts Opts) (*ControlBlock, error) {
if opts.FrameSize != 2048 && opts.FrameSize != 4096 {
return nil, fmt.Errorf("invalid frame size %d: must be either 2048 or 4096", opts.FrameSize)
}
if bits.OnesCount32(opts.NDescriptors) != 1 {
return nil, fmt.Errorf("invalid number of descriptors %d: must be a power of 2", opts.NDescriptors)
}
var cb ControlBlock
	// Create the UMEM area. Use mmap instead of make([]byte) to ensure
// that the UMEM is page-aligned. Aligning the UMEM keeps individual
// packets from spilling over between pages.
	// MapSlice takes the fd as a uintptr, so underflow from zero to get
	// the -1 fd required for anonymous mappings.
	var zerofd uintptr
	umemMemory, err := memutil.MapSlice(
		0,
		uintptr(opts.NFrames*opts.FrameSize),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_PRIVATE|unix.MAP_ANONYMOUS,
		zerofd-1, // I.e. an fd of -1.
		0,
	)
if err != nil {
return nil, fmt.Errorf("failed to mmap umem: %v", err)
}
	cu := cleanup.Make(func() {
		memutil.UnmapSlice(umemMemory)
	})
	// Unmap everything on any early error return; released on success
	// below.
	defer cu.Clean()
if sliceBackingPointer(umemMemory)%uintptr(unix.Getpagesize()) != 0 {
return nil, fmt.Errorf("UMEM is not page aligned (address 0x%x)", sliceBackingPointer(umemMemory))
}
cb.UMEM = UMEM{
mem: umemMemory,
sockfd: uint32(sockfd),
frameAddresses: make([]uint64, opts.NFrames),
nFreeFrames: opts.NFrames,
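		// Applying frameMask to a UMEM address rounds it down to the
		// start of its frame; this relies on FrameSize being a power
		// of 2.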
frameMask: ^(uint64(opts.FrameSize) - 1),
}
// Fill in each frame address.
for i := range cb.UMEM.frameAddresses {
cb.UMEM.frameAddresses[i] = uint64(i) * uint64(opts.FrameSize)
}
// Check whether we're likely to fail due to RLIMIT_MEMLOCK.
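	// UMEM pages may be counted against RLIMIT_MEMLOCK when registered;
	// if registration fails, raising the limit (e.g. via ulimit -l or
	// setrlimit(2)) may help.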
var rlimit unix.Rlimit
if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlimit); err != nil {
return nil, fmt.Errorf("failed to get rlimit for memlock: %v", err)
}
if rlimit.Cur < uint64(len(cb.UMEM.mem)) {
log.Infof("UMEM size (%d) may exceed RLIMIT_MEMLOCK (%+v) and cause registration to fail", len(cb.UMEM.mem), rlimit)
}
reg := unix.XDPUmemReg{
Addr: uint64(sliceBackingPointer(umemMemory)),
Len: uint64(len(umemMemory)),
Chunk_size: opts.FrameSize,
// Not useful in the RX path.
Headroom: 0,
// TODO(b/240191988): Investigate use of SHARED flag.
Flags: 0,
}
if err := registerUMEM(sockfd, reg); err != nil {
return nil, fmt.Errorf("failed to register UMEM: %v", err)
}
// Set the number of descriptors in the fill queue.
if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_FILL_RING, int(opts.NDescriptors)); err != nil {
return nil, fmt.Errorf("failed to register fill ring: %v", err)
}
// Set the number of descriptors in the completion queue.
if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_COMPLETION_RING, int(opts.NDescriptors)); err != nil {
return nil, fmt.Errorf("failed to register completion ring: %v", err)
}
// Set the number of descriptors in the RX queue.
if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_RX_RING, int(opts.NDescriptors)); err != nil {
return nil, fmt.Errorf("failed to register RX queue: %v", err)
}
// Set the number of descriptors in the TX queue.
if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_TX_RING, int(opts.NDescriptors)); err != nil {
return nil, fmt.Errorf("failed to register TX queue: %v", err)
}
	// Get offset information for the queues. Once we mmap space for each
	// queue, the offsets tell us where everything lives within the
	// mapping: the shared producer and consumer pointers, a shared flags
	// value, and the beginning of the ring of descriptors.
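	//
	// Roughly, each ring's mapping is laid out as follows (not to scale;
	// exact offsets come from the kernel):
	//
	//	+----------+----------+-------+------------------------+
	//	| producer | consumer | flags | ring of descriptors... |
	//	+----------+----------+-------+------------------------+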
off, err := getOffsets(sockfd)
if err != nil {
return nil, fmt.Errorf("failed to get offsets: %v", err)
}
// Allocate space for the fill queue.
fillQueueMem, err := memutil.MapSlice(
0,
uintptr(off.Fr.Desc+uint64(opts.NDescriptors)*sizeOfFillQueueDesc()),
unix.PROT_READ|unix.PROT_WRITE,
unix.MAP_SHARED|unix.MAP_POPULATE,
uintptr(sockfd),
unix.XDP_UMEM_PGOFF_FILL_RING,
)
if err != nil {
return nil, fmt.Errorf("failed to mmap fill queue: %v", err)
}
	cu.Add(func() {
memutil.UnmapSlice(fillQueueMem)
})
	// Set up the fill queue with offsets into the allocated memory.
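	// A note on mask: because NDescriptors is a power of 2, ring indices
	// can wrap cheaply via idx & mask instead of a modulo.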
cb.Fill = FillQueue{
mem: fillQueueMem,
mask: opts.NDescriptors - 1,
cachedConsumer: opts.NDescriptors,
}
cb.Fill.init(off, opts)
// Allocate space for the completion queue.
completionQueueMem, err := memutil.MapSlice(
0,
uintptr(off.Cr.Desc+uint64(opts.NDescriptors)*sizeOfCompletionQueueDesc()),
unix.PROT_READ|unix.PROT_WRITE,
unix.MAP_SHARED|unix.MAP_POPULATE,
uintptr(sockfd),
unix.XDP_UMEM_PGOFF_COMPLETION_RING,
)
if err != nil {
return nil, fmt.Errorf("failed to mmap completion queue: %v", err)
}
	cu.Add(func() {
memutil.UnmapSlice(completionQueueMem)
})
	// Set up the completion queue with offsets into the allocated memory.
cb.Completion = CompletionQueue{
mem: completionQueueMem,
mask: opts.NDescriptors - 1,
}
cb.Completion.init(off, opts)
// Allocate space for the RX queue.
rxQueueMem, err := memutil.MapSlice(
0,
uintptr(off.Rx.Desc+uint64(opts.NDescriptors)*sizeOfRXQueueDesc()),
unix.PROT_READ|unix.PROT_WRITE,
unix.MAP_SHARED|unix.MAP_POPULATE,
uintptr(sockfd),
unix.XDP_PGOFF_RX_RING,
)
if err != nil {
return nil, fmt.Errorf("failed to mmap RX queue: %v", err)
}
	cu.Add(func() {
memutil.UnmapSlice(rxQueueMem)
})
	// Set up the RX queue with offsets into the allocated memory.
cb.RX = RXQueue{
mem: rxQueueMem,
mask: opts.NDescriptors - 1,
}
cb.RX.init(off, opts)
// Allocate space for the TX queue.
txQueueMem, err := memutil.MapSlice(
0,
uintptr(off.Tx.Desc+uint64(opts.NDescriptors)*sizeOfTXQueueDesc()),
unix.PROT_READ|unix.PROT_WRITE,
unix.MAP_SHARED|unix.MAP_POPULATE,
uintptr(sockfd),
unix.XDP_PGOFF_TX_RING,
)
if err != nil {
		return nil, fmt.Errorf("failed to mmap TX queue: %v", err)
}
	cu.Add(func() {
memutil.UnmapSlice(txQueueMem)
})
	// Set up the TX queue with offsets into the allocated memory.
cb.TX = TXQueue{
sockfd: uint32(sockfd),
mem: txQueueMem,
mask: opts.NDescriptors - 1,
cachedConsumer: opts.NDescriptors,
}
cb.TX.init(off, opts)
	// In some cases we don't bind here, as this process isn't in the
	// netns with the device. In those cases, another process with access
	// to the same socket will bind for us.
if opts.Bind {
if err := Bind(sockfd, ifaceIdx, queueID, opts.UseNeedWakeup); err != nil {
return nil, fmt.Errorf("failed to bind to interface %d: %v", ifaceIdx, err)
}
}
	cu.Release()
return &cb, nil
}

// Bind binds a socket to a particular network interface and queue.
func Bind(sockfd int, ifindex, queueID uint32, useNeedWakeup bool) error {
var flags uint16
if useNeedWakeup {
flags |= unix.XDP_USE_NEED_WAKEUP
}
addr := unix.SockaddrXDP{
// XDP_USE_NEED_WAKEUP lets the driver sleep if there is no
// work to do. It will need to be woken by poll. It is expected
// that this improves performance by preventing the driver from
// burning cycles.
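		//
		// "Woken by poll" here typically means calling poll(2), or
		// sendto(2) for the TX ring, on the socket when the ring's
		// need-wakeup flag is set.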
//
		// By not setting either XDP_COPY or XDP_ZEROCOPY, we instruct
		// the kernel to use zero copy if available and to fall back to
		// copy mode otherwise.
Flags: flags,
Ifindex: ifindex,
		// Each AF_XDP socket is bound to a single RX queue of a
		// single device, although multiple sockets on multiple queues
		// (or devices) can share a single UMEM.
QueueID: queueID,
// We're not using shared mode, so the value here is irrelevant.
SharedUmemFD: 0,
}
return unix.Bind(sockfd, &addr)
}