// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"fmt"
	"math/bits"
	"sync/atomic"
	"unsafe"

	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/gohacks"
	"gvisor.dev/gvisor/pkg/sync"
)

// mountKey represents the location at which a Mount is mounted. It is
// structurally identical to VirtualDentry, but stores its fields as
// unsafe.Pointer since mutators synchronize with VFS path traversal using
// seqcounts.
//
// This is explicitly not savable.
type mountKey struct {
	parent unsafe.Pointer // *Mount
	point  unsafe.Pointer // *Dentry
}

var (
	mountKeyHasher = sync.MapKeyHasher(map[mountKey]struct{}(nil))
	mountKeySeed   = sync.RandUintptr()
)

func (k *mountKey) hash() uintptr {
	return mountKeyHasher(gohacks.Noescape(unsafe.Pointer(k)), mountKeySeed)
}

func (mnt *Mount) parent() *Mount {
	return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
}

func (mnt *Mount) point() *Dentry {
	return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}

func (mnt *Mount) getKey() VirtualDentry {
	return VirtualDentry{
		mount:  mnt.parent(),
		dentry: mnt.point(),
	}
}

// Invariant: mnt.key.parent == nil. vd.Ok().
func (mnt *Mount) setKey(vd VirtualDentry) {
	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
}

// mountTable maps (mount parent, mount point) pairs to mounts. It supports
// efficient concurrent lookup, even in the presence of concurrent mutators
// (provided mutation is sufficiently uncommon).
//
// mountTable.Init() must be called on new mountTables before use.
type mountTable struct {
	// mountTable is implemented as a seqcount-protected hash table that
	// resolves collisions with linear probing, featuring Robin Hood insertion
	// and backward shift deletion. These minimize probe length variance,
	// significantly improving the performance of linear probing at high load
	// factors. (mountTable doesn't use bucketing, which is the other major
	// technique commonly used in high-performance hash tables; the efficiency
	// of bucketing is largely due to SIMD lookup, and Go lacks both SIMD
	// intrinsics and inline assembly, limiting the performance of this
	// approach.)
	seq sync.SeqCount `state:"nosave"`

	// size holds both length (number of elements) and capacity (number of
	// slots): capacity is stored as its base-2 log (referred to as order) in
	// the least significant bits of size, and length is stored in the
	// remaining bits. Go defines bit shifts >= width of shifted unsigned
	// operand as shifting to 0, which differs from x86's SHL, so the Go
	// compiler inserts a bounds check for each bit shift unless we mask order
	// anyway (cf. runtime.bucketShift()), and length isn't used by lookup;
	// thus this bit packing gets us more bits for the length (vs. storing
	// length and cap in separate uint32s) for ~free.
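	//
	// For example, a table with 2^4 = 16 slots currently holding 5 mounts
	// stores size = (5 << mtSizeLenLSB) | 4.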
	size atomicbitops.Uint64

	slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
}

type mountSlot struct {
	// We don't store keys in slots; instead, we just check Mount.parent and
	// Mount.point directly. Any practical use of lookup will need to touch
	// Mounts anyway, and comparing hashes means that false positives are
	// extremely rare, so this isn't an extra cache line touch overall.
	value unsafe.Pointer // *Mount

	hash uintptr
}

const (
	mtSizeOrderBits = 6 // log2 of pointer size in bits
	mtSizeOrderMask = (1 << mtSizeOrderBits) - 1
	mtSizeOrderOne  = 1
	mtSizeLenLSB    = mtSizeOrderBits
	mtSizeLenOne    = 1 << mtSizeLenLSB
	mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB

	mountSlotBytes = unsafe.Sizeof(mountSlot{})
	mountKeyBytes  = unsafe.Sizeof(mountKey{})

	// Tuning parameters.
	//
	// Essentially every mountTable will contain at least /proc, /sys, and
	// /dev/shm, so there is ~no reason for mtInitCap to be < 4.
	mtInitOrder  = 2
	mtInitCap    = 1 << mtInitOrder
	mtMaxLoadNum = 13
	mtMaxLoadDen = 16
)

func init() {
	// We can't just define mtSizeOrderBits as bits.TrailingZeros(ptrBits)
	// because Go doesn't have constexpr.
	if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) {
		panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits))
	}
	if bits.OnesCount(uint(mountSlotBytes)) != 1 {
		panic(fmt.Sprintf("sizeof(mountSlot) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes))
	}
	if mtInitCap <= 1 {
		panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap))
	}
	if mtMaxLoadNum >= mtMaxLoadDen {
		panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen))
	}
}

// Init must be called exactly once on each mountTable before use.
func (mt *mountTable) Init() {
	mt.size = atomicbitops.FromUint64(mtInitOrder)
	mt.slots = newMountTableSlots(mtInitCap)
}

func newMountTableSlots(cap uintptr) unsafe.Pointer {
	slice := make([]mountSlot, cap, cap)
	hdr := (*gohacks.SliceHeader)(unsafe.Pointer(&slice))
	return hdr.Data
}

// Lookup returns the Mount with the given parent, mounted at the given point.
// If no such Mount exists, Lookup returns nil.
//
// Lookup may be called even if there are concurrent mutators of mt.
func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
	key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
	hash := key.hash()

loop:
	for {
		epoch := mt.seq.BeginRead()
		size := mt.size.Load()
		slots := atomic.LoadPointer(&mt.slots)
		if !mt.seq.ReadOk(epoch) {
			continue
		}
		tcap := uintptr(1) << (size & mtSizeOrderMask)
		mask := tcap - 1
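		// off is a byte offset into the slot array and is always a multiple
		// of mountSlotBytes, so masking with offmask wraps the probe around
		// the table without a division (see the power-of-2 check in init()).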
		off := (hash & mask) * mountSlotBytes
		offmask := mask * mountSlotBytes
		for {
			// This avoids bounds checking.
			slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
			slotValue := atomic.LoadPointer(&slot.value)
			slotHash := atomic.LoadUintptr(&slot.hash)
			if !mt.seq.ReadOk(epoch) {
				// The element we're looking for might have been moved into a
				// slot we've previously checked, so restart entirely.
				continue loop
			}
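			// An empty slot terminates the probe sequence: linear probing
			// with backward shift deletion leaves no gaps, so the key is not
			// in the table.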
			if slotValue == nil {
				return nil
			}
			if slotHash == hash {
				mount := (*Mount)(slotValue)
				var mountKey mountKey
				mountKey.parent = atomic.LoadPointer(&mount.key.parent)
				mountKey.point = atomic.LoadPointer(&mount.key.point)
				if !mt.seq.ReadOk(epoch) {
					continue loop
				}
				if key == mountKey {
					return mount
				}
			}
			off = (off + mountSlotBytes) & offmask
		}
	}
}

// Range calls f on each Mount in mt. If f returns false, Range stops iteration
// and returns immediately.
func (mt *mountTable) Range(f func(*Mount) bool) {
	tcap := uintptr(1) << (mt.size.Load() & mtSizeOrderMask)
	slotPtr := mt.slots
	last := unsafe.Pointer(uintptr(mt.slots) + ((tcap - 1) * mountSlotBytes))
	for {
		slot := (*mountSlot)(slotPtr)
		if slot.value != nil {
			if !f((*Mount)(slot.value)) {
				return
			}
		}
		if slotPtr == last {
			return
		}
		slotPtr = unsafe.Pointer(uintptr(slotPtr) + mountSlotBytes)
	}
}

// Insert inserts the given mount into mt.
//
// Preconditions: mt must not already contain a Mount with the same mount point
// and parent.
func (mt *mountTable) Insert(mount *Mount) {
	mt.seq.BeginWrite()
	mt.insertSeqed(mount)
	mt.seq.EndWrite()
}

// insertSeqed inserts the given mount into mt.
//
// Preconditions:
// - mt.seq must be in a writer critical section.
// - mt must not already contain a Mount with the same mount point and parent.
func (mt *mountTable) insertSeqed(mount *Mount) {
	hash := mount.key.hash()

	// We're under the maximum load factor if:
	//
	//          (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen
	// (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap
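	//
	// Since cap == 1 << order, mtMaxLoadNum * cap is computed below as
	// mtMaxLoadNum << order.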
	tlen := mt.size.RacyLoad() >> mtSizeLenLSB
	order := mt.size.RacyLoad() & mtSizeOrderMask
	tcap := uintptr(1) << order
	if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
		// Atomically insert the new element into the table.
		mt.size.Add(mtSizeLenOne)
		mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
		return
	}

	// Otherwise, we have to expand. Double the number of slots in the new
	// table.
	newOrder := order + 1
	if newOrder > mtSizeOrderMask {
		panic("mount table size overflow")
	}
	newCap := uintptr(1) << newOrder
	newSlots := newMountTableSlots(newCap)

	// Copy existing elements to the new table.
	oldCur := mt.slots
	// Go does not permit pointers to the end of allocated objects, so we
	// must use a pointer to the last element of the old table. The
	// following expression is equivalent to
	// `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2
	// arithmetic instructions instead of 3.
	oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes))
	for {
		oldSlot := (*mountSlot)(oldCur)
		if oldSlot.value != nil {
			mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
		}
		if oldCur == oldLast {
			break
		}
		oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes)
	}

	// Insert the new element into the new table.
	mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)

	// Switch to the new table.
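	// Adding mtSizeLenOne | mtSizeOrderOne increments both the packed length
	// (for the newly inserted element) and the order (the capacity has
	// doubled) in a single atomic operation.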
	mt.size.Add(mtSizeLenOne | mtSizeOrderOne)
	atomic.StorePointer(&mt.slots, newSlots)
}

// Preconditions:
// - There are no concurrent mutators of the table (slots, cap).
// - If the table is visible to readers, then mt.seq must be in a writer
//   critical section.
// - cap must be a power of 2.
func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
	mask := cap - 1
	off := (hash & mask) * mountSlotBytes
	offmask := mask * mountSlotBytes
	disp := uintptr(0)
	for {
		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
		slotValue := slot.value
		if slotValue == nil {
			atomic.StorePointer(&slot.value, value)
			atomic.StoreUintptr(&slot.hash, hash)
			return
		}
		// If we've been displaced farther from our first-probed slot than the
		// element stored in this one, swap elements and switch to inserting
		// the replaced one. (This is Robin Hood insertion.)
		slotHash := slot.hash
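		// slotDisp is how far the element already stored here has been
		// displaced from its home slot (slotHash & mask); subtracting the
		// full hash and then masking gives the same result mod cap.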
		slotDisp := ((off / mountSlotBytes) - slotHash) & mask
		if disp > slotDisp {
			atomic.StorePointer(&slot.value, value)
			atomic.StoreUintptr(&slot.hash, hash)
			value = slotValue
			hash = slotHash
			disp = slotDisp
		}
		off = (off + mountSlotBytes) & offmask
		disp++
	}
}

// Remove removes the given mount from mt.
//
// Preconditions: mt must contain mount.
func (mt *mountTable) Remove(mount *Mount) {
	mt.seq.BeginWrite()
	mt.removeSeqed(mount)
	mt.seq.EndWrite()
}

// removeSeqed removes the given mount from mt.
//
// Preconditions:
// - mt.seq must be in a writer critical section.
// - mt must contain mount.
func (mt *mountTable) removeSeqed(mount *Mount) {
	hash := mount.key.hash()
	tcap := uintptr(1) << (mt.size.RacyLoad() & mtSizeOrderMask)
	mask := tcap - 1
	slots := mt.slots
	off := (hash & mask) * mountSlotBytes
	offmask := mask * mountSlotBytes
	for {
		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
		slotValue := slot.value
		if slotValue == unsafe.Pointer(mount) {
			// Found the element to remove. Move all subsequent elements
			// backward until we either find an empty slot, or an element that
			// is already in its first-probed slot. (This is backward shift
			// deletion.)
			for {
				nextOff := (off + mountSlotBytes) & offmask
				nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
				nextSlotValue := nextSlot.value
				if nextSlotValue == nil {
					break
				}
				nextSlotHash := nextSlot.hash
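				// The next element is already in its home (first-probed)
				// slot, so it must not be shifted backward.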
				if (nextOff / mountSlotBytes) == (nextSlotHash & mask) {
					break
				}
				atomic.StorePointer(&slot.value, nextSlotValue)
				atomic.StoreUintptr(&slot.hash, nextSlotHash)
				off = nextOff
				slot = nextSlot
			}
			atomic.StorePointer(&slot.value, nil)
			mt.size.Add(mtSizeLenNegOne)
			return
		}
		if checkInvariants && slotValue == nil {
			panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount))
		}
		off = (off + mountSlotBytes) & offmask
	}
}