// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package lisafs

import (
"gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sync"
)

// numStaticChildren is the number of static children tracked by each node.
// Sampling certain filesystem-heavy workloads showed that a majority of
// directories store at most 5 children in their map. This should be kept low
// to minimize the memory overhead for each node. 5 is fairly low and at the
// same time helps avoid map allocations for the majority of nodes.
// Benchmarking also showed that static arrays are faster than maps for
// lookups until n=8.
const numStaticChildren = 5

// Node is a node on the filesystem tree. A Node is shared by all the
// ControlFDs opened on that position. For a given Server, there will only be
// one Node for a given filesystem position.
//
// Reference Model:
// - Each node holds a ref on its parent for its entire lifetime.
type Node struct {
// node's ref count is protected by its parent's childrenMu.
nodeRefs
// opMu synchronizes high level operations on this path.
//
// It is used to ensure the following which are important for security:
// * This node's data is protected by opMu. So all operations that change its
// data should hold opMu for writing. For example: write, setstat, setxattr,
// etc. This entails that if this node represents a directory, creation and
// deletion operations happening directly under this directory must lock
// opMu for writing. All operations accessing data must hold opMu for
	// reading. This is to avoid the can of worms that opens when creation and
	// deletion are allowed to race. This prevents any walks from occurring
// during creation or deletion.
// * When this node is being deleted, the deletion handler must hold opMu for
// writing. This ensures that there are no concurrent operations going on
// this node while it is being deleted and potentially being replaced with
// something hazardous.
//
// A useful consequence of the above is that holding opMu for reading
	// guarantees that the Server cannot change Nodes on the path until this
	// Node. For instance, if the grandparent needs to be renamed or deleted,
	// the client must first delete this node to avoid an ENOTEMPTY error.
	// Deleting this node is not possible while opMu is read locked.
opMu sync.RWMutex
// deleted indicates whether the backing file has been unlinked. This can be
// used to deny operations on FDs on this Node after deletion because it is
// not safe for FD implementations to do host walks up to this position
// anymore. This node may have been replaced with something hazardous.
// deleted is protected by opMu. deleted must only be accessed/mutated using
// atomics; see markDeletedRecursive for more details.
deleted atomicbitops.Uint32
	// name is the name of the file represented by this Node in parent. If this
	// Node represents the root directory, then name is an empty string. name
	// is protected by the backing server's rename mutex.
name string
	// parent is the parent node, which tracks this node as a child. parent is
// protected by the backing server's rename mutex.
parent *Node
// controlFDs is a linked list of all the ControlFDs opened on this node.
// Prefer this over a slice to avoid additional allocations. Each ControlFD
// is an implicit linked list node so there are no additional allocations
// needed to maintain the linked list.
controlFDsMu sync.Mutex
controlFDs controlFDList
	// Here is a performance hack. Past experience has shown that map
	// allocations on each node for tracking children cost a lot of memory.
	// More small allocations also fragment memory. To save allocations,
	// statically track up to numStaticChildren children using hardcoded
	// pointers. If more children are inserted, then move to a map. Use
	// dynamicChildren iff it is non-nil. The following fields are protected
	// by childrenMu.
childrenMu sync.Mutex
staticChildren [numStaticChildren]struct {
name string
node *Node
}
dynamicChildren map[string]*Node
}

// DecRef implements refs.RefCounter.DecRef. Note that the context
// parameter should never be used. It exists solely to comply with the
// refs.RefCounter interface.
//
// Precondition: server's rename mutex must be at least read locked.
func (n *Node) DecRef(context.Context) {
if n.parent == nil {
n.nodeRefs.DecRef(nil)
return
}
	// If this is the only ref on the node, then it will need to be destroyed.
n.parent.childrenMu.Lock()
deleted := false
n.nodeRefs.DecRef(func() {
n.parent.removeChildLocked(n.name)
deleted = true
})
n.parent.childrenMu.Unlock()
if deleted {
		// Drop ref on parent. Keep this DecRef call lock-free for scalability.
n.parent.DecRef(nil)
}
}

// InitLocked must be called before first use of n.
//
// Precondition: parent.childrenMu is locked.
//
// Postconditions: A ref on n is transferred to the caller.
func (n *Node) InitLocked(name string, parent *Node) {
n.nodeRefs.InitRefs()
n.name = name
n.parent = parent
if parent != nil {
parent.IncRef()
parent.insertChildLocked(name, n)
}
}
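
// exampleNodeLifecycle is an illustrative sketch and not part of the lisafs
// API: it shows the reference model described on Node. InitLocked takes a ref
// on parent and transfers a ref on the new node to the caller, and the
// caller's final DecRef removes the node from its parent and drops the ref it
// held on the parent. It assumes the caller already holds the server's rename
// mutex for reading, as required by DecRef and FilePath.
func exampleNodeLifecycle(parent *Node, name string) {
	var child Node
	// InitLocked must be called with parent.childrenMu held.
	parent.WithChildrenMu(func() {
		child.InitLocked(name, parent)
	})

	// The caller now owns a ref on child, e.g. to compute its absolute path.
	_ = child.FilePath()

	// Dropping the last ref destroys child and releases the ref it held on
	// parent.
	child.DecRef(nil)
}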

// LookupChildLocked looks up a child with the given name. Returns nil if the
// child does not exist.
//
// Preconditions: childrenMu is locked.
func (n *Node) LookupChildLocked(name string) *Node {
if n.dynamicChildren != nil {
return n.dynamicChildren[name]
}
for i := 0; i < numStaticChildren; i++ {
if n.staticChildren[i].name == name {
return n.staticChildren[i].node
}
}
return nil
}

// WithChildrenMu executes fn with n.childrenMu locked.
func (n *Node) WithChildrenMu(fn func()) {
n.childrenMu.Lock()
defer n.childrenMu.Unlock()
fn()
}
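
// exampleLookupAndRef is an illustrative sketch and not part of the lisafs
// API: it shows how WithChildrenMu and LookupChildLocked are meant to be
// combined. Since a node's ref count is protected by its parent's childrenMu,
// the same critical section that performs the lookup is also the place to
// take a ref on the found child.
func exampleLookupAndRef(parent *Node, name string) *Node {
	var child *Node
	parent.WithChildrenMu(func() {
		child = parent.LookupChildLocked(name)
		if child != nil {
			// Take a ref while parent.childrenMu protects child's ref count.
			child.IncRef()
		}
	})
	return child
}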

// FilePath returns the absolute path of the backing file. This is an expensive
// operation. The returned path should be free of any intermediate symlinks
// because all internal (non-leaf) nodes are directories.
//
// Precondition:
// - server's rename mutex must be at least read locked. Calling handlers must
// at least have read concurrency guarantee from the server.
func (n *Node) FilePath() string {
// Walk upwards and prepend name to res.
var res fspath.Builder
for n.parent != nil {
res.PrependComponent(n.name)
n = n.parent
}
// n is the root node.
res.PrependByte('/')
return res.String()
}

// isDeleted returns true if the backing file has been unlinked.
func (n *Node) isDeleted() bool {
return n.deleted.Load() != 0
}

func (n *Node) removeFD(fd *ControlFD) {
n.controlFDsMu.Lock()
defer n.controlFDsMu.Unlock()
n.controlFDs.Remove(fd)
}

func (n *Node) insertFD(fd *ControlFD) {
n.controlFDsMu.Lock()
defer n.controlFDsMu.Unlock()
n.controlFDs.PushBack(fd)
}

// forEachFD calls fn on each ControlFD opened on this node. fn is called with
// n.controlFDsMu held.
func (n *Node) forEachFD(fn func(*ControlFD)) {
n.controlFDsMu.Lock()
defer n.controlFDsMu.Unlock()
for fd := n.controlFDs.Front(); fd != nil; fd = fd.Next() {
fn(fd)
}
}
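
// exampleCountControlFDs is an illustrative sketch and not part of the lisafs
// API: it shows how forEachFD iterates the intrusive list of ControlFDs
// described on controlFDs. The callback runs with n.controlFDsMu held, so it
// should be short and must not re-enter methods that take that mutex.
func exampleCountControlFDs(n *Node) int {
	count := 0
	n.forEachFD(func(*ControlFD) {
		count++
	})
	return count
}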

// removeChildLocked removes the child with the given name from n and returns
// the removed child. Returns nil if no such child existed.
//
// Precondition: childrenMu is locked.
func (n *Node) removeChildLocked(name string) *Node {
if n.dynamicChildren != nil {
toRemove := n.dynamicChildren[name]
delete(n.dynamicChildren, name)
return toRemove
}
for i := 0; i < numStaticChildren; i++ {
if n.staticChildren[i].name == name {
toRemove := n.staticChildren[i].node
n.staticChildren[i].name = ""
n.staticChildren[i].node = nil
return toRemove
}
}
return nil
}

// insertChildLocked inserts child into n. It does not check for duplicates.
//
// Precondition: childrenMu is locked.
func (n *Node) insertChildLocked(name string, child *Node) {
// Try to insert statically first if staticChildren is still being used.
if n.dynamicChildren == nil {
for i := 0; i < numStaticChildren; i++ {
if n.staticChildren[i].node == nil {
n.staticChildren[i].node = child
n.staticChildren[i].name = name
return
}
}
// Ran out of static space. Need to start inserting dynamically.
// Shift everything to the map.
n.dynamicChildren = make(map[string]*Node)
for i := 0; i < numStaticChildren; i++ {
			// From the above loop, we know all staticChildren entries are non-nil.
n.dynamicChildren[n.staticChildren[i].name] = n.staticChildren[i].node
n.staticChildren[i].name = ""
n.staticChildren[i].node = nil
}
}
n.dynamicChildren[name] = child
}

// forEachChild calls fn on each of n's children. fn is called with
// n.childrenMu held.
func (n *Node) forEachChild(fn func(*Node)) {
n.childrenMu.Lock()
defer n.childrenMu.Unlock()
if n.dynamicChildren != nil {
for _, child := range n.dynamicChildren {
fn(child)
}
return
}
for i := 0; i < numStaticChildren; i++ {
if n.staticChildren[i].node != nil {
fn(n.staticChildren[i].node)
}
}
}

// Precondition: opMu must be locked for writing on the root node being marked
// as deleted.
func (n *Node) markDeletedRecursive() {
n.deleted.Store(1)
	// No need to hold opMu for children as it introduces lock ordering issues
	// because forEachChild locks childrenMu. Locking opMu after childrenMu
	// violates the lock ordering. Anyway, if a directory is being deleted, it
	// must not have children. The client must have already deleted the entire
	// subtree. If the client did not delete this subtree's nodes, then the
	// subtree was deleted externally and there is not much we can do. This is
	// best-effort work to mark the subtree as deleted.
n.forEachChild(func(child *Node) {
child.markDeletedRecursive()
})
}
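
// exampleMarkUnlinked is an illustrative sketch and not part of the lisafs
// API: it shows the deletion discipline described on opMu and deleted. A real
// handler would also hold the parent directory's opMu for writing (per the
// opMu documentation) and would unlink the backing host file; both are elided
// here.
func exampleMarkUnlinked(n *Node) {
	// markDeletedRecursive requires n.opMu to be write locked so that no
	// concurrent walks or data operations race with the deletion.
	n.opMu.Lock()
	defer n.opMu.Unlock()
	n.markDeletedRecursive()
}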