// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64
// +build amd64

package usertrap

import (
	"encoding/binary"
	"fmt"
	"math/rand"

	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/usermem"
)

// trapNR is the maximum number of traps that can fit in the trap table.
const trapNR = 256

// trapSize is the size of one trap in bytes.
const trapSize = 80

var (
	// jmpInst is the binary code of "jmp *addr".
	jmpInst          = [7]byte{0xff, 0x24, 0x25, 0, 0, 0, 0}
	jmpInstOpcodeLen = 3
	// faultInst is the single-byte invalid instruction (0x06).
	faultInst = [1]byte{0x6}
	// faultInstOffset is the offset of the syscall instruction within the
	// patched code, right after the five-byte mov instruction.
	faultInstOffset = uintptr(5)
)
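
// For illustration, the seven bytes being patched hold "mov $sysno, %eax;
// syscall" before the patch and "jmp *trapAddr" after it (xx marks bytes
// that vary; the 0x06 comes from the trap table's address range):
//
//	b8 xx xx xx xx 0f 05   mov $sysno, %eax; syscall
//	ff 24 25 xx xx 06 00   jmp *trapAddr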

// memoryManager is the interface to the memory-management operations that
// the usertrap package needs.
type memoryManager interface {
usermem.IO
MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error)
FindVMAByName(ar hostarch.AddrRange, hint string) (hostarch.Addr, uint64, error)
}

// State represents the current state of the trap table.
//
// +stateify savable
type State struct {
	mu sync.RWMutex `state:"nosave"`

	// nextTrap is the index of the next unused trap in the table. It is
	// zero before the table has been initialized.
	nextTrap uint32

	// tableAddr is the address of the trap table in the task's address
	// space.
	tableAddr hostarch.Addr
}

// New returns a new State structure.
func New() *State {
	return &State{}
}

// header is stored in the first cell of the trap table and persists the
// index of the next unused trap, so that it survives a fork or a restore.
//
// +marshal
type header struct {
	nextTrap uint32
}

// trapAddr returns the address of the trap with the given index in the
// trap table.
func (s *State) trapAddr(trap uint32) hostarch.Addr {
return s.tableAddr + hostarch.Addr(trapSize*trap)
}

// newTrapLocked allocates a new trap entry.
//
// Preconditions: s.mu must be locked.
func (s *State) newTrapLocked(ctx context.Context, mm memoryManager) (hostarch.Addr, error) {
var hdr header
task := kernel.TaskFromContext(ctx)
if task == nil {
return 0, fmt.Errorf("no task found")
}
	// s.nextTrap is zero if it isn't initialized. There are three cases
	// when this can happen:
	//	* A usertrap vma has not been mapped yet.
	//	* The address space has been forked.
	//	* The address space has been restored.
	// nextTrap is saved on the usertrap vma to handle the second and
	// third cases.
if s.nextTrap == 0 {
addr, off, err := mm.FindVMAByName(trapTableAddrRange, tableHint)
if off != 0 {
return 0, fmt.Errorf("the usertrap vma has been overmounted")
}
if err != nil {
// The usertrap table has not been mapped yet.
addr := hostarch.Addr(rand.Int63n(int64(trapTableAddrRange.Length()-trapTableSize))).RoundDown() + trapTableAddrRange.Start
ctx.Debugf("Map a usertrap vma at %x", addr)
if err := loadUsertrap(ctx, mm, addr); err != nil {
return 0, err
}
			// The first cell in the table is used to store the
			// index of the next unused trap.
s.nextTrap = 1
s.tableAddr = addr
} else if _, err := hdr.CopyIn(task.OwnCopyContext(usermem.IOOpts{AddressSpaceActive: false}), addr); err != nil {
return 0, err
} else {
			// Read the index of the next unused trap.
s.nextTrap = hdr.nextTrap
s.tableAddr = addr
}
}
ctx.Debugf("Allocate a new trap: %p %d", s, s.nextTrap)
if s.nextTrap >= trapNR {
ctx.Warningf("No space in the trap table")
return 0, fmt.Errorf("no space in the trap table")
}
trap := s.nextTrap
s.nextTrap++
// An entire trap has to be on the same page to avoid memory faults.
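	// For example, assuming a 4 KiB page and a page-aligned table, the
	// trap with index 51 would occupy table offsets 4080-4159 and
	// straddle a page boundary, so it would be skipped and the next
	// index used instead.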
addr := s.trapAddr(trap)
if addr/hostarch.PageSize != (addr+trapSize)/hostarch.PageSize {
trap = s.nextTrap
s.nextTrap++
}
hdr = header{
nextTrap: s.nextTrap,
}
if _, err := hdr.CopyOut(task.OwnCopyContext(usermem.IOOpts{IgnorePermissions: true}), s.tableAddr); err != nil {
return 0, err
}
return s.trapAddr(trap), nil
}

// trapTableAddrRange is the range where a trap table can be placed.
//
// The value has to be below 2GB, and the high two bytes of any address in
// this range have to form an invalid instruction. For 0x60000, the high
// two bytes are 0x0006; 0x06 is "push es" on 32-bit x86 and an invalid
// opcode on x86-64.
var trapTableAddrRange = hostarch.AddrRange{Start: 0x60000, End: 0x70000}

const (
trapTableSize = hostarch.Addr(trapNR * trapSize)
tableHint = "[usertrap]"
)

// loadUsertrap maps the usertrap table into the address space.
func loadUsertrap(ctx context.Context, mm memoryManager, addr hostarch.Addr) error {
size, _ := hostarch.Addr(trapTableSize).RoundUp()
// Force is true because Addr is below MinUserAddress.
_, err := mm.MMap(ctx, memmap.MMapOpts{
Force: true,
Unmap: true,
Fixed: true,
Addr: addr,
Length: uint64(size),
Private: true,
Hint: tableHint,
MLockMode: memmap.MLockEager,
Perms: hostarch.AccessType{
Write: false,
Read: true,
Execute: true,
},
MaxPerms: hostarch.AccessType{
Write: true,
Read: true,
Execute: true,
},
})
	return err
}

// PatchSyscall changes the syscall instruction into a function call.
func (s *State) PatchSyscall(ctx context.Context, ac *arch.Context64, mm memoryManager) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
return fmt.Errorf("no task found")
}
s.mu.Lock()
defer s.mu.Unlock()
sysno := ac.SyscallNo()
patchAddr := ac.IP() - uintptr(len(jmpInst))
prevCode := make([]uint8, len(jmpInst))
if _, err := primitive.CopyUint8SliceIn(task.OwnCopyContext(usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(patchAddr), prevCode); err != nil {
return err
}
// Check that another thread has not patched this syscall yet.
// 0xb8 is the first byte of "mov sysno, %eax".
if prevCode[0] == uint8(0xb8) {
ctx.Debugf("Found the pattern at ip %x:sysno %d", patchAddr, sysno)
trapAddr, err := s.addTrapLocked(ctx, ac, mm, uint32(sysno))
if trapAddr == 0 || err != nil {
ctx.Warningf("Failed to add a new trap: %v", err)
return nil
}
// Replace "mov sysno, %eax; syscall" with "jmp trapAddr".
newCode := make([]uint8, len(jmpInst))
copy(newCode[:jmpInstOpcodeLen], jmpInst[:jmpInstOpcodeLen])
binary.LittleEndian.PutUint32(newCode[jmpInstOpcodeLen:], uint32(trapAddr))
ctx.Debugf("Apply the binary patch addr %x trap addr %x (%v -> %v)", patchAddr, trapAddr, prevCode, newCode)
ignorePermContext := task.OwnCopyContext(usermem.IOOpts{IgnorePermissions: true})
		// The patch can't be applied atomically, so we have to
		// guarantee that at any moment other threads read a valid set
		// of instructions, detect any inconsistent state, and restart
		// the patched code if so. The patch is applied in the three
		// steps described below.
		//
		// A subtle aspect is that the user trap table is always mapped
		// within the 0x60000-0x70000 range, so the third byte of the
		// little-endian jmp target address is always 0x06, which is an
		// invalid opcode. That is why the second step, which
		// overwrites all bytes except the first one, is safe: the jmp
		// address still places 0x06 at the location of the first byte
		// of the syscall instruction being removed, so any thread
		// decoding the instructions will still fault at the same
		// place.
		//
		// Another subtle aspect is that the second step is done with a
		// regular non-atomic write, which means a thread decoding the
		// mov instruction could read a garbage value of the immediate
		// operand of the "mov sysno, %eax" instruction. This doesn't
		// matter, because the first byte, which contains the opcode,
		// isn't changed; the thread will fault on the 0x06 right after
		// the mov and will be restarted with the patched code, so the
		// garbage immediate operand doesn't affect correctness.
		//
		// The first step is to replace the first byte of the syscall
		// instruction with the one-byte invalid instruction (0x06), so
		// that other threads which have already passed the mov
		// instruction fault on it and restart the patched code.
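		// For illustration, patching "mov $231, %eax; syscall" with a
		// hypothetical trapAddr of 0x60f50 goes through these states:
		//
		//	original: b8 e7 00 00 00 0f 05  mov $231, %eax; syscall
		//	step 1:   b8 e7 00 00 00 06 05  threads fault at offset 5
		//	step 2:   b8 24 25 50 0f 06 00  garbage mov; fault at 5
		//	step 3:   ff 24 25 50 0f 06 00  jmp *0x60f50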
faultInstB := primitive.ByteSlice(faultInst[:])
if _, err := faultInstB.CopyOut(ignorePermContext, hostarch.Addr(patchAddr+faultInstOffset)); err != nil {
return err
}
		// The second step is to replace all bytes except the first
		// one, which is the opcode of the mov instruction, so that the
		// first five bytes still decode as "mov XXX, %eax".
if _, err := primitive.CopyUint8SliceOut(ignorePermContext, hostarch.Addr(patchAddr+1), newCode[1:]); err != nil {
return err
}
// The final step is to replace the first byte of the patch.
// After this point, all threads will read the valid jmp
// instruction.
if _, err := primitive.CopyUint8SliceOut(ignorePermContext, hostarch.Addr(patchAddr), newCode[0:1]); err != nil {
return err
}
}
return nil
}

// HandleFault handles a fault on a patched syscall instruction.
//
// When we replace a system call with a function call, we replace two
// instructions with one instruction. This means that there can be a thread
// which has executed the first instruction, then another thread applies
// the binary patch, and the first thread executes the second instruction.
//
// To handle this case, the function call (jmp) instruction is constructed
// so that the first byte of the syscall instruction is replaced with the
// one-byte invalid instruction (0x06). In case of the race, the first
// thread faults on the invalid instruction and HandleFault restarts the
// function call.
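//
// For illustration, a hypothetical interleaving:
//
//	Thread A executes "mov $sysno, %eax" and is preempted.
//	Thread B patches the code to "jmp *trapAddr".
//	Thread A resumes at the former syscall instruction, which is now the
//	invalid 0x06, faults, and HandleFault moves the instruction pointer
//	back so the patched jmp is executed from the start.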
func (s *State) HandleFault(ctx context.Context, ac *arch.Context64, mm memoryManager) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
return fmt.Errorf("no task found")
}
s.mu.RLock()
defer s.mu.RUnlock()
code := make([]uint8, len(jmpInst))
ip := ac.IP() - faultInstOffset
if _, err := primitive.CopyUint8SliceIn(task.OwnCopyContext(usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(ip), code); err != nil {
return err
}
	// Check that the faulting code is a patched jmp instruction;
	// otherwise the fault wasn't caused by a binary patch and there is
	// nothing to handle here.
	for i := 0; i < jmpInstOpcodeLen; i++ {
if code[i] != jmpInst[i] {
return nil
}
}
for i := 0; i < len(faultInst); i++ {
if code[i+int(faultInstOffset)] != faultInst[i] {
return nil
}
}
regs := &ac.StateData().Regs
if regs.Rax == uint64(unix.SYS_RESTART_SYSCALL) {
		// restart_syscall is usually set by the Sentry to restart a
		// system call after interruption by a stop signal. The Sentry
		// sets RAX and moves RIP back by the size of the syscall
		// instruction.
		//
		// RAX can't be set to SYS_RESTART_SYSCALL by a race with
		// injecting a function call, because the first two bytes of a
		// partially patched immediate operand always come from jmpInst
		// and neither matches the corresponding byte of the
		// SYS_RESTART_SYSCALL number.
regs.Orig_rax = regs.Rax
regs.Rip += arch.SyscallWidth
return ErrFaultSyscall
}
ac.SetIP(ip)
return ErrFaultRestart
}

// PreFork locks the trap table for reading. This call guarantees that the
// trap table will not be changed before the next PostFork call.
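//
// A minimal usage sketch (hypothetical caller):
//
//	s.PreFork()
//	// ... fork the address space, copying the trap table with it ...
//	s.PostFork()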
// +checklocksacquireread:s.mu
func (s *State) PreFork() {
s.mu.RLock()
}

// PostFork unlocks the trap table.
// +checklocksreleaseread:s.mu
func (s *State) PostFork() {
s.mu.RUnlock()
}