1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
|
// Copyright (c) Contributors to the Apptainer project, established as
// Apptainer a Series of LF Projects LLC.
// For website terms of use, trademark policy, privacy policy and other
// project policies see https://lfprojects.org/policies
// This software is licensed under a 3-clause BSD license. Please consult the
// LICENSE.md file distributed with the sources of this project regarding your
// rights to use or distribute this software.
package instance
import (
"io"
"net"
"net/http"
"path/filepath"
"strconv"
"strings"
"testing"
"time"
"github.com/apptainer/apptainer/e2e/internal/e2e"
"github.com/apptainer/apptainer/internal/pkg/test/tool/require"
"github.com/cenkalti/backoff/v4"
)
// checkpointStateServerPort is the TCP port used by the checkpoint test: it is
// passed as the trailing argument when the instance is first started, and it is
// the port on localhost that the test sends its HTTP requests to.
const checkpointStateServerPort = 11000
// pollServer waits until an HTTP GET against address succeeds, retrying up to
// 30 times at one-second intervals. If every attempt fails, the test is
// aborted with t.Fatalf.
//
// Only the error returned by http.Get is inspected; the response status code
// and body are ignored, so any HTTP response counts as "reachable".
func pollServer(t *testing.T, address string) {
	// Report a failure at the caller's line rather than inside this helper.
	t.Helper()

	op := func() error {
		resp, err := http.Get(address)
		if err != nil {
			return err
		}
		// The body is never read; close it right away to release the connection.
		resp.Body.Close()
		return nil
	}

	b := backoff.WithMaxRetries(
		backoff.NewConstantBackOff(1*time.Second), // Ping every second.
		30, // Ping for a total of 30 seconds.
	)

	if err := backoff.Retry(op, b); err != nil {
		t.Fatalf("Unable to reach server after 30s: %v", err)
	}
}
// getServerState issues an HTTP GET to address and fails the test unless the
// response body is exactly equal to expected.
//
// The request error, a body read error, and a body mismatch each abort the
// test via t.Fatal/t.Fatalf.
func getServerState(t *testing.T, address, expected string) {
	// Report a failure at the caller's line rather than inside this helper.
	t.Helper()

	resp, err := http.Get(address)
	if err != nil {
		t.Fatal(err)
	}
	// Deferred so the body is also closed on the t.Fatal paths below, which
	// terminate the goroutine before any trailing statement could run.
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		t.Fatal(err)
	}

	if got := string(body); got != expected {
		t.Fatalf("Expected %q, got %q", expected, got)
	}
}
// setServerState sends val to address as the body of a "text/plain" HTTP POST
// and fails the test unless the response body is exactly equal to val.
//
// The request error, a body read error, and a body mismatch each abort the
// test via t.Fatal/t.Fatalf.
func setServerState(t *testing.T, address, val string) {
	// Report a failure at the caller's line rather than inside this helper.
	t.Helper()

	resp, err := http.Post(address, "text/plain", strings.NewReader(val))
	if err != nil {
		t.Fatal(err)
	}
	// Deferred so the body is also closed on the t.Fatal paths below, which
	// terminate the goroutine before any trailing statement could run.
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		t.Fatal(err)
	}

	if got := string(body); got != val {
		t.Fatalf("Expected %q, got %q", val, got)
	}
}
// testCheckpointInstance walks through a basic checkpoint/restart cycle using a
// python server that keeps a single variable in memory:
//
//  1. build the server image and create a checkpoint,
//  2. launch an instance with "--dmtcp-launch" and change the server state,
//  3. checkpoint the instance and stop it,
//  4. relaunch with "--dmtcp-restart" and verify the changed state survived,
//  5. stop the instance and delete the checkpoint.
//
// NOTE(ian): The excessive sleep times are necessary when I run these tests
// locally, since I get a "connect: connection refused" error when they are
// significantly shortened. It is unclear to me why this is the case, as manual
// testing does not appear to require such delays.
func (c *ctx) testCheckpointInstance(t *testing.T) {
	require.DMTCP(t)

	tmpDir, removeTmpDir := e2e.MakeTempDir(t, c.env.TestDir, "checkpoint-", "")
	defer e2e.Privileged(removeTmpDir)(t)

	sifPath := filepath.Join(tmpDir, "state-server.sif")
	checkpoint := randomName(t)
	instance := randomName(t)
	port := strconv.Itoa(checkpointStateServerPort)
	serverURL := "http://" + net.JoinHostPort("localhost", port)

	// asUser runs a single apptainer command under the user profile and
	// requires it to exit with status 0.
	asUser := func(command string, args ...string) {
		t.Helper()
		c.env.RunApptainer(
			t,
			e2e.WithProfile(e2e.UserProfile),
			e2e.WithCommand(command),
			e2e.WithArgs(args...),
			e2e.ExpectExit(0),
		)
	}

	// Build the state-server image; this is the only step run as root.
	c.env.RunApptainer(
		t,
		e2e.WithProfile(e2e.RootProfile),
		e2e.WithCommand("build"),
		e2e.WithArgs("--force", sifPath, "testdata/state-server.def"),
		e2e.ExpectExit(0),
	)

	// Create the checkpoint, then launch an instance bound to it.
	asUser("checkpoint", "create", checkpoint)
	asUser("instance", "start", "--dmtcp-launch", checkpoint, sifPath, instance, port)

	// Once the server answers, confirm its initial state and then change it
	// so there is something for the checkpoint to capture.
	pollServer(t, serverURL)
	getServerState(t, serverURL, "0")
	setServerState(t, serverURL, "1")

	// Take the checkpoint and allow the command some time to save state.
	asUser("checkpoint", "instance", instance)
	time.Sleep(5 * time.Second)

	// Stop the instance and let the socket drain completely before restarting.
	asUser("instance", "stop", instance)
	time.Sleep(time.Minute)

	// Bring the instance back from the checkpoint.
	asUser("instance", "start", "--dmtcp-restart", checkpoint, sifPath, instance)

	// The restored server must report the value set before the checkpoint.
	pollServer(t, serverURL)
	getServerState(t, serverURL, "1")

	// Tear down: stop the instance and remove the checkpoint.
	asUser("instance", "stop", instance)
	asUser("checkpoint", "delete", checkpoint)
}
|