File: fuse.c

package info (click to toggle)
charliecloud 0.43-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,116 kB
  • sloc: python: 6,021; sh: 4,284; ansic: 3,863; makefile: 598
file content (273 lines) | stat: -rw-r--r-- 11,036 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
/* Copyright © Triad National Security, LLC, and others. */

/* Function prefixes:

   fuse_     libfuse; docs: https://libfuse.github.io/doxygen/globals.html
   sqfs_ll_  SquashFUSE; no docs but: https://github.com/vasi/squashfuse
   sq_       Charliecloud */

#define _GNU_SOURCE
// config.h and fuse.h are below for this file

#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>

// SquashFUSE has a bug [1] where ll.h includes SquashFUSE's own config.h.
// This clashes with our own config.h, as well as the system headers because
// it defines _POSIX_C_SOURCE. By defining SQFS_CONFIG_H, SquashFUSE's
// config.h skips itself.
// [1]: https://github.com/vasi/squashfuse/issues/65
#define SQFS_CONFIG_H
// But then FUSE_USE_VERSION isn't defined, which makes other parts of ll.h
// puke. Looking at their code, it seems the only values used are 32 (for
// libfuse3) and 26 (for libfuse2), so we can just blindly define it.
#define FUSE_USE_VERSION 32
// SquashFUSE redefines __le16 unless HAVE_LINUX_TYPES_LE16 is defined. We are
// assuming it is defined in <linux/types.h> on your machine.
#define HAVE_LINUX_TYPES_LE16
// The forget operation in libfuse3 takes uint64_t as its third parameter,
// while SquashFUSE defaults to unsigned long, as used in libfuse2. This
// causes a mess on architectures where these two types differ in size, so
// explicitly switch to the libfuse3 variant.
#define HAVE_FUSE_LL_FORGET_OP_64T
// Now we can include ll.h.
#include <squashfuse/ll.h>

#include "config.h"  // here to avoid potential clash with SquashFUSE config.h
#include "all.h"


/** Types **/

/* A SquashFUSE mount: the mount point path plus the two SquashFUSE handles
   that go with it. SquashFUSE allocates ll for us but not chan; use pointers
   for both for consistency.

   In this file the fields are filled in by sq_mount(): mountpt is the
   caller's pointer stored as-is (not copied), chan is allocated with
   malloc_ch(), and ll is the value returned by sqfs_ll_open(). */
struct squash {
   char *mountpt;       // path to mount point
   sqfs_ll_chan *chan;  // FUSE channel associated with SquashFUSE mount
   sqfs_ll *ll;         // SquashFUSE low-level data structure
};


/** Constants **/

/* This mapping tells libfuse what functions implement which FUSE operations.
   It is passed to sqfs_ll_mount(), along with its size, in sq_mount(). Why it
   is not internal to SquashFUSE I have no idea.

   Every handler listed here is a SquashFUSE function; none is implemented in
   this file. */
struct fuse_lowlevel_ops OPS = {
    .getattr    = &sqfs_ll_op_getattr,
    .opendir    = &sqfs_ll_op_opendir,
    .releasedir = &sqfs_ll_op_releasedir,
    .readdir    = &sqfs_ll_op_readdir,
    .lookup     = &sqfs_ll_op_lookup,
    .open       = &sqfs_ll_op_open,
    .create     = &sqfs_ll_op_create,
    .release    = &sqfs_ll_op_release,
    .read       = &sqfs_ll_op_read,
    .readlink   = &sqfs_ll_op_readlink,
    .listxattr  = &sqfs_ll_op_listxattr,
    .getxattr   = &sqfs_ll_op_getxattr,
    .forget     = &sqfs_ll_op_forget,
    // NOTE(review): "stfs_" rather than "sqfs_" here; presumably this is the
    // spelling SquashFUSE's ll.h actually declares. Confirm against the
    // header before "correcting" it.
    .statfs     = &stfs_ll_op_statfs
};


/** Global variables **/

/* SquashFUSE mount. Initialized in sq_mount() and then used in most of the
   other functions in this file. It's a global because the signal handler
   needs access to it. */
struct squash sq;

/* True if exit request signal handler received SIGCHLD. Written only by
   sq_done_request() and read by sq_loop() after the FUSE loop returns. No
   explicit initializer; as a file-scope object it starts out false.

   NOTE(review): the C standard only guarantees signal-handler writes to
   objects of type volatile sig_atomic_t (or lock-free atomics); consider
   that type here if no other translation unit declares these flags. */
volatile bool sigchld_received;

/* True if any exit request signal has been received. Set once by
   sq_done_request(), which uses it to ignore every signal after the first. */
volatile bool loop_terminating = false;


/** Function prototypes (private) **/

// Forward declarations for the functions defined later in this file. Each
// one is a full prototype so the compiler checks argument lists at every
// call site; in particular sq_loop takes no arguments, which before C23 must
// be spelled "(void)" because an empty "()" declares unspecified parameters.
void sq_done_request(int signum);
int sq_loop(void);
void sq_mount(const char *img_path, char *mountpt);


/** Functions **/

/* Signal handler that asks FUSE to stop. On the first signal delivered it
   marks the loop as terminating, records whether that signal was SIGCHLD,
   and calls fuse_session_exit() on our session so that fuse_session_loop()
   returns. Any later signal is a no-op.

   signum: number of the signal being handled. */
void sq_done_request(int signum)
{
   // Only the first signal counts; everything after it is ignored.
   if (loop_terminating)
      return;

   loop_terminating = true;
   sigchld_received = (SIGCHLD == signum);
   fuse_session_exit(sq.chan->session);
}

/* Mount the SquashFS archive c->img_ref on directory c->newroot, then fork.

   If c->newroot is NULL, first build the default mount point path under
   /var/tmp (using the global username), store it in c->newroot, and create
   the directories with mkdirs().

   After the fork, the child returns to the caller right away. The parent
   never returns: it runs the FUSE loop via sq_loop() and calls exit(3) with
   whatever sq_loop() returns, i.e. the child's exit code unless something
   else went wrong.

   c: container description; c->newroot may be assigned here. */
void sq_fork(struct container *c)
{
   struct stat mount_st;
   pid_t child;

   // No mount point given? Fall back to the per-user default.
   if (c->newroot == NULL) {
      char *subdir = asprintf_ch("/%s.ch/mnt", username);
      c->newroot = cat("/var/tmp", subdir);
      VERBOSE("using default mount point: %s", c->newroot);
      mkdirs("/var/tmp", subdir, NULL, NULL);
   }

   // The mount point must exist and be a directory. (The SquashFS file path
   // itself was already checked in img_type_get().)
   Zfe (stat(c->newroot, &mount_st), "can't stat mount point: %s", c->newroot);
   Tf_ (S_ISDIR(mount_st.st_mode), "not a directory: %s", c->newroot);

   // Set PR_SET_NO_NEW_PRIVS before mounting so that fusermount3(1) is
   // actively prevented from running setuid, even if installed that way.
   Zfe (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), "can't set no_new_privs");
   sq_mount(c->img_ref, c->newroot);

   // The filesystem is mounted, so forking is now free of race conditions.
   // The child goes back to the caller to run the user command; when that
   // command exits, the parent receives SIGCHLD.
   child = fork_ch();
   if (child <= 0)
      return;  // child: nothing more to do here

   // Parent: serve FUSE requests until done, then exit with the result.
   exit(sq_loop());
}

/* Run the squash loop to completion and return the exit code of the user
   command. Warning: This sets up but does not restore signal handlers.

   Steps: unblock the signals we care about, install handlers (SIGCHLD and
   SIGTERM end the loop; SIGHUP, SIGINT, and SIGPIPE are ignored), run
   fuse_session_loop() until it returns, reap the child if the loop was ended
   by SIGCHLD, and finally tear down the SquashFUSE mount.

   Returns:
     - the child's exit status, if the loop ended on SIGCHLD and the child
       exited normally;
     - 128 + signal number, if the loop ended on SIGCHLD and the child was
       killed by a signal;
     - EXIT_ERR_SQUASH, if the loop ended for any other reason.

   Failures of the checked calls are reported through the project's Z__/Tfe
   macros (presumably fatal; see their definitions). */
int sq_loop(void)
{
   sigset_t mask;
   struct sigaction fin, ign;
   int looped, exit_code, child_status;

   // Unblock signals we wish to handle because we can't predict or assume mask
   // contents inherited from SLURM [0]. Use pthread_sigmask(3) instead of
   // sigprocmask(2) because it handles both single and multi-thread calling
   // process signal masks.
   //
   // [0]: https://www.gnu.org/software/libc/manual/html_node/Process-Signal-Mask.html
   Z__ (sigemptyset(&mask));
   Z__ (sigaddset(&mask, SIGCHLD));  // user command exits
   Z__ (sigaddset(&mask, SIGHUP));   // terminal/session terminated
   Z__ (sigaddset(&mask, SIGINT));   // Control-C
   Z__ (sigaddset(&mask, SIGPIPE));  // broken pipe; we don't use pipes
   Z__ (sigaddset(&mask, SIGTERM));  // somebody asked us to exit
   Z__ (pthread_sigmask(SIG_UNBLOCK, &mask, NULL));

   // Set up signal handlers. Avoid fuse_set_signal_handlers() because we need
   // to catch a different set of signals, letting some be handled by the user
   // command [1]. Use sigaction(2) instead of signal(2) because the latter's
   // man page [2] says “avoid its use” and there are reports of bad
   // interactions with libfuse [3].
   //
   // Both structs are automatic variables, so every field sigaction(2) reads
   // (handler, mask, flags) must be set explicitly; otherwise the call is
   // handed indeterminate values.
   //
   // [1]: https://unix.stackexchange.com/questions/176235
   // [2]: https://man7.org/linux/man-pages/man2/signal.2.html
   // [3]: https://stackoverflow.com/a/8918597
   fin.sa_handler = sq_done_request;
   Z__ (sigemptyset(&fin.sa_mask));  // block no other signals during handling
   fin.sa_flags = SA_NOCLDSTOP;     // only SIGCHLD on child exit
   ign.sa_handler = SIG_IGN;
   Z__ (sigemptyset(&ign.sa_mask));  // no handler runs, but don't pass garbage
   ign.sa_flags = 0;                // no special behavior for ignored signals
   Z__ (sigaction(SIGCHLD, &fin, NULL));
   Z__ (sigaction(SIGHUP,  &ign, NULL));
   Z__ (sigaction(SIGINT,  &ign, NULL));
   Z__ (sigaction(SIGPIPE, &ign, NULL));
   Z__ (sigaction(SIGTERM, &fin, NULL));

   // Run the FUSE loop, which services FUSE requests until sq_done_request()
   // is invoked by a signal and tells it to stop, or someone unmounts the
   // filesystem externally with e.g. fusermount(1). Because we don't use
   // fuse_set_signal_handlers(), the return value doesn't contain the signal
   // number that ended the loop, contrary to the documentation.
   //
   // FIXME: this is single-threaded; see issue #1157.
   looped = fuse_session_loop(sq.chan->session);
   if (looped < 0) {
      errno = -looped;  // restore encoded errno so our logging finds it
      Tfe (0, "FUSE session failed");
   }
   VERBOSE("FUSE loop terminated successfully");

   // Clean up zombie child if exit signal was SIGCHLD. Otherwise there is no
   // child status to report, so use the generic SquashFS error code.
   if (!sigchld_received)
      exit_code = EXIT_ERR_SQUASH;
   else {
      Tfe (wait(&child_status) >= 0, "can't wait for child");
      if (WIFEXITED(child_status)) {
         exit_code = WEXITSTATUS(child_status);
         VERBOSE("child terminated normally with exit code %d", exit_code);
      } else {
         // We now know that the child did not exit normally; the two
         // remaining options are (a) killed by signal and (b) stopped [1].
         // Because we didn't call waitpid(2) with WUNTRACED, we don't get
         // notified if the child is stopped [2], so it must have been
         // signaled, and we need not call WIFSIGNALED().
         //
         // [1]: https://codereview.stackexchange.com/a/109349
         // [2]: https://man7.org/linux/man-pages/man2/wait.2.html
         exit_code = 128 + WTERMSIG(child_status);
         WARNING("child terminated by signal %d", WTERMSIG(child_status));
      }
   }

   // Clean up SquashFS mount. These functions have no error reporting.
   VERBOSE("unmounting: %s", sq.mountpt);
   sqfs_ll_destroy(sq.ll);
   sqfs_ll_unmount(sq.chan, sq.mountpt);

   VERBOSE("FUSE loop done");
   return exit_code;
}

/* Mount the SquashFS img_path at mountpt. Exit on any errors. */
void sq_mount(const char *img_path, char *mountpt)
{
   // SquashFUSE mount takes basically a command line rather than having a
   // standard library API. It's unclear to me where this command line is
   // documented, but the libfuse docs [1] suggest mount(8).
   // [1]: https://libfuse.github.io/doxygen/fuse-3_810_83_2include_2fuse_8h.html#ad866b0fd4d81bdbf3e737f7273ba4520
   char *mount_argv[] = {"WEIRDAL", "-d"};
   int mount_argc = (verbose > 3) ? 2 : 1;  // include -d if high verbosity
   struct fuse_args mount_args = FUSE_ARGS_INIT(mount_argc, mount_argv);

   sq.mountpt = mountpt;
   sq.chan = malloc_ch(sizeof(sqfs_ll_chan), true);

   sq.ll = sqfs_ll_open(img_path, 0);
   Tf_ (sq.ll != NULL, "can't open SquashFS: %s; try ch-run -vv?", img_path);

   // sqfs_ll_mount() is squirrely for a couple reasons:
   //
   //   1. Error reporting. We get back only SQFS_OK or SQFS_ERR, with no
   //      further detail. Looking at the source code [1], the latter says
   //      either fuse_session_new() or fuse_session_mount() failed, but we
   //      can't tell which, or get any further information about what went
   //      wrong. Hopefully fusermount3 also printed an error message.
   //
   //   2. Race condition. We have been seeing intermittent errors in the test
   //      suite about permission denied accessing the mount point (issue
   //      #1364). I *think* this is because a previous mount on the same
   //      location is not yet cleaned up. For this reason, we have a short
   //      retry loop.
   //
   // [1]: https://github.com/vasi/squashfuse/blob/74f4fe8/ll.c#L399
   for (int i = 5; true; i--)
      if (SQFS_OK == sqfs_ll_mount(sq.chan, sq.mountpt, &mount_args,
                                   &OPS, sizeof(OPS), sq.ll)) {
         break;  // success
      } else if (i <= 0) {
         FATAL(1, "too many FUSE errors; giving up");
      } else {
         WARNING("FUSE error mounting SquashFS; will retry");
         sleep(1);
      }
}