Skip to content
This repository was archived by the owner on May 27, 2025. It is now read-only.

Commit 63f77b1

Browse files
authored
PR #1793: add --write-fake via unprivileged overlayfs
1 parent db386b9 commit 63f77b1

File tree

15 files changed

+621
-191
lines changed

15 files changed

+621
-191
lines changed

bin/ch-run.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ char *JOIN_CT_ENV[] = { "OMPI_COMM_WORLD_LOCAL_SIZE",
2727
char *JOIN_TAG_ENV[] = { "SLURM_STEP_ID",
2828
NULL };
2929

30+
/* Default overlaid tmpfs size. */
31+
char *WRITE_FAKE_DEFAULT = "12%";
32+
3033

3134
/** Command line options **/
3235

@@ -75,7 +78,9 @@ const struct argp_option options[] = {
7578
{ "verbose", 'v', 0, 0, "be more verbose (can be repeated)" },
7679
{ "version", 'V', 0, 0, "print version and exit" },
7780
{ "warnings", -16, "NUM", 0, "log NUM warnings and exit" },
78-
{ "write", 'w', 0, 0, "mount image read-write"},
81+
{ "write", 'w', 0, 0, "mount image read-write (avoid)"},
82+
{ "write-fake", 'W', "SIZE", OPTION_ARG_OPTIONAL,
83+
"overlay read-write tmpfs on top of image" },
7984
{ 0 }
8085
};
8186

@@ -155,6 +160,7 @@ int main(int argc, char *argv[])
155160
.join_ct = 0,
156161
.join_pid = 0,
157162
.join_tag = NULL,
163+
.overlay_size = NULL,
158164
.private_passwd = false,
159165
.private_tmp = false,
160166
.type = IMG_NONE,
@@ -173,7 +179,7 @@ int main(int argc, char *argv[])
173179
argp_help_fmt_set = true;
174180
else {
175181
argp_help_fmt_set = false;
176-
Z_ (setenv("ARGP_HELP_FMT", "opt-doc-col=25,no-dup-args-note", 0));
182+
Z_ (setenv("ARGP_HELP_FMT", "opt-doc-col=27,no-dup-args-note", 0));
177183
}
178184
Z_ (argp_parse(&argp, argc, argv, 0, &arg_next, &args));
179185
if (!argp_help_fmt_set)
@@ -457,6 +463,10 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state)
457463
break;
458464
case -12: // --home
459465
Tf (args->c.host_home = getenv("HOME"), "--home failed: $HOME not set");
466+
if (args->c.overlay_size == NULL) {
467+
VERBOSE("--home specified; also setting --write-fake");
468+
args->c.overlay_size = WRITE_FAKE_DEFAULT;
469+
}
460470
break;
461471
case -13: // --unsafe
462472
args->unsafe = true;
@@ -534,6 +544,9 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state)
534544
case 'w': // --write
535545
args->c.writable = true;
536546
break;
547+
case 'W': // --write-fake
548+
args->c.overlay_size = arg != NULL ? arg : WRITE_FAKE_DEFAULT;
549+
break;
537550
case ARGP_KEY_NO_ARGS:
538551
argp_state_help(state, stderr, ( ARGP_HELP_SHORT_USAGE
539552
| ARGP_HELP_PRE_DOC

bin/ch_core.c

Lines changed: 70 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,9 @@ char **bind_mount_paths = NULL;
174174
/** Function prototypes (private) **/
175175

176176
void bind_mount(const char *src, const char *dst, enum bind_dep,
177-
const char *newroot, unsigned long flags);
177+
const char *newroot, unsigned long flags, const char *scratch);
178178
void bind_mounts(const struct bind *binds, const char *newroot,
179-
unsigned long flags);
179+
unsigned long flags, const char * scratch);
180180
void enter_udss(struct container *c);
181181
#ifdef HAVE_SECCOMP
182182
void iw(struct sock_fprog *p, int i,
@@ -197,7 +197,7 @@ void tmpfs_mount(const char *dst, const char *newroot, const char *data);
197197

198198
/* Bind-mount the given path into the container image. */
199199
void bind_mount(const char *src, const char *dst, enum bind_dep dep,
200-
const char *newroot, unsigned long flags)
200+
const char *newroot, unsigned long flags, const char *scratch)
201201
{
202202
char *dst_fullc, *newrootc;
203203
char *dst_full = cat(newroot, dst);
@@ -218,7 +218,7 @@ void bind_mount(const char *src, const char *dst, enum bind_dep dep,
218218
case BD_OPTIONAL:
219219
return;
220220
case BD_MAKE_DST:
221-
mkdirs(newroot, dst, bind_mount_paths);
221+
mkdirs(newroot, dst, bind_mount_paths, scratch);
222222
break;
223223
}
224224

@@ -235,10 +235,11 @@ void bind_mount(const char *src, const char *dst, enum bind_dep dep,
235235

236236
/* Bind-mount a null-terminated array of struct bind objects. */
237237
void bind_mounts(const struct bind *binds, const char *newroot,
238-
unsigned long flags)
238+
unsigned long flags, const char * scratch)
239239
{
240240
for (int i = 0; binds[i].src != NULL; i++)
241-
bind_mount(binds[i].src, binds[i].dst, binds[i].dep, newroot, flags);
241+
bind_mount(binds[i].src, binds[i].dst, binds[i].dep,
242+
newroot, flags, scratch);
242243
}
243244

244245
/* Set up new namespaces or join existing namespaces. */
@@ -269,67 +270,93 @@ void containerize(struct container *c)
269270

270271
}
271272

272-
/* Enter the UDSS. After this, we are inside the UDSS.
273+
/* Enter the new root (UDSS). On entry, the namespaces are set up, and this
274+
does the mounting and filesystem setup.
273275
274276
Note that pivot_root(2) requires a complex dance to work, i.e., to avoid
275277
multiple undocumented error conditions. This dance is explained in detail
276278
in bin/ch-checkns.c. */
277279
void enter_udss(struct container *c)
278280
{
279-
char *newroot_parent, *newroot_base;
281+
char *nr_parent, *nr_base, *mkdir_scratch;
280282

281283
LOG_IDS;
282-
283-
path_split(c->newroot, &newroot_parent, &newroot_base);
284-
285-
// Claim new root for this namespace. We do need both calls to avoid
286-
// pivot_root(2) failing with EBUSY later.
287-
bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE);
288-
bind_mount(newroot_parent, newroot_parent, BD_REQUIRED, "/", MS_PRIVATE);
284+
mkdir_scratch = NULL;
285+
path_split(c->newroot, &nr_parent, &nr_base);
286+
287+
// Claim new root for this namespace. Despite MS_REC in bind_mount(), we do
288+
// need both calls to avoid pivot_root(2) failing with EBUSY later.
289+
DEBUG("claiming new root for this namespace")
290+
bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE, NULL);
291+
bind_mount(nr_parent, nr_parent, BD_REQUIRED, "/", MS_PRIVATE, NULL);
292+
// Re-mount new root read-only unless --write or already read-only.
293+
if (!c->writable && !(access(c->newroot, W_OK) == -1 && errno == EROFS)) {
294+
unsigned long flags = path_mount_flags(c->newroot)
295+
| MS_REMOUNT // Re-mount ...
296+
| MS_BIND // only this mount point ...
297+
| MS_RDONLY; // read-only.
298+
Z_ (mount(NULL, c->newroot, NULL, flags, NULL));
299+
}
300+
// Overlay a tmpfs if --write-fake. For useful details, see:
301+
// https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html
302+
// https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html
303+
if (c->overlay_size != NULL) {
304+
VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size);
305+
char *options;
306+
T_ (1 <= asprintf(&options, "size=%s", c->overlay_size));
307+
Zf (mount(NULL, "/mnt", "tmpfs", 0, options), // host should have /mnt
308+
"cannot mount tmpfs for overlay");
309+
free(options);
310+
Z_ (mkdir("/mnt/upper", 0700));
311+
Z_ (mkdir("/mnt/work", 0700));
312+
Z_ (mkdir("/mnt/merged", 0700));
313+
mkdir_scratch = "/mnt/mkdir_overmount";
314+
Z_ (mkdir(mkdir_scratch, 0700));
315+
T_ (1 <= asprintf(&options, "lowerdir=%s,upperdir=%s,workdir=%s,"
316+
"index=on,userxattr,volatile",
317+
c->newroot, "/mnt/upper", "/mnt/work"));
318+
// update newroot
319+
c->newroot = "/mnt/merged";
320+
free(nr_parent);
321+
free(nr_base);
322+
path_split(c->newroot, &nr_parent, &nr_base);
323+
Zf (mount(NULL, c->newroot, "overlay", 0, options), "can't overlay");
324+
VERBOSE("newroot updated: %s", c->newroot);
325+
free(options);
326+
}
327+
DEBUG("starting bind-mounts");
289328
// Bind-mount default files and directories.
290-
bind_mounts(BINDS_DEFAULT, c->newroot, MS_RDONLY);
329+
bind_mounts(BINDS_DEFAULT, c->newroot, MS_RDONLY, NULL);
291330
// /etc/passwd and /etc/group.
292331
if (!c->private_passwd)
293332
setup_passwd(c);
294333
// Container /tmp.
295334
if (c->private_tmp) {
296335
tmpfs_mount("/tmp", c->newroot, NULL);
297336
} else {
298-
bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0);
337+
bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0, NULL);
299338
}
300-
// Container /home.
339+
// Bind-mount user’s home directory at /home/$USER if requested.
301340
if (c->host_home) {
302-
char *newhome;
303-
// Mount tmpfs on guest /home because guest root may be read-only.
304-
tmpfs_mount("/home", c->newroot, "size=4m");
305-
// Bind-mount user's home directory at /home/$USER.
306-
newhome = cat("/home/", username);
307-
Z_ (mkdir(cat(c->newroot, newhome), 0755));
308-
bind_mount(c->host_home, newhome, BD_REQUIRED, c->newroot, 0);
309-
}
310-
// Re-mount new root read-only unless --write or already read-only.
311-
if (!c->writable && !(access(c->newroot, W_OK) == -1 && errno == EROFS)) {
312-
unsigned long flags = path_mount_flags(c->newroot)
313-
| MS_REMOUNT // Re-mount ...
314-
| MS_BIND // only this mount point ...
315-
| MS_RDONLY; // read-only.
316-
Zf (mount(NULL, c->newroot, NULL, flags, NULL),
317-
"can't re-mount image read-only (is it on NFS?)");
341+
T_ (c->overlay_size != NULL);
342+
bind_mount(c->host_home, cat("/home/", username),
343+
BD_MAKE_DST, c->newroot, 0, mkdir_scratch);
318344
}
319345
// Bind-mount user-specified directories.
320-
bind_mounts(c->binds, c->newroot, 0);
321-
// Overmount / to avoid EINVAL if it's a rootfs.
322-
Z_ (chdir(newroot_parent));
323-
Z_ (mount(newroot_parent, "/", NULL, MS_MOVE, NULL));
346+
bind_mounts(c->binds, c->newroot, 0, mkdir_scratch);
347+
// Overmount / to avoid EINVAL if it’s a rootfs.
348+
Z_ (chdir(nr_parent));
349+
Z_ (mount(nr_parent, "/", NULL, MS_MOVE, NULL));
324350
Z_ (chroot("."));
325-
c->newroot = cat("/", newroot_base);
326-
// Pivot into the new root. Use /dev because it's available even in
351+
// Pivot into the new root. Use /dev because it’s available even in
327352
// extremely minimal images.
353+
c->newroot = cat("/", nr_base);
328354
Zf (chdir(c->newroot), "can't chdir into new root");
329-
Zf (syscall(SYS_pivot_root, c->newroot, cat(c->newroot, "/dev")),
355+
Zf (syscall(SYS_pivot_root, c->newroot, path_join(c->newroot, "dev")),
330356
"can't pivot_root(2)");
331357
Zf (chroot("."), "can't chroot(2) into new root");
332358
Zf (umount2("/dev", MNT_DETACH), "can't umount old root");
359+
DEBUG("pivot_root(2) dance successful")
333360
}
334361

335362
/* Return image type of path, or exit with error if not a valid type. */
@@ -700,7 +727,7 @@ void setup_passwd(const struct container *c)
700727
}
701728
}
702729
Z_ (close(fd));
703-
bind_mount(path, "/etc/passwd", BD_REQUIRED, c->newroot, 0);
730+
bind_mount(path, "/etc/passwd", BD_REQUIRED, c->newroot, 0, NULL);
704731
Z_ (unlink(path));
705732

706733
// /etc/group
@@ -723,7 +750,7 @@ void setup_passwd(const struct container *c)
723750
}
724751
}
725752
Z_ (close(fd));
726-
bind_mount(path, "/etc/group", BD_REQUIRED, c->newroot, 0);
753+
bind_mount(path, "/etc/group", BD_REQUIRED, c->newroot, 0, NULL);
727754
Z_ (unlink(path));
728755
}
729756

bin/ch_core.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ struct container {
3939
int join_ct; // number of peers in a synchronized join
4040
pid_t join_pid; // process in existing namespace to join
4141
char *join_tag; // identifier for synchronized join
42+
char *overlay_size; // size of overlaid tmpfs (NULL for no overlay)
4243
bool private_passwd; // don't bind custom /etc/{passwd,group}
4344
bool private_tmp; // don't bind host's /tmp
4445
enum img_type type; // directory, SquashFS, etc.

bin/ch_fuse.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ void sq_fork(struct container *c)
120120
T_ (asprintf(&subdir, "/%s.ch/mnt", username) > 0);
121121
c->newroot = cat("/var/tmp", subdir);
122122
VERBOSE("using default mount point: %s", c->newroot);
123-
mkdirs("/var/tmp", subdir, NULL);
123+
mkdirs("/var/tmp", subdir, NULL, NULL);
124124
}
125125

126126
// Verify mount point exists and is a directory. (SquashFS file path

0 commit comments

Comments
 (0)