@@ -174,9 +174,9 @@ char **bind_mount_paths = NULL;
174174/** Function prototypes (private) **/
175175
/* Bind-mount src at dst under newroot. dep selects what happens when the
   destination is missing; scratch is forwarded to mkdirs() in the
   BD_MAKE_DST case and may be NULL. */
void bind_mount(const char *src, const char *dst, enum bind_dep dep,
                const char *newroot, unsigned long flags,
                const char *scratch);
/* Apply bind_mount() to each element of binds, passing newroot, flags, and
   scratch through unchanged. */
void bind_mounts(const struct bind *binds, const char *newroot,
                 unsigned long flags, const char *scratch);
180180void enter_udss (struct container * c );
181181#ifdef HAVE_SECCOMP
182182void iw (struct sock_fprog * p , int i ,
@@ -197,7 +197,7 @@ void tmpfs_mount(const char *dst, const char *newroot, const char *data);
197197
198198/* Bind-mount the given path into the container image. */
199199void bind_mount (const char * src , const char * dst , enum bind_dep dep ,
200- const char * newroot , unsigned long flags )
200+ const char * newroot , unsigned long flags , const char * scratch )
201201{
202202 char * dst_fullc , * newrootc ;
203203 char * dst_full = cat (newroot , dst );
@@ -218,7 +218,7 @@ void bind_mount(const char *src, const char *dst, enum bind_dep dep,
218218 case BD_OPTIONAL :
219219 return ;
220220 case BD_MAKE_DST :
221- mkdirs (newroot , dst , bind_mount_paths );
221+ mkdirs (newroot , dst , bind_mount_paths , scratch );
222222 break ;
223223 }
224224
@@ -235,10 +235,11 @@ void bind_mount(const char *src, const char *dst, enum bind_dep dep,
235235
236236/* Bind-mount a null-terminated array of struct bind objects. */
237237void bind_mounts (const struct bind * binds , const char * newroot ,
238- unsigned long flags )
238+ unsigned long flags , const char * scratch )
239239{
240240 for (int i = 0 ; binds [i ].src != NULL ; i ++ )
241- bind_mount (binds [i ].src , binds [i ].dst , binds [i ].dep , newroot , flags );
241+ bind_mount (binds [i ].src , binds [i ].dst , binds [i ].dep ,
242+ newroot , flags , scratch );
242243}
243244
244245/* Set up new namespaces or join existing namespaces. */
@@ -269,67 +270,93 @@ void containerize(struct container *c)
269270
270271}
271272
272- /* Enter the UDSS. After this, we are inside the UDSS.
273+ /* Enter the new root (UDSS). On entry, the namespaces are set up, and this
274+ does the mounting and filesystem setup.
273275
274276 Note that pivot_root(2) requires a complex dance to work, i.e., to avoid
275277 multiple undocumented error conditions. This dance is explained in detail
276278 in bin/ch-checkns.c. */
277279void enter_udss (struct container * c )
278280{
279- char * newroot_parent , * newroot_base ;
281+ char * nr_parent , * nr_base , * mkdir_scratch ;
280282
281283 LOG_IDS ;
282-
283- path_split (c -> newroot , & newroot_parent , & newroot_base );
284-
285- // Claim new root for this namespace. We do need both calls to avoid
286- // pivot_root(2) failing with EBUSY later.
287- bind_mount (c -> newroot , c -> newroot , BD_REQUIRED , "/" , MS_PRIVATE );
288- bind_mount (newroot_parent , newroot_parent , BD_REQUIRED , "/" , MS_PRIVATE );
284+ mkdir_scratch = NULL ;
285+ path_split (c -> newroot , & nr_parent , & nr_base );
286+
287+ // Claim new root for this namespace. Despite MS_REC in bind_mount(), we do
288+ // need both calls to avoid pivot_root(2) failing with EBUSY later.
289+ DEBUG ("claiming new root for this namespace" )
290+ bind_mount (c -> newroot , c -> newroot , BD_REQUIRED , "/" , MS_PRIVATE , NULL );
291+ bind_mount (nr_parent , nr_parent , BD_REQUIRED , "/" , MS_PRIVATE , NULL );
292+ // Re-mount new root read-only unless --write or already read-only.
293+ if (!c -> writable && !(access (c -> newroot , W_OK ) == -1 && errno == EROFS )) {
294+ unsigned long flags = path_mount_flags (c -> newroot )
295+ | MS_REMOUNT // Re-mount ...
296+ | MS_BIND // only this mount point ...
297+ | MS_RDONLY ; // read-only.
298+ Z_ (mount (NULL , c -> newroot , NULL , flags , NULL ));
299+ }
300+ // Overlay a tmpfs if --write-fake. See for useful details:
301+ // https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html
302+ // https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html
303+ if (c -> overlay_size != NULL ) {
304+ VERBOSE ("overlaying tmpfs for --write-fake (%s)" , c -> overlay_size );
305+ char * options ;
306+ T_ (1 <= asprintf (& options , "size=%s" , c -> overlay_size ));
307+ Zf (mount (NULL , "/mnt" , "tmpfs" , 0 , options ), // host should have /mnt
308+ "cannot mount tmpfs for overlay" );
309+ free (options );
310+ Z_ (mkdir ("/mnt/upper" , 0700 ));
311+ Z_ (mkdir ("/mnt/work" , 0700 ));
312+ Z_ (mkdir ("/mnt/merged" , 0700 ));
313+ mkdir_scratch = "/mnt/mkdir_overmount" ;
314+ Z_ (mkdir (mkdir_scratch , 0700 ));
315+ T_ (1 <= asprintf (& options , "lowerdir=%s,upperdir=%s,workdir=%s,"
316+ "index=on,userxattr,volatile" ,
317+ c -> newroot , "/mnt/upper" , "/mnt/work" ));
318+ // update newroot
319+ c -> newroot = "/mnt/merged" ;
320+ free (nr_parent );
321+ free (nr_base );
322+ path_split (c -> newroot , & nr_parent , & nr_base );
323+ Zf (mount (NULL , c -> newroot , "overlay" , 0 , options ), "can't overlay" );
324+ VERBOSE ("newroot updated: %s" , c -> newroot );
325+ free (options );
326+ }
327+ DEBUG ("starting bind-mounts" );
289328 // Bind-mount default files and directories.
290- bind_mounts (BINDS_DEFAULT , c -> newroot , MS_RDONLY );
329+ bind_mounts (BINDS_DEFAULT , c -> newroot , MS_RDONLY , NULL );
291330 // /etc/passwd and /etc/group.
292331 if (!c -> private_passwd )
293332 setup_passwd (c );
294333 // Container /tmp.
295334 if (c -> private_tmp ) {
296335 tmpfs_mount ("/tmp" , c -> newroot , NULL );
297336 } else {
298- bind_mount (host_tmp , "/tmp" , BD_REQUIRED , c -> newroot , 0 );
337+ bind_mount (host_tmp , "/tmp" , BD_REQUIRED , c -> newroot , 0 , NULL );
299338 }
300- // Container /home.
339+ // Bind-mount user’s home directory at /home/$USER if requested .
301340 if (c -> host_home ) {
302- char * newhome ;
303- // Mount tmpfs on guest /home because guest root may be read-only.
304- tmpfs_mount ("/home" , c -> newroot , "size=4m" );
305- // Bind-mount user's home directory at /home/$USER.
306- newhome = cat ("/home/" , username );
307- Z_ (mkdir (cat (c -> newroot , newhome ), 0755 ));
308- bind_mount (c -> host_home , newhome , BD_REQUIRED , c -> newroot , 0 );
309- }
310- // Re-mount new root read-only unless --write or already read-only.
311- if (!c -> writable && !(access (c -> newroot , W_OK ) == -1 && errno == EROFS )) {
312- unsigned long flags = path_mount_flags (c -> newroot )
313- | MS_REMOUNT // Re-mount ...
314- | MS_BIND // only this mount point ...
315- | MS_RDONLY ; // read-only.
316- Zf (mount (NULL , c -> newroot , NULL , flags , NULL ),
317- "can't re-mount image read-only (is it on NFS?)" );
341+ T_ (c -> overlay_size != NULL );
342+ bind_mount (c -> host_home , cat ("/home/" , username ),
343+ BD_MAKE_DST , c -> newroot , 0 , mkdir_scratch );
318344 }
319345 // Bind-mount user-specified directories.
320- bind_mounts (c -> binds , c -> newroot , 0 );
321- // Overmount / to avoid EINVAL if it' s a rootfs.
322- Z_ (chdir (newroot_parent ));
323- Z_ (mount (newroot_parent , "/" , NULL , MS_MOVE , NULL ));
346+ bind_mounts (c -> binds , c -> newroot , 0 , mkdir_scratch );
347+ // Overmount / to avoid EINVAL if it’ s a rootfs.
348+ Z_ (chdir (nr_parent ));
349+ Z_ (mount (nr_parent , "/" , NULL , MS_MOVE , NULL ));
324350 Z_ (chroot ("." ));
325- c -> newroot = cat ("/" , newroot_base );
326- // Pivot into the new root. Use /dev because it's available even in
351+ // Pivot into the new root. Use /dev because it’s available even in
327352 // extremely minimal images.
353+ c -> newroot = cat ("/" , nr_base );
328354 Zf (chdir (c -> newroot ), "can't chdir into new root" );
329- Zf (syscall (SYS_pivot_root , c -> newroot , cat (c -> newroot , "/ dev" )),
355+ Zf (syscall (SYS_pivot_root , c -> newroot , path_join (c -> newroot , "dev" )),
330356 "can't pivot_root(2)" );
331357 Zf (chroot ("." ), "can't chroot(2) into new root" );
332358 Zf (umount2 ("/dev" , MNT_DETACH ), "can't umount old root" );
359+ DEBUG ("pivot_root(2) dance successful" )
333360}
334361
335362/* Return image type of path, or exit with error if not a valid type. */
@@ -700,7 +727,7 @@ void setup_passwd(const struct container *c)
700727 }
701728 }
702729 Z_ (close (fd ));
703- bind_mount (path , "/etc/passwd" , BD_REQUIRED , c -> newroot , 0 );
730+ bind_mount (path , "/etc/passwd" , BD_REQUIRED , c -> newroot , 0 , NULL );
704731 Z_ (unlink (path ));
705732
706733 // /etc/group
@@ -723,7 +750,7 @@ void setup_passwd(const struct container *c)
723750 }
724751 }
725752 Z_ (close (fd ));
726- bind_mount (path , "/etc/group" , BD_REQUIRED , c -> newroot , 0 );
753+ bind_mount (path , "/etc/group" , BD_REQUIRED , c -> newroot , 0 , NULL );
727754 Z_ (unlink (path ));
728755}
729756
0 commit comments