diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 29a32a1c3c..a9c076085a 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -14,6 +14,8 @@ import ( "slices" "strconv" "strings" + "sync" + "syscall" "time" metadata "github.com/checkpoint-restore/checkpointctl/lib" @@ -2362,6 +2364,75 @@ func (c *Container) setupOCIHooks(ctx context.Context, config *spec.Spec) (map[s return allHooks, nil } +// getRootPathForOCI returns the root path to use for the OCI runtime. +// If the current user is mapped in the container user namespace, then it returns +// the container's mountpoint directly from the storage. +// Otherwise, it returns an intermediate mountpoint that is accessible to anyone. +func (c *Container) getRootPathForOCI() (string, error) { + if hasCurrentUserMapped(c) { + return c.state.Mountpoint, nil + } + return c.getIntermediateMountpointUser() +} + +var ( + intermediateMountPoint string + intermediateMountPointErr error + intermediateMountPointSync sync.Mutex +) + +// getIntermediateMountpointUser returns a path that is accessible to everyone. It must be on TMPDIR since +// the runroot/tmpdir used by libpod are accessible only to the owner. +// To avoid TOCTOU issues, the path must be owned by the current user's UID and GID. +// The path can be used by different containers, so a mount must be created only in a private mount namespace. +func (c *Container) recreateIntermediateMountpointUser() (string, error) { + uid := os.Geteuid() + gid := os.Getegid() + for i := 0; ; i++ { + tmpDir := os.Getenv("TMPDIR") + if tmpDir == "" { + tmpDir = "/tmp" + } + dir := filepath.Join(tmpDir, fmt.Sprintf("intermediate-mountpoint-%d.%d", rootless.GetRootlessUID(), i)) + err := os.Mkdir(dir, 0755) + if err != nil { + if !errors.Is(err, os.ErrExist) { + return "", err + } + st, err2 := os.Stat(dir) + if err2 != nil { + return "", err + } + sys := st.Sys().(*syscall.Stat_t) + if !st.IsDir() || sys.Uid != uint32(uid) || sys.Gid != uint32(gid) { + continue + } + } + return dir, nil + } +} + +// getIntermediateMountpointUser returns a path that is accessible to everyone. +// To avoid TOCTOU issues, the path must be owned by the current user's UID and GID. +// The path can be used by different containers, so a mount must be created only in a private mount namespace. +func (c *Container) getIntermediateMountpointUser() (string, error) { + intermediateMountPointSync.Lock() + defer intermediateMountPointSync.Unlock() + + if intermediateMountPoint == "" || fileutils.Exists(intermediateMountPoint) != nil { + return c.recreateIntermediateMountpointUser() + } + + // update the timestamp to prevent systemd-tmpfiles from removing it + now := time.Now() + if err := os.Chtimes(intermediateMountPoint, now, now); err != nil { + if !errors.Is(err, os.ErrNotExist) { + return c.recreateIntermediateMountpointUser() + } + } + return intermediateMountPoint, intermediateMountPointErr +} + // mount mounts the container's root filesystem func (c *Container) mount() (string, error) { if c.state.State == define.ContainerStateRemoving { diff --git a/libpod/container_internal_common.go b/libpod/container_internal_common.go index 216074e130..f6500c6362 100644 --- a/libpod/container_internal_common.go +++ b/libpod/container_internal_common.go @@ -565,7 +565,11 @@ func (c *Container) generateSpec(ctx context.Context) (s *spec.Spec, cleanupFunc return nil, nil, err } - g.SetRootPath(c.state.Mountpoint) + rootPath, err := c.getRootPathForOCI() + if err != nil { + return nil, nil, err + } + g.SetRootPath(rootPath) g.AddAnnotation("org.opencontainers.image.stopSignal", strconv.FormatUint(uint64(c.config.StopSignal), 10)) if _, exists := g.Config.Annotations[annotations.ContainerManager]; !exists { diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index ff0e790860..c32fba46e2 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -184,15 +184,10 @@ func hasCurrentUserMapped(ctr *Container) bool { // CreateContainer creates a container. func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { if !hasCurrentUserMapped(ctr) { - if err := makeAccessible(ctr.state.Mountpoint, ctr.RootUID(), ctr.RootGID()); err != nil { - return 0, err - } - // if we are running a non privileged container, be sure to umount some kernel paths so they are not // bind mounted inside the container at all. - if !ctr.config.Privileged && !rootless.IsRootless() { - return r.createRootlessContainer(ctr, restoreOptions) - } + hideFiles := !ctr.config.Privileged && !rootless.IsRootless() + return r.createRootlessContainer(ctr, restoreOptions, hideFiles) } return r.createOCIContainer(ctr, restoreOptions) } @@ -972,28 +967,6 @@ func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntime return &conmon, &ocirt, nil } -// makeAccessible changes the path permission and each parent directory to have --x--x--x -func makeAccessible(path string, uid, gid int) error { - for ; path != "/"; path = filepath.Dir(path) { - st, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - return nil - } - return err - } - if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { - continue - } - if st.Mode()&0111 != 0111 { - if err := os.Chmod(path, st.Mode()|0111); err != nil { - return err - } - } - } - return nil -} - // Wait for a container which has been sent a signal to stop func waitContainerStop(ctr *Container, timeout time.Duration) error { return waitPidStop(ctr.state.PID, timeout) diff --git a/libpod/oci_conmon_freebsd.go b/libpod/oci_conmon_freebsd.go index 5f113f5cba..f681f785a1 100644 --- a/libpod/oci_conmon_freebsd.go +++ b/libpod/oci_conmon_freebsd.go @@ -8,7 +8,7 @@ import ( "os/exec" ) -func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions, hideFiles bool) (int64, error) { return -1, errors.New("unsupported (*ConmonOCIRuntime) createRootlessContainer") } diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index 91090bba2f..05dc65d360 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -3,7 +3,9 @@ package libpod import ( + "errors" "fmt" + "io/fs" "os" "os/exec" "path/filepath" @@ -25,7 +27,7 @@ import ( "golang.org/x/sys/unix" ) -func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions, hideFiles bool) (int64, error) { type result struct { restoreDuration int64 err error @@ -40,6 +42,11 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption } defer errorhandling.CloseQuiet(fd) + rootPath, err := ctr.getRootPathForOCI() + if err != nil { + return 0, err + } + // create a new mountns on the current thread if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { return 0, err @@ -54,26 +61,69 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption logrus.Errorf("Unable to reset the previous mount namespace: %q", err) } }() - - // don't spread our mounts around. We are setting only /sys to be slave - // so that the cleanup process is still able to umount the storage and the - // changes are propagated to the host. - err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") - if err != nil { - return 0, fmt.Errorf("cannot make /sys slave: %w", err) - } - mounts, err := pmount.GetMounts() if err != nil { return 0, err } - for _, m := range mounts { - if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { - continue + if rootPath != "" { + byMountpoint := make(map[string]*pmount.Info) + for _, m := range mounts { + byMountpoint[m.Mountpoint] = m } - err = unix.Unmount(m.Mountpoint, 0) - if err != nil && !os.IsNotExist(err) { - return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) + isShared := false + var parentMount string + for dir := filepath.Dir(rootPath); ; dir = filepath.Dir(dir) { + if m, found := byMountpoint[dir]; found { + parentMount = dir + for _, o := range strings.Split(m.Optional, ",") { + opt := strings.Split(o, ":") + if opt[0] == "shared" { + isShared = true + break + } + } + break + } + if dir == "/" { + return 0, fmt.Errorf("cannot find mountpoint for the root path") + } + } + + // do not propagate the bind mount on the parent mount namespace + if err := unix.Mount("", parentMount, "", unix.MS_SLAVE, ""); err != nil { + return 0, fmt.Errorf("failed to make %s slave: %w", parentMount, err) + } + + // bind mount the containers' mount path to the path where the OCI runtime expects it to be + if err := unix.Mount(ctr.state.Mountpoint, rootPath, "", unix.MS_BIND, ""); err != nil { + return 0, fmt.Errorf("failed to bind mount %s to %s: %w", ctr.state.Mountpoint, rootPath, err) + } + + if isShared { + // we need to restore the shared propagation of the parent mount so that we don't break -v $SRC:$DST:shared in the container + // if $SRC is on the same mount as the root path + if err := unix.Mount("", parentMount, "", unix.MS_SHARED, ""); err != nil { + return 0, fmt.Errorf("failed to restore MS_SHARED propagation for %s: %w", parentMount, err) + } + } + } + + if hideFiles { + // don't spread our mounts around. We are setting only /sys to be slave + // so that the cleanup process is still able to umount the storage and the + // changes are propagated to the host. + err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") + if err != nil { + return 0, fmt.Errorf("cannot make /sys slave: %w", err) + } + for _, m := range mounts { + if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { + continue + } + err = unix.Unmount(m.Mountpoint, 0) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) + } } } return r.createOCIContainer(ctr, restoreOptions)