From 49eb5af3013ef64bd017a1fd545c802e3eb230a1 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 19 Jun 2024 23:49:37 +0200 Subject: [PATCH] libpod: intermediate mount if UID not mapped into the userns if the current user is not mapped into the new user namespace, use an intermediate mount to allow the mount point to be accessible instead of opening up all the parent directories for the mountpoint. Closes: https://github.com/containers/podman/issues/23028 Signed-off-by: Giuseppe Scrivano --- libpod/container_internal.go | 71 +++++++++++++++++++++++++ libpod/container_internal_common.go | 6 ++- libpod/oci_conmon_common.go | 31 +---------- libpod/oci_conmon_freebsd.go | 2 +- libpod/oci_conmon_linux.go | 82 +++++++++++++++++++++++------ 5 files changed, 145 insertions(+), 47 deletions(-) diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 29a32a1c3c..a9c076085a 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -14,6 +14,8 @@ import ( "slices" "strconv" "strings" + "sync" + "syscall" "time" metadata "github.com/checkpoint-restore/checkpointctl/lib" @@ -2362,6 +2364,75 @@ func (c *Container) setupOCIHooks(ctx context.Context, config *spec.Spec) (map[s return allHooks, nil } +// getRootPathForOCI returns the root path to use for the OCI runtime. +// If the current user is mapped in the container user namespace, then it returns +// the container's mountpoint directly from the storage. +// Otherwise, it returns an intermediate mountpoint that is accessible to anyone. +func (c *Container) getRootPathForOCI() (string, error) { + if hasCurrentUserMapped(c) { + return c.state.Mountpoint, nil + } + return c.getIntermediateMountpointUser() +} + +var ( + intermediateMountPoint string + intermediateMountPointErr error + intermediateMountPointSync sync.Mutex +) + +// getIntermediateMountpointUser returns a path that is accessible to everyone. It must be on TMPDIR since +// the runroot/tmpdir used by libpod are accessible only to the owner. +// To avoid TOCTOU issues, the path must be owned by the current user's UID and GID. +// The path can be used by different containers, so a mount must be created only in a private mount namespace. +func (c *Container) recreateIntermediateMountpointUser() (string, error) { + uid := os.Geteuid() + gid := os.Getegid() + for i := 0; ; i++ { + tmpDir := os.Getenv("TMPDIR") + if tmpDir == "" { + tmpDir = "/tmp" + } + dir := filepath.Join(tmpDir, fmt.Sprintf("intermediate-mountpoint-%d.%d", rootless.GetRootlessUID(), i)) + err := os.Mkdir(dir, 0755) + if err != nil { + if !errors.Is(err, os.ErrExist) { + return "", err + } + st, err2 := os.Stat(dir) + if err2 != nil { + return "", err + } + sys := st.Sys().(*syscall.Stat_t) + if !st.IsDir() || sys.Uid != uint32(uid) || sys.Gid != uint32(gid) { + continue + } + } + return dir, nil + } +} + +// getIntermediateMountpointUser returns a path that is accessible to everyone. +// To avoid TOCTOU issues, the path must be owned by the current user's UID and GID. +// The path can be used by different containers, so a mount must be created only in a private mount namespace. +func (c *Container) getIntermediateMountpointUser() (string, error) { + intermediateMountPointSync.Lock() + defer intermediateMountPointSync.Unlock() + + if intermediateMountPoint == "" || fileutils.Exists(intermediateMountPoint) != nil { + return c.recreateIntermediateMountpointUser() + } + + // update the timestamp to prevent systemd-tmpfiles from removing it + now := time.Now() + if err := os.Chtimes(intermediateMountPoint, now, now); err != nil { + if !errors.Is(err, os.ErrNotExist) { + return c.recreateIntermediateMountpointUser() + } + } + return intermediateMountPoint, intermediateMountPointErr +} + // mount mounts the container's root filesystem func (c *Container) mount() (string, error) { if c.state.State == define.ContainerStateRemoving { diff --git a/libpod/container_internal_common.go b/libpod/container_internal_common.go index 216074e130..f6500c6362 100644 --- a/libpod/container_internal_common.go +++ b/libpod/container_internal_common.go @@ -565,7 +565,11 @@ func (c *Container) generateSpec(ctx context.Context) (s *spec.Spec, cleanupFunc return nil, nil, err } - g.SetRootPath(c.state.Mountpoint) + rootPath, err := c.getRootPathForOCI() + if err != nil { + return nil, nil, err + } + g.SetRootPath(rootPath) g.AddAnnotation("org.opencontainers.image.stopSignal", strconv.FormatUint(uint64(c.config.StopSignal), 10)) if _, exists := g.Config.Annotations[annotations.ContainerManager]; !exists { diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index ff0e790860..c32fba46e2 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -184,15 +184,10 @@ func hasCurrentUserMapped(ctr *Container) bool { // CreateContainer creates a container. func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { if !hasCurrentUserMapped(ctr) { - if err := makeAccessible(ctr.state.Mountpoint, ctr.RootUID(), ctr.RootGID()); err != nil { - return 0, err - } - // if we are running a non privileged container, be sure to umount some kernel paths so they are not // bind mounted inside the container at all. - if !ctr.config.Privileged && !rootless.IsRootless() { - return r.createRootlessContainer(ctr, restoreOptions) - } + hideFiles := !ctr.config.Privileged && !rootless.IsRootless() + return r.createRootlessContainer(ctr, restoreOptions, hideFiles) } return r.createOCIContainer(ctr, restoreOptions) } @@ -972,28 +967,6 @@ func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntime return &conmon, &ocirt, nil } -// makeAccessible changes the path permission and each parent directory to have --x--x--x -func makeAccessible(path string, uid, gid int) error { - for ; path != "/"; path = filepath.Dir(path) { - st, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - return nil - } - return err - } - if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { - continue - } - if st.Mode()&0111 != 0111 { - if err := os.Chmod(path, st.Mode()|0111); err != nil { - return err - } - } - } - return nil -} - // Wait for a container which has been sent a signal to stop func waitContainerStop(ctr *Container, timeout time.Duration) error { return waitPidStop(ctr.state.PID, timeout) diff --git a/libpod/oci_conmon_freebsd.go b/libpod/oci_conmon_freebsd.go index 5f113f5cba..f681f785a1 100644 --- a/libpod/oci_conmon_freebsd.go +++ b/libpod/oci_conmon_freebsd.go @@ -8,7 +8,7 @@ import ( "os/exec" ) -func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions, hideFiles bool) (int64, error) { return -1, errors.New("unsupported (*ConmonOCIRuntime) createRootlessContainer") } diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index 91090bba2f..05dc65d360 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -3,7 +3,9 @@ package libpod import ( + "errors" "fmt" + "io/fs" "os" "os/exec" "path/filepath" @@ -25,7 +27,7 @@ import ( "golang.org/x/sys/unix" ) -func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions, hideFiles bool) (int64, error) { type result struct { restoreDuration int64 err error @@ -40,6 +42,11 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption } defer errorhandling.CloseQuiet(fd) + rootPath, err := ctr.getRootPathForOCI() + if err != nil { + return 0, err + } + // create a new mountns on the current thread if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { return 0, err @@ -54,26 +61,69 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption logrus.Errorf("Unable to reset the previous mount namespace: %q", err) } }() - - // don't spread our mounts around. We are setting only /sys to be slave - // so that the cleanup process is still able to umount the storage and the - // changes are propagated to the host. - err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") - if err != nil { - return 0, fmt.Errorf("cannot make /sys slave: %w", err) - } - mounts, err := pmount.GetMounts() if err != nil { return 0, err } - for _, m := range mounts { - if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { - continue + if rootPath != "" { + byMountpoint := make(map[string]*pmount.Info) + for _, m := range mounts { + byMountpoint[m.Mountpoint] = m } - err = unix.Unmount(m.Mountpoint, 0) - if err != nil && !os.IsNotExist(err) { - return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) + isShared := false + var parentMount string + for dir := filepath.Dir(rootPath); ; dir = filepath.Dir(dir) { + if m, found := byMountpoint[dir]; found { + parentMount = dir + for _, o := range strings.Split(m.Optional, ",") { + opt := strings.Split(o, ":") + if opt[0] == "shared" { + isShared = true + break + } + } + break + } + if dir == "/" { + return 0, fmt.Errorf("cannot find mountpoint for the root path") + } + } + + // do not propagate the bind mount on the parent mount namespace + if err := unix.Mount("", parentMount, "", unix.MS_SLAVE, ""); err != nil { + return 0, fmt.Errorf("failed to make %s slave: %w", parentMount, err) + } + + // bind mount the containers' mount path to the path where the OCI runtime expects it to be + if err := unix.Mount(ctr.state.Mountpoint, rootPath, "", unix.MS_BIND, ""); err != nil { + return 0, fmt.Errorf("failed to bind mount %s to %s: %w", ctr.state.Mountpoint, rootPath, err) + } + + if isShared { + // we need to restore the shared propagation of the parent mount so that we don't break -v $SRC:$DST:shared in the container + // if $SRC is on the same mount as the root path + if err := unix.Mount("", parentMount, "", unix.MS_SHARED, ""); err != nil { + return 0, fmt.Errorf("failed to restore MS_SHARED propagation for %s: %w", parentMount, err) + } + } + } + + if hideFiles { + // don't spread our mounts around. We are setting only /sys to be slave + // so that the cleanup process is still able to umount the storage and the + // changes are propagated to the host. + err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") + if err != nil { + return 0, fmt.Errorf("cannot make /sys slave: %w", err) + } + for _, m := range mounts { + if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { + continue + } + err = unix.Unmount(m.Mountpoint, 0) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) + } } } return r.createOCIContainer(ctr, restoreOptions)