libpod: intermediate mount if UID not mapped into the userns

if the current user is not mapped into the new user namespace, use an
intermediate mount to allow the mount point to be accessible instead
of opening up all the parent directories for the mountpoint.

Closes: https://github.com/containers/podman/issues/23028

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
This commit is contained in:
Giuseppe Scrivano 2024-06-19 23:49:37 +02:00
parent 08a8429459
commit 49eb5af301
No known key found for this signature in database
GPG Key ID: 67E38F7A8BA21772
5 changed files with 145 additions and 47 deletions

View File

@ -14,6 +14,8 @@ import (
"slices"
"strconv"
"strings"
"sync"
"syscall"
"time"
metadata "github.com/checkpoint-restore/checkpointctl/lib"
@ -2362,6 +2364,75 @@ func (c *Container) setupOCIHooks(ctx context.Context, config *spec.Spec) (map[s
return allHooks, nil
}
// getRootPathForOCI returns the root path to use for the OCI runtime.
// If the current user is mapped in the container user namespace, then it returns
// the container's mountpoint directly from the storage.
// Otherwise, it returns an intermediate mountpoint that is accessible to anyone.
func (c *Container) getRootPathForOCI() (string, error) {
if hasCurrentUserMapped(c) {
return c.state.Mountpoint, nil
}
return c.getIntermediateMountpointUser()
}
var (
intermediateMountPoint string
intermediateMountPointErr error
intermediateMountPointSync sync.Mutex
)
// getIntermediateMountpointUser returns a path that is accessible to everyone. It must be on TMPDIR since
// the runroot/tmpdir used by libpod are accessible only to the owner.
// To avoid TOCTOU issues, the path must be owned by the current user's UID and GID.
// The path can be used by different containers, so a mount must be created only in a private mount namespace.
func (c *Container) recreateIntermediateMountpointUser() (string, error) {
uid := os.Geteuid()
gid := os.Getegid()
for i := 0; ; i++ {
tmpDir := os.Getenv("TMPDIR")
if tmpDir == "" {
tmpDir = "/tmp"
}
dir := filepath.Join(tmpDir, fmt.Sprintf("intermediate-mountpoint-%d.%d", rootless.GetRootlessUID(), i))
err := os.Mkdir(dir, 0755)
if err != nil {
if !errors.Is(err, os.ErrExist) {
return "", err
}
st, err2 := os.Stat(dir)
if err2 != nil {
return "", err
}
sys := st.Sys().(*syscall.Stat_t)
if !st.IsDir() || sys.Uid != uint32(uid) || sys.Gid != uint32(gid) {
continue
}
}
return dir, nil
}
}
// getIntermediateMountpointUser returns a path that is accessible to everyone.
// To avoid TOCTOU issues, the path must be owned by the current user's UID and GID.
// The path can be used by different containers, so a mount must be created only in a private mount namespace.
func (c *Container) getIntermediateMountpointUser() (string, error) {
intermediateMountPointSync.Lock()
defer intermediateMountPointSync.Unlock()
if intermediateMountPoint == "" || fileutils.Exists(intermediateMountPoint) != nil {
return c.recreateIntermediateMountpointUser()
}
// update the timestamp to prevent systemd-tmpfiles from removing it
now := time.Now()
if err := os.Chtimes(intermediateMountPoint, now, now); err != nil {
if !errors.Is(err, os.ErrNotExist) {
return c.recreateIntermediateMountpointUser()
}
}
return intermediateMountPoint, intermediateMountPointErr
}
// mount mounts the container's root filesystem
func (c *Container) mount() (string, error) {
if c.state.State == define.ContainerStateRemoving {

View File

@ -565,7 +565,11 @@ func (c *Container) generateSpec(ctx context.Context) (s *spec.Spec, cleanupFunc
return nil, nil, err
}
g.SetRootPath(c.state.Mountpoint)
rootPath, err := c.getRootPathForOCI()
if err != nil {
return nil, nil, err
}
g.SetRootPath(rootPath)
g.AddAnnotation("org.opencontainers.image.stopSignal", strconv.FormatUint(uint64(c.config.StopSignal), 10))
if _, exists := g.Config.Annotations[annotations.ContainerManager]; !exists {

View File

@ -184,15 +184,10 @@ func hasCurrentUserMapped(ctr *Container) bool {
// CreateContainer creates a container.
func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
if !hasCurrentUserMapped(ctr) {
if err := makeAccessible(ctr.state.Mountpoint, ctr.RootUID(), ctr.RootGID()); err != nil {
return 0, err
}
// if we are running a non privileged container, be sure to umount some kernel paths so they are not
// bind mounted inside the container at all.
if !ctr.config.Privileged && !rootless.IsRootless() {
return r.createRootlessContainer(ctr, restoreOptions)
}
hideFiles := !ctr.config.Privileged && !rootless.IsRootless()
return r.createRootlessContainer(ctr, restoreOptions, hideFiles)
}
return r.createOCIContainer(ctr, restoreOptions)
}
@ -972,28 +967,6 @@ func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntime
return &conmon, &ocirt, nil
}
// makeAccessible changes the path permission and each parent directory to have --x--x--x
func makeAccessible(path string, uid, gid int) error {
for ; path != "/"; path = filepath.Dir(path) {
st, err := os.Stat(path)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid {
continue
}
if st.Mode()&0111 != 0111 {
if err := os.Chmod(path, st.Mode()|0111); err != nil {
return err
}
}
}
return nil
}
// Wait for a container which has been sent a signal to stop
func waitContainerStop(ctr *Container, timeout time.Duration) error {
return waitPidStop(ctr.state.PID, timeout)

View File

@ -8,7 +8,7 @@ import (
"os/exec"
)
func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions, hideFiles bool) (int64, error) {
return -1, errors.New("unsupported (*ConmonOCIRuntime) createRootlessContainer")
}

View File

@ -3,7 +3,9 @@
package libpod
import (
"errors"
"fmt"
"io/fs"
"os"
"os/exec"
"path/filepath"
@ -25,7 +27,7 @@ import (
"golang.org/x/sys/unix"
)
func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions, hideFiles bool) (int64, error) {
type result struct {
restoreDuration int64
err error
@ -40,6 +42,11 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption
}
defer errorhandling.CloseQuiet(fd)
rootPath, err := ctr.getRootPathForOCI()
if err != nil {
return 0, err
}
// create a new mountns on the current thread
if err = unix.Unshare(unix.CLONE_NEWNS); err != nil {
return 0, err
@ -54,7 +61,54 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption
logrus.Errorf("Unable to reset the previous mount namespace: %q", err)
}
}()
mounts, err := pmount.GetMounts()
if err != nil {
return 0, err
}
if rootPath != "" {
byMountpoint := make(map[string]*pmount.Info)
for _, m := range mounts {
byMountpoint[m.Mountpoint] = m
}
isShared := false
var parentMount string
for dir := filepath.Dir(rootPath); ; dir = filepath.Dir(dir) {
if m, found := byMountpoint[dir]; found {
parentMount = dir
for _, o := range strings.Split(m.Optional, ",") {
opt := strings.Split(o, ":")
if opt[0] == "shared" {
isShared = true
break
}
}
break
}
if dir == "/" {
return 0, fmt.Errorf("cannot find mountpoint for the root path")
}
}
// do not propagate the bind mount on the parent mount namespace
if err := unix.Mount("", parentMount, "", unix.MS_SLAVE, ""); err != nil {
return 0, fmt.Errorf("failed to make %s slave: %w", parentMount, err)
}
// bind mount the containers' mount path to the path where the OCI runtime expects it to be
if err := unix.Mount(ctr.state.Mountpoint, rootPath, "", unix.MS_BIND, ""); err != nil {
return 0, fmt.Errorf("failed to bind mount %s to %s: %w", ctr.state.Mountpoint, rootPath, err)
}
if isShared {
// we need to restore the shared propagation of the parent mount so that we don't break -v $SRC:$DST:shared in the container
// if $SRC is on the same mount as the root path
if err := unix.Mount("", parentMount, "", unix.MS_SHARED, ""); err != nil {
return 0, fmt.Errorf("failed to restore MS_SHARED propagation for %s: %w", parentMount, err)
}
}
}
if hideFiles {
// don't spread our mounts around. We are setting only /sys to be slave
// so that the cleanup process is still able to umount the storage and the
// changes are propagated to the host.
@ -62,20 +116,16 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption
if err != nil {
return 0, fmt.Errorf("cannot make /sys slave: %w", err)
}
mounts, err := pmount.GetMounts()
if err != nil {
return 0, err
}
for _, m := range mounts {
if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") {
continue
}
err = unix.Unmount(m.Mountpoint, 0)
if err != nil && !os.IsNotExist(err) {
if err != nil && !errors.Is(err, fs.ErrNotExist) {
return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err)
}
}
}
return r.createOCIContainer(ctr, restoreOptions)
}()
ch <- result{