cgroupns: private cgroupns on cgroupv1 breaks --systemd

On cgroup v1 we need to mount only the systemd named hierarchy as
writable, so we configure the OCI runtime to mount /sys/fs/cgroup as
read-only and, on top of that, bind mount /sys/fs/cgroup/systemd.

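But when we use a private cgroupns, we cannot do that since we don't
know the final cgroup path.  In that case, fail with an explicit error
unless a mount for /sys/fs/cgroup/systemd was already specified.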

Also, do not override the mount if there is already one for
/sys/fs/cgroup/systemd.

Closes: https://github.com/containers/podman/issues/17727

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
This commit is contained in:
Giuseppe Scrivano 2023-03-10 21:24:21 +01:00
parent 01fd5bcc30
commit 2d1f4a8bff
No known key found for this signature in database
GPG Key ID: 67E38F7A8BA21772
3 changed files with 62 additions and 40 deletions

View File

@ -242,17 +242,17 @@ func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) erro
return err
}
hasCgroupNs := false
for _, ns := range c.config.Spec.Linux.Namespaces {
if ns.Type == spec.CgroupNamespace {
hasCgroupNs = true
break
}
}
if unified {
g.RemoveMount("/sys/fs/cgroup")
hasCgroupNs := false
for _, ns := range c.config.Spec.Linux.Namespaces {
if ns.Type == spec.CgroupNamespace {
hasCgroupNs = true
break
}
}
var systemdMnt spec.Mount
if hasCgroupNs {
systemdMnt = spec.Mount{
@ -271,40 +271,46 @@ func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) erro
}
g.AddMount(systemdMnt)
} else {
mountOptions := []string{"bind", "rprivate"}
skipMount := false
var statfs unix.Statfs_t
if err := unix.Statfs("/sys/fs/cgroup/systemd", &statfs); err != nil {
if errors.Is(err, os.ErrNotExist) {
// If the mount is missing on the host, we cannot bind mount it so
// just skip it.
skipMount = true
}
mountOptions = append(mountOptions, "nodev", "noexec", "nosuid")
} else {
if statfs.Flags&unix.MS_NODEV == unix.MS_NODEV {
mountOptions = append(mountOptions, "nodev")
}
if statfs.Flags&unix.MS_NOEXEC == unix.MS_NOEXEC {
mountOptions = append(mountOptions, "noexec")
}
if statfs.Flags&unix.MS_NOSUID == unix.MS_NOSUID {
mountOptions = append(mountOptions, "nosuid")
}
if statfs.Flags&unix.MS_RDONLY == unix.MS_RDONLY {
mountOptions = append(mountOptions, "ro")
}
hasSystemdMount := MountExists(mounts, "/sys/fs/cgroup/systemd")
if hasCgroupNs && !hasSystemdMount {
return errors.New("cgroup namespace is not supported with cgroup v1 and systemd mode")
}
if !skipMount {
systemdMnt := spec.Mount{
Destination: "/sys/fs/cgroup/systemd",
Type: "bind",
Source: "/sys/fs/cgroup/systemd",
Options: mountOptions,
mountOptions := []string{"bind", "rprivate"}
if !hasSystemdMount {
skipMount := hasSystemdMount
var statfs unix.Statfs_t
if err := unix.Statfs("/sys/fs/cgroup/systemd", &statfs); err != nil {
if errors.Is(err, os.ErrNotExist) {
// If the mount is missing on the host, we cannot bind mount it so
// just skip it.
skipMount = true
}
mountOptions = append(mountOptions, "nodev", "noexec", "nosuid")
} else {
if statfs.Flags&unix.MS_NODEV == unix.MS_NODEV {
mountOptions = append(mountOptions, "nodev")
}
if statfs.Flags&unix.MS_NOEXEC == unix.MS_NOEXEC {
mountOptions = append(mountOptions, "noexec")
}
if statfs.Flags&unix.MS_NOSUID == unix.MS_NOSUID {
mountOptions = append(mountOptions, "nosuid")
}
if statfs.Flags&unix.MS_RDONLY == unix.MS_RDONLY {
mountOptions = append(mountOptions, "ro")
}
}
if !skipMount {
systemdMnt := spec.Mount{
Destination: "/sys/fs/cgroup/systemd",
Type: "bind",
Source: "/sys/fs/cgroup/systemd",
Options: mountOptions,
}
g.AddMount(systemdMnt)
g.AddLinuxMaskedPaths("/sys/fs/cgroup/systemd/release_agent")
}
g.AddMount(systemdMnt)
g.AddLinuxMaskedPaths("/sys/fs/cgroup/systemd/release_agent")
}
}

View File

@ -280,6 +280,13 @@ LISTEN_FDNAMES=listen_fdnames" | sort)
is "${container_uuid}" "${output:0:32}" "UUID should be first 32 chars of Container id"
}
@test "podman --systemd fails on cgroup v1 with a private cgroupns" {
    # The failure under test is raised only on the cgroup v1 code path.
    skip_if_cgroupsv2

    # Pattern the container-creation error output is expected to match.
    local expected_err=".*cgroup namespace is not supported with cgroup v1 and systemd mode"

    # Requesting systemd mode together with a private cgroup namespace
    # must be rejected; 126 is the exit status expected from podman here.
    run_podman 126 run --systemd=always --cgroupns=private $IMAGE true
    assert "$output" =~ "$expected_err"
}
# https://github.com/containers/podman/issues/13153
@test "podman rootless-netns slirp4netns process should be in different cgroup" {
is_rootless || skip "only meaningful for rootless"

View File

@ -497,6 +497,15 @@ function skip_if_cgroupsv1() {
fi
}
#######################
# skip_if_cgroupsv2 # ...with an optional message
#######################
function skip_if_cgroupsv2() {
    # Not a cgroups v2 host: nothing to skip, let the test proceed.
    is_cgroupsv2 || return 0

    # $1 is an optional skip message; fall back to the default when it is
    # unset or empty.
    local reason="${1:-test requires cgroupsv1}"
    skip "$reason"
}
######################
# skip_if_rootless_cgroupsv1 # ...with an optional message
######################