From ad8a96ab9559f41c7d39a50910154445166ebcb0 Mon Sep 17 00:00:00 2001 From: Daniel J Walsh Date: Mon, 13 Feb 2023 02:45:33 -0500 Subject: [PATCH] Support running nested SELinux container separation Currently Podman prevents SELinux container separation, when running within a container. This PR adds a new --security-opt label=nested When setting this option, Podman unmasks and mounts /sys/fs/selinux into the containers making /sys/fs/selinux fully exposed. Secondly Podman sets the attribute run.oci.mount_context_type=rootcontext This attribute tells crun to mount volumes with rootcontext=MOUNTLABEL as opposed to context=MOUNTLABEL. With these two settings Podman inside the container is allowed to set its own SELinux labels on tmpfs file systems mounted into its parent's container, while still being confined by SELinux. Thus you can have nested SELinux labeling inside of a container. Signed-off-by: Daniel J Walsh --- cmd/podman/containers/create.go | 2 +- docs/source/markdown/options/security-opt.md | 2 + libpod/container_config.go | 2 + libpod/container_internal_linux.go | 7 +- libpod/define/annotations.go | 7 +- libpod/options.go | 13 +++ libpod/runtime_volume_common.go | 6 +- pkg/specgen/generate/container_create.go | 3 + pkg/specgen/specgen.go | 4 + pkg/specgenutil/specgen.go | 95 +++++++++++--------- test/e2e/containers_conf_test.go | 39 ++++---- test/system/410-selinux.bats | 16 ++++ 12 files changed, 130 insertions(+), 66 deletions(-) diff --git a/cmd/podman/containers/create.go b/cmd/podman/containers/create.go index a6550f37f2..e25c90f4d7 100644 --- a/cmd/podman/containers/create.go +++ b/cmd/podman/containers/create.go @@ -257,7 +257,7 @@ func CreateInit(c *cobra.Command, vals entities.ContainerCreateOptions, isInfra if registry.IsRemote() { return vals, errors.New("the '--group-add keep-groups' option is not supported in remote mode") } - vals.Annotation = append(vals.Annotation, "run.oci.keep_original_groups=1") + vals.Annotation = 
append(vals.Annotation, fmt.Sprintf("%s=1", define.RunOCIKeepOriginalGroups)) } else { groups = append(groups, g) } diff --git a/docs/source/markdown/options/security-opt.md b/docs/source/markdown/options/security-opt.md index 252bede2ff..8ac58b124a 100644 --- a/docs/source/markdown/options/security-opt.md +++ b/docs/source/markdown/options/security-opt.md @@ -18,6 +18,8 @@ Security Options Note: Labeling can be disabled for all <<|pods/>>containers by setting label=false in the **containers.conf** (`/etc/containers/containers.conf` or `$HOME/.config/containers/containers.conf`) file. +- **label=nested**: Allows SELinux modifications within the container. Containers are allowed to modify SELinux labels on files and processes, as long as SELinux policy allows. Without **nested**, containers view SELinux as disabled, even when it is enabled on the host. Containers are prevented from setting any labels. + - **mask**=_/path/1:/path/2_: The paths to mask separated by a colon. A masked path cannot be accessed inside the container<>. - **no-new-privileges**: Disable container processes from gaining additional privileges. diff --git a/libpod/container_config.go b/libpod/container_config.go index 81c912aabb..6aabc817ac 100644 --- a/libpod/container_config.go +++ b/libpod/container_config.go @@ -219,6 +219,8 @@ type ContainerSecurityConfig struct { // Libpod - mostly used in rootless containers where the user running // Libpod wants to retain their UID inside the container. 
AddCurrentUserPasswdEntry bool `json:"addCurrentUserPasswdEntry,omitempty"` + // LabelNested, allow labeling separation from within a container + LabelNested bool `json:"label_nested"` } // ContainerNameSpaceConfig is an embedded sub-config providing diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index 6ca63f9e20..d2d0e953ab 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -32,8 +32,13 @@ var ( ) func (c *Container) mountSHM(shmOptions string) error { + contextType := "context" + if c.config.LabelNested { + contextType = "rootcontext" + } + if err := unix.Mount("shm", c.config.ShmDir, "tmpfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, - label.FormatMountLabel(shmOptions, c.config.MountLabel)); err != nil { + label.FormatMountLabelByType(shmOptions, c.config.MountLabel, contextType)); err != nil { return fmt.Errorf("failed to mount shm tmpfs %q: %w", c.config.ShmDir, err) } return nil diff --git a/libpod/define/annotations.go b/libpod/define/annotations.go index a70f83f785..72b5b18aa2 100644 --- a/libpod/define/annotations.go +++ b/libpod/define/annotations.go @@ -1,6 +1,12 @@ package define const ( + // RunOCIMountContextType tells the OCI runtime which context mount + // type to use. context, rootcontext, fscontext, defcontext + RunOCIMountContextType = "run.oci.mount_context_type" + // RunOCIKeepOriginalGroups tells the OCI runtime to leak the users + // current groups into the container + RunOCIKeepOriginalGroups = "run.oci.keep_original_groups" // InspectAnnotationCIDFile is used by Inspect to determine if a // container ID file was created for the container. // If an annotation with this key is found in the OCI spec, it will be @@ -58,7 +64,6 @@ const ( // If an annotation with this key is found in the OCI spec, it will be // used in the output of Inspect(). 
InspectAnnotationApparmor = "io.podman.annotations.apparmor" - // InspectResponseTrue is a boolean True response for an inspect // annotation. InspectResponseTrue = "TRUE" diff --git a/libpod/options.go b/libpod/options.go index 13ee549478..bc70e4a32c 100644 --- a/libpod/options.go +++ b/libpod/options.go @@ -2341,3 +2341,16 @@ func WithMountAllDevices() CtrCreateOption { return nil } } + +// WithLabelNested sets the LabelNested flag allowing label separation within container +func WithLabelNested(nested bool) CtrCreateOption { + return func(ctr *Container) error { + if ctr.valid { + return define.ErrCtrFinalized + } + + ctr.config.LabelNested = nested + + return nil + } +} diff --git a/libpod/runtime_volume_common.go b/libpod/runtime_volume_common.go index 54fc158be0..81fb00f32d 100644 --- a/libpod/runtime_volume_common.go +++ b/libpod/runtime_volume_common.go @@ -120,15 +120,13 @@ func (r *Runtime) newVolume(ctx context.Context, noCreatePluginVolume bool, opti volume.config.StorageImageID = image.ID() // Create a backing container in c/storage. 
- storageConfig := storage.ContainerOptions{ - LabelOpts: []string{"filetype:container_file_t:s0"}, - } + storageConfig := storage.ContainerOptions{} if len(volume.config.MountLabel) > 0 { context, err := selinux.NewContext(volume.config.MountLabel) if err != nil { return nil, fmt.Errorf("failed to get SELinux context from %s: %w", volume.config.MountLabel, err) } - storageConfig.LabelOpts = []string{fmt.Sprintf("filetype:%s:s0", context["type"])} + storageConfig.LabelOpts = []string{fmt.Sprintf("filetype:%s", context["type"])} } if _, err := r.storageService.CreateContainerStorage(ctx, r.imageContext, imgString, image.ID(), volume.config.StorageName, volume.config.StorageID, storageConfig); err != nil { return nil, fmt.Errorf("creating backing storage for image driver: %w", err) diff --git a/pkg/specgen/generate/container_create.go b/pkg/specgen/generate/container_create.go index 55483b8a26..d73abced7e 100644 --- a/pkg/specgen/generate/container_create.go +++ b/pkg/specgen/generate/container_create.go @@ -482,6 +482,9 @@ func createContainerOptions(rt *libpod.Runtime, s *specgen.SpecGenerator, pod *l options = append(options, libpod.WithLogDriver(s.LogConfiguration.Driver)) } } + if s.ContainerSecurityConfig.LabelNested { + options = append(options, libpod.WithLabelNested(s.ContainerSecurityConfig.LabelNested)) + } // Security options if len(s.SelinuxOpts) > 0 { options = append(options, libpod.WithSecLabels(s.SelinuxOpts)) diff --git a/pkg/specgen/specgen.go b/pkg/specgen/specgen.go index c62f274689..ff91489dbf 100644 --- a/pkg/specgen/specgen.go +++ b/pkg/specgen/specgen.go @@ -396,6 +396,10 @@ type ContainerSecurityConfig struct { // mount temporary file systems ReadWriteTmpfs bool `json:"read_write_tmpfs,omitempty"` + // LabelNested indicates whether or not the container is allowed to + // run fully nested containers including labelling + LabelNested bool `json:"label_nested,omitempty"` + // Umask is the umask the init process of the container will be run 
with. Umask string `json:"umask,omitempty"` // ProcOpts are the options used for the proc mount. diff --git a/pkg/specgenutil/specgen.go b/pkg/specgenutil/specgen.go index 6cfc14d9fd..4bf889fec9 100644 --- a/pkg/specgenutil/specgen.go +++ b/pkg/specgenutil/specgen.go @@ -620,53 +620,57 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions } for _, opt := range c.SecurityOpt { - if opt == "no-new-privileges" { - s.ContainerSecurityConfig.NoNewPrivileges = true + // Docker deprecated the ":" syntax but still supports it, + // so we need to as well + var con []string + if strings.Contains(opt, "=") { + con = strings.SplitN(opt, "=", 2) } else { - // Docker deprecated the ":" syntax but still supports it, - // so we need to as well - var con []string - if strings.Contains(opt, "=") { - con = strings.SplitN(opt, "=", 2) - } else { - con = strings.SplitN(opt, ":", 2) + con = strings.SplitN(opt, ":", 2) + } + if len(con) != 2 && + con[0] != "no-new-privileges" { + return fmt.Errorf("invalid --security-opt 1: %q", opt) + } + switch con[0] { + case "apparmor": + s.ContainerSecurityConfig.ApparmorProfile = con[1] + s.Annotations[define.InspectAnnotationApparmor] = con[1] + case "label": + if con[1] == "nested" { + s.ContainerSecurityConfig.LabelNested = true + continue } - if len(con) != 2 { - return fmt.Errorf("invalid --security-opt 1: %q", opt) - } - switch con[0] { - case "apparmor": - s.ContainerSecurityConfig.ApparmorProfile = con[1] - s.Annotations[define.InspectAnnotationApparmor] = con[1] - case "label": - // TODO selinux opts and label opts are the same thing - s.ContainerSecurityConfig.SelinuxOpts = append(s.ContainerSecurityConfig.SelinuxOpts, con[1]) - s.Annotations[define.InspectAnnotationLabel] = strings.Join(s.ContainerSecurityConfig.SelinuxOpts, ",label=") - case "mask": - s.ContainerSecurityConfig.Mask = append(s.ContainerSecurityConfig.Mask, strings.Split(con[1], ":")...) 
- case "proc-opts": - s.ProcOpts = strings.Split(con[1], ",") - case "seccomp": - s.SeccompProfilePath = con[1] - s.Annotations[define.InspectAnnotationSeccomp] = con[1] + // TODO selinux opts and label opts are the same thing + s.ContainerSecurityConfig.SelinuxOpts = append(s.ContainerSecurityConfig.SelinuxOpts, con[1]) + s.Annotations[define.InspectAnnotationLabel] = strings.Join(s.ContainerSecurityConfig.SelinuxOpts, ",label=") + case "mask": + s.ContainerSecurityConfig.Mask = append(s.ContainerSecurityConfig.Mask, strings.Split(con[1], ":")...) + case "proc-opts": + s.ProcOpts = strings.Split(con[1], ",") + case "seccomp": + s.SeccompProfilePath = con[1] + s.Annotations[define.InspectAnnotationSeccomp] = con[1] // this option is for docker compatibility, it is the same as unmask=ALL - case "systempaths": - if con[1] == "unconfined" { - s.ContainerSecurityConfig.Unmask = append(s.ContainerSecurityConfig.Unmask, []string{"ALL"}...) - } else { - return fmt.Errorf("invalid systempaths option %q, only `unconfined` is supported", con[1]) - } - case "unmask": - s.ContainerSecurityConfig.Unmask = append(s.ContainerSecurityConfig.Unmask, con[1:]...) - case "no-new-privileges": - noNewPrivileges, err := strconv.ParseBool(con[1]) + case "systempaths": + if con[1] == "unconfined" { + s.ContainerSecurityConfig.Unmask = append(s.ContainerSecurityConfig.Unmask, []string{"ALL"}...) + } else { + return fmt.Errorf("invalid systempaths option %q, only `unconfined` is supported", con[1]) + } + case "unmask": + s.ContainerSecurityConfig.Unmask = append(s.ContainerSecurityConfig.Unmask, con[1:]...) 
+ case "no-new-privileges": + noNewPrivileges := true + if len(con) == 2 { + noNewPrivileges, err = strconv.ParseBool(con[1]) if err != nil { return fmt.Errorf("invalid --security-opt 2: %q", opt) } - s.ContainerSecurityConfig.NoNewPrivileges = noNewPrivileges - default: - return fmt.Errorf("invalid --security-opt 2: %q", opt) } + s.ContainerSecurityConfig.NoNewPrivileges = noNewPrivileges + default: + return fmt.Errorf("invalid --security-opt 2: %q", opt) } } @@ -690,6 +694,17 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions if len(s.Volumes) == 0 || len(c.Volume) != 0 { s.Volumes = volumes } + + if s.ContainerSecurityConfig.LabelNested { + // Need to unmask the SELinux file system + s.Unmask = append(s.Unmask, "/sys/fs/selinux", "/proc") + s.Mounts = append(s.Mounts, specs.Mount{ + Source: "/sys/fs/selinux", + Destination: "/sys/fs/selinux", + Type: define.TypeBind, + }) + s.Annotations[define.RunOCIMountContextType] = "rootcontext" + } // TODO make sure these work in clone if len(s.OverlayVolumes) == 0 { s.OverlayVolumes = overlayVolumes diff --git a/test/e2e/containers_conf_test.go b/test/e2e/containers_conf_test.go index 399824badd..90c4c7b42f 100644 --- a/test/e2e/containers_conf_test.go +++ b/test/e2e/containers_conf_test.go @@ -7,6 +7,7 @@ import ( "path/filepath" "strings" + "github.com/containers/podman/v4/libpod/define" . "github.com/containers/podman/v4/test/utils" . "github.com/onsi/ginkgo" . 
"github.com/onsi/gomega" @@ -111,7 +112,7 @@ var _ = Describe("Verify podman containers.conf usage", func() { result := podmanTest.Podman([]string{"top", "test1", "capeff"}) result.WaitWithDefaultTimeout() Expect(result).Should(Exit(0)) - Expect(result.Out.Contents()).To( + Expect(result.OutputToString()).To( And( ContainSubstring("FOWNER"), ContainSubstring("SETFCAP"), @@ -128,7 +129,7 @@ var _ = Describe("Verify podman containers.conf usage", func() { result := podmanTest.Podman([]string{"container", "top", "test1", "capeff"}) result.WaitWithDefaultTimeout() Expect(result).Should(Exit(0)) - Expect(result.Out.Contents()).ToNot( + Expect(result.OutputToString()).ToNot( And( ContainSubstring("SETUID"), ContainSubstring("FOWNER"), @@ -266,7 +267,7 @@ var _ = Describe("Verify podman containers.conf usage", func() { session.WaitWithDefaultTimeout() Expect(session).Should(Exit(0)) Expect(session.OutputToStringArray()).To(ContainElement(HavePrefix("search"))) - Expect(session.Out.Contents()).To( + Expect(session.OutputToString()).To( And( ContainSubstring("foobar.com"), ContainSubstring("1.2.3.4"), @@ -322,7 +323,7 @@ var _ = Describe("Verify podman containers.conf usage", func() { session.WaitWithDefaultTimeout() Expect(session).Should(Exit(0)) Expect(session.OutputToStringArray()).To(ContainElement(HavePrefix("search"))) - Expect(session.Out.Contents()).To( + Expect(session.OutputToString()).To( And( ContainSubstring("foobar.com"), ContainSubstring("1.2.3.4"), @@ -333,26 +334,26 @@ var _ = Describe("Verify podman containers.conf usage", func() { session = podmanTest.Podman([]string{"run", "--rm", ALPINE, "cat", "/proc/sys/net/ipv4/ping_group_range"}) session.WaitWithDefaultTimeout() Expect(session).Should(Exit(0)) - Expect(session.Out.Contents()).To(ContainSubstring("1000")) + Expect(session.OutputToString()).To(ContainSubstring("1000")) // shm-size session = podmanTest.Podman([]string{"run", ALPINE, "grep", "shm", "/proc/self/mounts"}) session.WaitWithDefaultTimeout() 
Expect(session).Should(Exit(0)) - Expect(session.Out.Contents()).To(ContainSubstring("size=200k")) + Expect(session.OutputToString()).To(ContainSubstring("size=200k")) // ulimits session = podmanTest.Podman([]string{"run", "--rm", fedoraMinimal, "ulimit", "-n"}) session.WaitWithDefaultTimeout() Expect(session).Should(Exit(0)) - Expect(session.Out.Contents()).To(ContainSubstring("500")) + Expect(session.OutputToString()).To(ContainSubstring("500")) // Configuration that comes from remote client // Timezone session = podmanTest.Podman([]string{"run", ALPINE, "date", "+'%H %Z'"}) session.WaitWithDefaultTimeout() Expect(session).Should(Exit(0)) - Expect(session.Out.Contents()).To( + Expect(session.OutputToString()).To( Or( ContainSubstring("EST"), ContainSubstring("EDT"), @@ -366,21 +367,21 @@ var _ = Describe("Verify podman containers.conf usage", func() { }) It("add annotations", func() { - // containers.conf is set to "run.oci.keep_original_groups=1" + // containers.conf is set to "run.oci.keep_original_groups=1" session := podmanTest.Podman([]string{"create", "--rm", "--name", "test", fedoraMinimal}) session.WaitWithDefaultTimeout() Expect(session).Should(Exit(0)) inspect := podmanTest.Podman([]string{"inspect", "--format", "{{ .Config.Annotations }}", "test"}) inspect.WaitWithDefaultTimeout() - Expect(inspect.Out.Contents()).To(ContainSubstring("run.oci.keep_original_groups:1")) + Expect(inspect.OutputToString()).To(ContainSubstring(fmt.Sprintf("%s:1", define.RunOCIKeepOriginalGroups))) }) It("--add-host and no-hosts=true fails", func() { session := podmanTest.Podman([]string{"run", "-dt", "--add-host", "test1:127.0.0.1", ALPINE, "top"}) session.WaitWithDefaultTimeout() Expect(session).To(ExitWithError()) - Expect(session.Err.Contents()).To(ContainSubstring("--no-hosts and --add-host cannot be set together")) + Expect(session.ErrorToString()).To(ContainSubstring("--no-hosts and --add-host cannot be set together")) session = podmanTest.Podman([]string{"run", "-dt", 
"--add-host", "test1:127.0.0.1", "--no-hosts=false", ALPINE, "top"}) session.WaitWithDefaultTimeout() @@ -391,12 +392,12 @@ var _ = Describe("Verify podman containers.conf usage", func() { session := podmanTest.Podman([]string{"run", "--rm", "--name", "test", ALPINE, "cat", "/etc/hosts"}) session.WaitWithDefaultTimeout() Expect(session).Should(Exit(0)) - Expect(session.Out.Contents()).ToNot(ContainSubstring("test")) + Expect(session.OutputToString()).ToNot(ContainSubstring("test")) session = podmanTest.Podman([]string{"run", "--rm", "--name", "test", "--no-hosts=false", ALPINE, "cat", "/etc/hosts"}) session.WaitWithDefaultTimeout() Expect(session).Should(Exit(0)) - Expect(session.Out.Contents()).To(ContainSubstring("test")) + Expect(session.OutputToString()).To(ContainSubstring("test")) }) It("seccomp profile path", func() { @@ -462,7 +463,7 @@ var _ = Describe("Verify podman containers.conf usage", func() { session = podmanTest.Podman([]string{"info", "--format", "{{.Store.ImageCopyTmpDir}}"}) session.WaitWithDefaultTimeout() Expect(session).Should(Exit(0)) - Expect(session.Out.Contents()).To(ContainSubstring(storagePath)) + Expect(session.OutputToString()).To(ContainSubstring(storagePath)) containersConf = []byte("[engine]\nimage_copy_tmp_dir=\"storage1\"") err = os.WriteFile(configPath, containersConf, os.ModePerm) @@ -472,7 +473,7 @@ var _ = Describe("Verify podman containers.conf usage", func() { session = podmanTest.Podman([]string{"info", "--format", "{{.Store.ImageCopyTmpDir}}"}) session.WaitWithDefaultTimeout() Expect(session).Should(Exit(125)) - Expect(session.Err.Contents()).To(ContainSubstring("invalid image_copy_tmp_dir value \"storage1\" (relative paths are not accepted)")) + Expect(session.ErrorToString()).To(ContainSubstring("invalid image_copy_tmp_dir value \"storage1\" (relative paths are not accepted)")) os.Setenv("TMPDIR", "/hoge") session = podmanTest.Podman([]string{"info", "--format", "{{.Store.ImageCopyTmpDir}}"}) @@ -490,7 +491,7 @@ var _ = 
Describe("Verify podman containers.conf usage", func() { result := podmanTest.Podman([]string{"system", "service", "--help"}) result.WaitWithDefaultTimeout() Expect(result).Should(Exit(0)) - Expect(result.Out.Contents()).To(ContainSubstring("(default 1234)")) + Expect(result.OutputToString()).To(ContainSubstring("(default 1234)")) }) It("bad infra_image name", func() { @@ -512,17 +513,17 @@ var _ = Describe("Verify podman containers.conf usage", func() { result := podmanTest.Podman([]string{"pod", "create", "--infra-image", infra2}) result.WaitWithDefaultTimeout() Expect(result).Should(Exit(125)) - Expect(result.Err.Contents()).To(ContainSubstring(error2String)) + Expect(result.ErrorToString()).To(ContainSubstring(error2String)) result = podmanTest.Podman([]string{"pod", "create"}) result.WaitWithDefaultTimeout() Expect(result).Should(Exit(125)) - Expect(result.Err.Contents()).To(ContainSubstring(errorString)) + Expect(result.ErrorToString()).To(ContainSubstring(errorString)) result = podmanTest.Podman([]string{"create", "--pod", "new:pod1", ALPINE}) result.WaitWithDefaultTimeout() Expect(result).Should(Exit(125)) - Expect(result.Err.Contents()).To(ContainSubstring(errorString)) + Expect(result.ErrorToString()).To(ContainSubstring(errorString)) }) It("set .engine.remote=true", func() { diff --git a/test/system/410-selinux.bats b/test/system/410-selinux.bats index d23e687b29..2347fcc447 100644 --- a/test/system/410-selinux.bats +++ b/test/system/410-selinux.bats @@ -277,4 +277,20 @@ function check_label() { is "$output" "${RELABEL} $tmpdir" "Shared Relabel Correctly" } +@test "podman selinux nested" { + skip_if_no_selinux + + ROOTCONTEXT='rw,rootcontext="system_u:object_r:container_file_t:s0:c1,c2"' + SELINUXMNT="selinuxfs.*(rw,nosuid,noexec,relatime)" + + SELINUXMNT="tmpfs.*selinux.*\(ro" + run_podman run --rm --security-opt label=level:s0:c1,c2 $IMAGE mount + assert "$output" !~ "${ROOTCONTEXT}" "Don't use rootcontext" + assert "$output" =~ "${SELINUXMNT}" "Mount 
SELinux file system readwrite" + + run_podman run --rm --security-opt label=nested --security-opt label=level:s0:c1,c2 $IMAGE mount + assert "$output" =~ "${ROOTCONTEXT}" "Uses rootcontext" + assert "$output" =~ "${SELINUXMNT}" "Mount SELinux file system readwrite" +} + # vim: filetype=sh