From bebf55c0f22d6723a27cd39561c0577aa557c5e1 Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 09:35:19 +0100 Subject: [PATCH 1/8] libpod: Move oci_conmon_linux.go to oci_conmon_common.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/{oci_conmon_linux.go => oci_conmon_common.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libpod/{oci_conmon_linux.go => oci_conmon_common.go} (100%) diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_common.go similarity index 100% rename from libpod/oci_conmon_linux.go rename to libpod/oci_conmon_common.go From 8d229c6cdc9ab7325bf1f246e1bab6af79e75afe Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 09:36:46 +0100 Subject: [PATCH 2/8] libpod: Move oci_conmon_attach_linux.go to oci_conmon_attach_common.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- .../{oci_conmon_attach_linux.go => oci_conmon_attach_common.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libpod/{oci_conmon_attach_linux.go => oci_conmon_attach_common.go} (100%) diff --git a/libpod/oci_conmon_attach_linux.go b/libpod/oci_conmon_attach_common.go similarity index 100% rename from libpod/oci_conmon_attach_linux.go rename to libpod/oci_conmon_attach_common.go From 68b2450d3de0344b2a4cfacdcabed8d1c854cb68 Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 09:37:07 +0100 Subject: [PATCH 3/8] libpod: Move oci_conmon_exec_linux.go to oci_conmon_exec_common.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/{oci_conmon_exec_linux.go => oci_conmon_exec_common.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libpod/{oci_conmon_exec_linux.go => oci_conmon_exec_common.go} (100%) diff --git a/libpod/oci_conmon_exec_linux.go b/libpod/oci_conmon_exec_common.go similarity index 100% rename from libpod/oci_conmon_exec_linux.go rename to libpod/oci_conmon_exec_common.go From 6791cdbdf153a0b3103810679995cc09ea8db340 Mon Sep 17 00:00:00 2001 From: Doug 
Rabson Date: Wed, 17 Aug 2022 10:29:40 +0100 Subject: [PATCH 4/8] libpod: Move rootless handling from oci_conmon_common.go to oci_conmon_linux.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_common.go | 56 +---------------------------- libpod/oci_conmon_linux.go | 70 +++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 55 deletions(-) create mode 100644 libpod/oci_conmon_linux.go diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 1b654ed33c..4ca2d6e341 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -41,7 +41,6 @@ import ( "github.com/containers/podman/v4/pkg/util" "github.com/containers/podman/v4/utils" "github.com/containers/storage/pkg/homedir" - pmount "github.com/containers/storage/pkg/mount" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" @@ -204,60 +203,7 @@ func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *Conta // if we are running a non privileged container, be sure to umount some kernel paths so they are not // bind mounted inside the container at all. if !ctr.config.Privileged && !rootless.IsRootless() { - type result struct { - restoreDuration int64 - err error - } - ch := make(chan result) - go func() { - runtime.LockOSThread() - restoreDuration, err := func() (int64, error) { - fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) - if err != nil { - return 0, err - } - defer errorhandling.CloseQuiet(fd) - - // create a new mountns on the current thread - if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { - return 0, err - } - defer func() { - if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { - logrus.Errorf("Unable to clone new namespace: %q", err) - } - }() - - // don't spread our mounts around. 
We are setting only /sys to be slave - // so that the cleanup process is still able to umount the storage and the - // changes are propagated to the host. - err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") - if err != nil { - return 0, fmt.Errorf("cannot make /sys slave: %w", err) - } - - mounts, err := pmount.GetMounts() - if err != nil { - return 0, err - } - for _, m := range mounts { - if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { - continue - } - err = unix.Unmount(m.Mountpoint, 0) - if err != nil && !os.IsNotExist(err) { - return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) - } - } - return r.createOCIContainer(ctr, restoreOptions) - }() - ch <- result{ - restoreDuration: restoreDuration, - err: err, - } - }() - r := <-ch - return r.restoreDuration, r.err + return r.createRootlessContainer(ctr, restoreOptions) } } return r.createOCIContainer(ctr, restoreOptions) diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go new file mode 100644 index 0000000000..4e8bbafd6d --- /dev/null +++ b/libpod/oci_conmon_linux.go @@ -0,0 +1,70 @@ +package libpod + +import ( + "fmt" + "os" + "runtime" + "strings" + + "github.com/containers/podman/v4/pkg/errorhandling" + pmount "github.com/containers/storage/pkg/mount" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + type result struct { + restoreDuration int64 + err error + } + ch := make(chan result) + go func() { + runtime.LockOSThread() + restoreDuration, err := func() (int64, error) { + fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) + if err != nil { + return 0, err + } + defer errorhandling.CloseQuiet(fd) + + // create a new mountns on the current thread + if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { + return 0, err + } + defer func() { + if err := unix.Setns(int(fd.Fd()), 
unix.CLONE_NEWNS); err != nil { + logrus.Errorf("Unable to clone new namespace: %q", err) + } + }() + + // don't spread our mounts around. We are setting only /sys to be slave + // so that the cleanup process is still able to umount the storage and the + // changes are propagated to the host. + err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") + if err != nil { + return 0, fmt.Errorf("cannot make /sys slave: %w", err) + } + + mounts, err := pmount.GetMounts() + if err != nil { + return 0, err + } + for _, m := range mounts { + if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { + continue + } + err = unix.Unmount(m.Mountpoint, 0) + if err != nil && !os.IsNotExist(err) { + return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) + } + } + return r.createOCIContainer(ctr, restoreOptions) + }() + ch <- result{ + restoreDuration: restoreDuration, + err: err, + } + }() + res := <-ch + return res.restoreDuration, res.err +} From 93bad904864aa71c45b6b72d217a752c05eb254b Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 10:30:30 +0100 Subject: [PATCH 5/8] libpod: Move socket label handling from oci_conmon_common.go to oci_conmon_linux.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_common.go | 24 +++++------------------- libpod/oci_conmon_linux.go | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 4ca2d6e341..aee0c36c80 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -16,7 +16,6 @@ import ( "os" "os/exec" "path/filepath" - "runtime" "strconv" "strings" "sync" @@ -42,7 +41,6 @@ import ( "github.com/containers/podman/v4/utils" "github.com/containers/storage/pkg/homedir" spec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -763,23 +761,11 @@ func (r 
*ConmonOCIRuntime) CheckpointContainer(ctr *Container, options Container env = append(env, fmt.Sprintf("PATH=%s", path)) } - runtime.LockOSThread() - if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { - return 0, err - } - - runtimeCheckpointStarted := time.Now() - err = utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...) - // Ignore error returned from SetSocketLabel("") call, - // can't recover. - if labelErr := label.SetSocketLabel(""); labelErr == nil { - // Unlock the thread only if the process label could be restored - // successfully. Otherwise leave the thread locked and the Go runtime - // will terminate it once it returns to the threads pool. - runtime.UnlockOSThread() - } else { - logrus.Errorf("Unable to reset socket label: %q", labelErr) - } + var runtimeCheckpointStarted time.Time + err = r.withContainerSocketLabel(ctr, func() error { + runtimeCheckpointStarted = time.Now() + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...) 
+ }) runtimeCheckpointDuration := func() int64 { if options.PrintStats { diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index 4e8bbafd6d..ce6eaf32a7 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -8,6 +8,7 @@ import ( "github.com/containers/podman/v4/pkg/errorhandling" pmount "github.com/containers/storage/pkg/mount" + "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -68,3 +69,23 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption res := <-ch return res.restoreDuration, res.err } + +// Run the closure with the container's socket label set +func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func() error) error { + runtime.LockOSThread() + if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { + return err + } + err := closure() + // Ignore error returned from SetSocketLabel("") call, + // can't recover. + if labelErr := label.SetSocketLabel(""); labelErr == nil { + // Unlock the thread only if the process label could be restored + // successfully. Otherwise leave the thread locked and the Go runtime + // will terminate it once it returns to the threads pool. 
+ runtime.UnlockOSThread() + } else { + logrus.Errorf("Unable to reset socket label: %q", labelErr) + } + return err +} From d43fac20f3025096cdfe45ae32f41886b39e4659 Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 11:15:37 +0100 Subject: [PATCH 6/8] libpod: Move moveConmonToCgroupAndSignal and GetLimits to oci_conmon_linux.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_common.go | 261 ----------------------------------- libpod/oci_conmon_linux.go | 267 ++++++++++++++++++++++++++++++++++++ 2 files changed, 267 insertions(+), 261 deletions(-) diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index aee0c36c80..222fec9ca4 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -23,10 +23,6 @@ import ( "text/template" "time" - runcconfig "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" - - "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/config" "github.com/containers/common/pkg/resize" cutil "github.com/containers/common/pkg/util" @@ -1338,75 +1334,6 @@ func startCommand(cmd *exec.Cmd, ctr *Container) error { return cmd.Start() } -// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup -// it then signals for conmon to start by sending nonce data down the start fd -func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { - mustCreateCgroup := true - - if ctr.config.NoCgroups { - mustCreateCgroup = false - } - - // If cgroup creation is disabled - just signal. - switch ctr.config.CgroupsMode { - case "disabled", "no-conmon", cgroupSplit: - mustCreateCgroup = false - } - - // $INVOCATION_ID is set by systemd when running as a service. 
- if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" { - mustCreateCgroup = false - } - - if mustCreateCgroup { - // Usually rootless users are not allowed to configure cgroupfs. - // There are cases though, where it is allowed, e.g. if the cgroup - // is manually configured and chowned). Avoid detecting all - // such cases and simply use a lower log level. - logLevel := logrus.WarnLevel - if rootless.IsRootless() { - logLevel = logrus.InfoLevel - } - // TODO: This should be a switch - we are not guaranteed that - // there are only 2 valid cgroup managers - cgroupParent := ctr.CgroupParent() - cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") - Resource := ctr.Spec().Linux.Resources - cgroupResources, err := GetLimits(Resource) - if err != nil { - logrus.StandardLogger().Log(logLevel, "Could not get ctr resources") - } - if ctr.CgroupManager() == config.SystemdCgroupsManager { - unitName := createUnitName("libpod-conmon", ctr.ID()) - realCgroupParent := cgroupParent - splitParent := strings.Split(cgroupParent, "/") - if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { - realCgroupParent = splitParent[len(splitParent)-1] - } - - logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) - if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { - logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err) - } - } else { - control, err := cgroups.New(cgroupPath, &cgroupResources) - if err != nil { - logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) - } else if err := control.AddPid(cmd.Process.Pid); err != nil { - // we need to remove this defer and delete the cgroup once conmon exits - // maybe need a conmon monitor? 
- logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) - } - } - } - - /* We set the cgroup, now the child can start creating children */ - if err := writeConmonPipeData(startFd); err != nil { - return err - } - return nil -} - // newPipe creates a unix socket pair for communication. // Returns two files - first is parent, second is child. func newPipe() (*os.File, *os.File, error) { @@ -1671,191 +1598,3 @@ func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, } } } - -// GetLimits converts spec resource limits to cgroup consumable limits -func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) { - if resource == nil { - resource = &spec.LinuxResources{} - } - final := &runcconfig.Resources{} - devs := []*devices.Rule{} - - // Devices - for _, entry := range resource.Devices { - if entry.Major == nil || entry.Minor == nil { - continue - } - runeType := 'a' - switch entry.Type { - case "b": - runeType = 'b' - case "c": - runeType = 'c' - } - - devs = append(devs, &devices.Rule{ - Type: devices.Type(runeType), - Major: *entry.Major, - Minor: *entry.Minor, - Permissions: devices.Permissions(entry.Access), - Allow: entry.Allow, - }) - } - final.Devices = devs - - // HugepageLimits - pageLimits := []*runcconfig.HugepageLimit{} - for _, entry := range resource.HugepageLimits { - pageLimits = append(pageLimits, &runcconfig.HugepageLimit{ - Pagesize: entry.Pagesize, - Limit: entry.Limit, - }) - } - final.HugetlbLimit = pageLimits - - // Networking - netPriorities := []*runcconfig.IfPrioMap{} - if resource.Network != nil { - for _, entry := range resource.Network.Priorities { - netPriorities = append(netPriorities, &runcconfig.IfPrioMap{ - Interface: entry.Name, - Priority: int64(entry.Priority), - }) - } - } - final.NetPrioIfpriomap = netPriorities - rdma := make(map[string]runcconfig.LinuxRdma) - for name, entry := range resource.Rdma { - rdma[name] = 
runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects} - } - final.Rdma = rdma - - // Memory - if resource.Memory != nil { - if resource.Memory.Limit != nil { - final.Memory = *resource.Memory.Limit - } - if resource.Memory.Reservation != nil { - final.MemoryReservation = *resource.Memory.Reservation - } - if resource.Memory.Swap != nil { - final.MemorySwap = *resource.Memory.Swap - } - if resource.Memory.Swappiness != nil { - final.MemorySwappiness = resource.Memory.Swappiness - } - } - - // CPU - if resource.CPU != nil { - if resource.CPU.Period != nil { - final.CpuPeriod = *resource.CPU.Period - } - if resource.CPU.Quota != nil { - final.CpuQuota = *resource.CPU.Quota - } - if resource.CPU.RealtimePeriod != nil { - final.CpuRtPeriod = *resource.CPU.RealtimePeriod - } - if resource.CPU.RealtimeRuntime != nil { - final.CpuRtRuntime = *resource.CPU.RealtimeRuntime - } - if resource.CPU.Shares != nil { - final.CpuShares = *resource.CPU.Shares - } - final.CpusetCpus = resource.CPU.Cpus - final.CpusetMems = resource.CPU.Mems - } - - // BlkIO - if resource.BlockIO != nil { - if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleReadBpsDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle) - } - } - if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle) - } - } - if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 { - for _, entry := 
range resource.BlockIO.ThrottleReadIOPSDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle) - } - } - if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle) - } - } - if resource.BlockIO.LeafWeight != nil { - final.BlkioLeafWeight = *resource.BlockIO.LeafWeight - } - if resource.BlockIO.Weight != nil { - final.BlkioWeight = *resource.BlockIO.Weight - } - if len(resource.BlockIO.WeightDevice) > 0 { - for _, entry := range resource.BlockIO.WeightDevice { - weight := &runcconfig.WeightDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - if entry.Weight != nil { - weight.Weight = *entry.Weight - } - if entry.LeafWeight != nil { - weight.LeafWeight = *entry.LeafWeight - } - weight.BlockIODevice = *dev - final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight) - } - } - } - - // Pids - if resource.Pids != nil { - final.PidsLimit = resource.Pids.Limit - } - - // Networking - if resource.Network != nil { - if resource.Network.ClassID != nil { - final.NetClsClassid = *resource.Network.ClassID - } - } - - // Unified state - final.Unified = resource.Unified - - return *final, nil -} diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index ce6eaf32a7..0964d4ea37 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -3,11 +3,21 @@ package libpod import ( "fmt" "os" + "os/exec" + "path/filepath" "runtime" "strings" + 
runcconfig "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + + "github.com/containers/common/pkg/cgroups" + "github.com/containers/common/pkg/config" "github.com/containers/podman/v4/pkg/errorhandling" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/podman/v4/utils" pmount "github.com/containers/storage/pkg/mount" + spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" @@ -89,3 +99,260 @@ func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func } return err } + +// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup +// it then signals for conmon to start by sending nonce data down the start fd +func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { + mustCreateCgroup := true + + if ctr.config.NoCgroups { + mustCreateCgroup = false + } + + // If cgroup creation is disabled - just signal. + switch ctr.config.CgroupsMode { + case "disabled", "no-conmon", cgroupSplit: + mustCreateCgroup = false + } + + // $INVOCATION_ID is set by systemd when running as a service. + if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" { + mustCreateCgroup = false + } + + if mustCreateCgroup { + // Usually rootless users are not allowed to configure cgroupfs. + // There are cases though, where it is allowed, e.g. if the cgroup + // is manually configured and chowned). Avoid detecting all + // such cases and simply use a lower log level. 
+ logLevel := logrus.WarnLevel + if rootless.IsRootless() { + logLevel = logrus.InfoLevel + } + // TODO: This should be a switch - we are not guaranteed that + // there are only 2 valid cgroup managers + cgroupParent := ctr.CgroupParent() + cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") + Resource := ctr.Spec().Linux.Resources + cgroupResources, err := GetLimits(Resource) + if err != nil { + logrus.StandardLogger().Log(logLevel, "Could not get ctr resources") + } + if ctr.CgroupManager() == config.SystemdCgroupsManager { + unitName := createUnitName("libpod-conmon", ctr.ID()) + realCgroupParent := cgroupParent + splitParent := strings.Split(cgroupParent, "/") + if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { + realCgroupParent = splitParent[len(splitParent)-1] + } + + logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) + if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err) + } + } else { + control, err := cgroups.New(cgroupPath, &cgroupResources) + if err != nil { + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } else if err := control.AddPid(cmd.Process.Pid); err != nil { + // we need to remove this defer and delete the cgroup once conmon exits + // maybe need a conmon monitor? 
+ logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } + } + } + + /* We set the cgroup, now the child can start creating children */ + if err := writeConmonPipeData(startFd); err != nil { + return err + } + return nil +} + +// GetLimits converts spec resource limits to cgroup consumable limits +func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) { + if resource == nil { + resource = &spec.LinuxResources{} + } + final := &runcconfig.Resources{} + devs := []*devices.Rule{} + + // Devices + for _, entry := range resource.Devices { + if entry.Major == nil || entry.Minor == nil { + continue + } + runeType := 'a' + switch entry.Type { + case "b": + runeType = 'b' + case "c": + runeType = 'c' + } + + devs = append(devs, &devices.Rule{ + Type: devices.Type(runeType), + Major: *entry.Major, + Minor: *entry.Minor, + Permissions: devices.Permissions(entry.Access), + Allow: entry.Allow, + }) + } + final.Devices = devs + + // HugepageLimits + pageLimits := []*runcconfig.HugepageLimit{} + for _, entry := range resource.HugepageLimits { + pageLimits = append(pageLimits, &runcconfig.HugepageLimit{ + Pagesize: entry.Pagesize, + Limit: entry.Limit, + }) + } + final.HugetlbLimit = pageLimits + + // Networking + netPriorities := []*runcconfig.IfPrioMap{} + if resource.Network != nil { + for _, entry := range resource.Network.Priorities { + netPriorities = append(netPriorities, &runcconfig.IfPrioMap{ + Interface: entry.Name, + Priority: int64(entry.Priority), + }) + } + } + final.NetPrioIfpriomap = netPriorities + rdma := make(map[string]runcconfig.LinuxRdma) + for name, entry := range resource.Rdma { + rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects} + } + final.Rdma = rdma + + // Memory + if resource.Memory != nil { + if resource.Memory.Limit != nil { + final.Memory = *resource.Memory.Limit + } + if resource.Memory.Reservation != nil { + final.MemoryReservation = 
*resource.Memory.Reservation + } + if resource.Memory.Swap != nil { + final.MemorySwap = *resource.Memory.Swap + } + if resource.Memory.Swappiness != nil { + final.MemorySwappiness = resource.Memory.Swappiness + } + } + + // CPU + if resource.CPU != nil { + if resource.CPU.Period != nil { + final.CpuPeriod = *resource.CPU.Period + } + if resource.CPU.Quota != nil { + final.CpuQuota = *resource.CPU.Quota + } + if resource.CPU.RealtimePeriod != nil { + final.CpuRtPeriod = *resource.CPU.RealtimePeriod + } + if resource.CPU.RealtimeRuntime != nil { + final.CpuRtRuntime = *resource.CPU.RealtimeRuntime + } + if resource.CPU.Shares != nil { + final.CpuShares = *resource.CPU.Shares + } + final.CpusetCpus = resource.CPU.Cpus + final.CpusetMems = resource.CPU.Mems + } + + // BlkIO + if resource.BlockIO != nil { + if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadIOPSDevice = 
append(final.BlkioThrottleReadIOPSDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle) + } + } + if resource.BlockIO.LeafWeight != nil { + final.BlkioLeafWeight = *resource.BlockIO.LeafWeight + } + if resource.BlockIO.Weight != nil { + final.BlkioWeight = *resource.BlockIO.Weight + } + if len(resource.BlockIO.WeightDevice) > 0 { + for _, entry := range resource.BlockIO.WeightDevice { + weight := &runcconfig.WeightDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + if entry.Weight != nil { + weight.Weight = *entry.Weight + } + if entry.LeafWeight != nil { + weight.LeafWeight = *entry.LeafWeight + } + weight.BlockIODevice = *dev + final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight) + } + } + } + + // Pids + if resource.Pids != nil { + final.PidsLimit = resource.Pids.Limit + } + + // Networking + if resource.Network != nil { + if resource.Network.ClassID != nil { + final.NetClsClassid = *resource.Network.ClassID + } + } + + // Unified state + final.Unified = resource.Unified + + return *final, nil +} From cb4158889e7a115b4d8bb77c76cc99032d5e8363 Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 11:18:15 +0100 Subject: [PATCH 7/8] libpod: Move openUnixSocket to oci_conmon_attach_linux.go This function depends on linux-specific functionality in /proc/fd to allow connecting to local domain sockets with pathnames too long for sockaddr_un. 
[NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_attach_common.go | 9 --------- libpod/oci_conmon_attach_linux.go | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) create mode 100644 libpod/oci_conmon_attach_linux.go diff --git a/libpod/oci_conmon_attach_common.go b/libpod/oci_conmon_attach_common.go index aa55aa6f53..adc3745030 100644 --- a/libpod/oci_conmon_attach_common.go +++ b/libpod/oci_conmon_attach_common.go @@ -29,15 +29,6 @@ const ( AttachPipeStderr = 3 ) -func openUnixSocket(path string) (*net.UnixConn, error) { - fd, err := unix.Open(path, unix.O_PATH, 0) - if err != nil { - return nil, err - } - defer unix.Close(fd) - return net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: fmt.Sprintf("/proc/self/fd/%d", fd), Net: "unixpacket"}) -} - // Attach to the given container. // Does not check if state is appropriate. // started is only required if startContainer is true. diff --git a/libpod/oci_conmon_attach_linux.go b/libpod/oci_conmon_attach_linux.go new file mode 100644 index 0000000000..f1aa89d3e9 --- /dev/null +++ b/libpod/oci_conmon_attach_linux.go @@ -0,0 +1,17 @@ +package libpod + +import ( + "fmt" + "net" + + "golang.org/x/sys/unix" +) + +func openUnixSocket(path string) (*net.UnixConn, error) { + fd, err := unix.Open(path, unix.O_PATH, 0) + if err != nil { + return nil, err + } + defer unix.Close(fd) + return net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: fmt.Sprintf("/proc/self/fd/%d", fd), Net: "unixpacket"}) +} From 054d64710736250c4d238e159884c1588eb7218a Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 09:43:43 +0100 Subject: [PATCH 8/8] libpod: Build oci_conmon_common.go and oci_conmon_attach_common on FreeBSD This also adds FreeBSD equivalents to the functions moved to oci_conmon*_linux.go. For openUnixSocket, we create a temporary symlink to shorten the path to something that fits into sockaddr_un. 
[NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/networking_unsupported.go | 7 +++++++ libpod/oci_conmon_attach_common.go | 4 ++-- libpod/oci_conmon_attach_freebsd.go | 21 +++++++++++++++++++++ libpod/oci_conmon_common.go | 4 ++-- libpod/oci_conmon_freebsd.go | 24 ++++++++++++++++++++++++ libpod/oci_conmon_unsupported.go | 4 ++-- 6 files changed, 58 insertions(+), 6 deletions(-) create mode 100644 libpod/oci_conmon_attach_freebsd.go create mode 100644 libpod/oci_conmon_freebsd.go diff --git a/libpod/networking_unsupported.go b/libpod/networking_unsupported.go index 227b512cdb..76ffabb5ed 100644 --- a/libpod/networking_unsupported.go +++ b/libpod/networking_unsupported.go @@ -77,3 +77,10 @@ func (r *RootlessNetNS) Cleanup(runtime *Runtime) error { func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { return nil, errors.New("not implemented (*Runtime) GetRootlessNetNs") } + +// convertPortMappings will remove the HostIP part from the ports when running inside podman machine. +// This is needed because a HostIP of 127.0.0.1 would not allow the gvproxy forwarder to reach the open ports. +// For machine the HostIP must only be used by gvproxy and never in the VM. 
+func (c *Container) convertPortMappings() []types.PortMapping { + return []types.PortMapping{} +} diff --git a/libpod/oci_conmon_attach_common.go b/libpod/oci_conmon_attach_common.go index adc3745030..a9e9b2bb53 100644 --- a/libpod/oci_conmon_attach_common.go +++ b/libpod/oci_conmon_attach_common.go @@ -1,5 +1,5 @@ -//go:build linux -// +build linux +//go:build linux || freebsd +// +build linux freebsd package libpod diff --git a/libpod/oci_conmon_attach_freebsd.go b/libpod/oci_conmon_attach_freebsd.go new file mode 100644 index 0000000000..de00543814 --- /dev/null +++ b/libpod/oci_conmon_attach_freebsd.go @@ -0,0 +1,21 @@ +package libpod + +import ( + "net" + "os" + "path/filepath" +) + +func openUnixSocket(path string) (*net.UnixConn, error) { + // socket paths can be too long to fit into a sockaddr_un so we create a shorter symlink. + tmpdir, err := os.MkdirTemp("", "podman") + if err != nil { + return nil, err + } + defer os.RemoveAll(tmpdir) + tmpsockpath := filepath.Join(tmpdir, "sock") + if err := os.Symlink(path, tmpsockpath); err != nil { + return nil, err + } + return net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: tmpsockpath, Net: "unixpacket"}) +} diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 222fec9ca4..c3725cdb46 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -1,5 +1,5 @@ -//go:build linux -// +build linux +//go:build linux || freebsd +// +build linux freebsd package libpod diff --git a/libpod/oci_conmon_freebsd.go b/libpod/oci_conmon_freebsd.go new file mode 100644 index 0000000000..6f7ac7fc68 --- /dev/null +++ b/libpod/oci_conmon_freebsd.go @@ -0,0 +1,24 @@ +package libpod + +import ( + "errors" + "os" + "os/exec" +) + +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + return -1, errors.New("unsupported (*ConmonOCIRuntime) createRootlessContainer") +} + +// Run the closure with the container's socket 
label set +func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func() error) error { + // No label support yet + return closure() +} + +// moveConmonToCgroupAndSignal is the FreeBSD stand-in for the Linux function that moves the conmon process into the +// container's cgroup and signals it to start; there is no equivalent on FreeBSD, so it does nothing and returns nil +func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { + // No equivalent on FreeBSD + return nil +} diff --git a/libpod/oci_conmon_unsupported.go b/libpod/oci_conmon_unsupported.go index c72dc0f0d7..cc6d68e894 100644 --- a/libpod/oci_conmon_unsupported.go +++ b/libpod/oci_conmon_unsupported.go @@ -1,5 +1,5 @@ -//go:build !linux -// +build !linux +//go:build !linux && !freebsd +// +build !linux,!freebsd package libpod