add new pkg/systemd to contain podman systemd cgroup code

In podman we have code to move a process into a new systemd cgroup. This
code lived in the podman utils package. Because the new rootlessnetns
must call into that move this code to c/common.

Instead of dumping this again into a "util" package create a systemd
package which should have a better name. Also move the cgroup code
directly into pkg/cgroup. I am sure we can do some cleanup there in a
followup to prevent duplication.

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
Paul Holzinger 2023-11-27 13:06:59 +01:00
parent 45b61c15d1
commit 8b81a2471f
7 changed files with 355 additions and 9 deletions

View File

@ -15,6 +15,7 @@ import (
"github.com/containers/common/libnetwork/slirp4netns"
"github.com/containers/common/pkg/config"
"github.com/containers/common/pkg/netns"
"github.com/containers/common/pkg/systemd"
"github.com/containers/storage/pkg/homedir"
"github.com/containers/storage/pkg/lockfile"
"github.com/hashicorp/go-multierror"
@ -163,15 +164,14 @@ func (n *Netns) setupSlirp4netns(nsPath string) error {
return wrapError("write slirp4netns pid file", err)
}
// FIXME
// if utils.RunsOnSystemd() {
// // move to systemd scope to prevent systemd from killing it
// err = utils.MoveRootlessNetnsSlirpProcessToUserSlice(res.Pid)
// if err != nil {
// // only log this, it is not fatal but can lead to issues when running podman inside systemd units
// logrus.Errorf("failed to move the rootless netns slirp4netns process to the systemd user.slice: %v", err)
// }
// }
if systemd.RunsOnSystemd() {
// move to systemd scope to prevent systemd from killing it
err = systemd.MoveRootlessNetnsSlirpProcessToUserSlice(res.Pid)
if err != nil {
// only log this, it is not fatal but can lead to issues when running podman inside systemd units
logrus.Errorf("failed to move the rootless netns slirp4netns process to the systemd user.slice: %v", err)
}
}
// build a new resolv.conf file which uses the slirp4netns dns server address
resolveIP, err := slirp4netns.GetDNS(res.Subnet)

View File

View File

@ -0,0 +1 @@
0::/other

View File

@ -0,0 +1 @@
0::/

View File

@ -4,6 +4,7 @@
package cgroups
import (
"bufio"
"bytes"
"errors"
"fmt"
@ -11,6 +12,7 @@ import (
"path"
"path/filepath"
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
@ -143,3 +145,171 @@ func SetBlkioThrottle(res *configs.Resources, cgroupPath string) error {
}
return nil
}
// Code below was moved from podman/utils/utils_supported.go and should properly better
// integrated here as some parts may be redundant.
func getCgroupProcess(procFile string, allowRoot bool) (string, error) {
f, err := os.Open(procFile)
if err != nil {
return "", err
}
defer f.Close()
scanner := bufio.NewScanner(f)
cgroup := ""
for scanner.Scan() {
line := scanner.Text()
parts := strings.SplitN(line, ":", 3)
if len(parts) != 3 {
return "", fmt.Errorf("cannot parse cgroup line %q", line)
}
if strings.HasPrefix(line, "0::") {
cgroup = line[3:]
break
}
if len(parts[2]) > len(cgroup) {
cgroup = parts[2]
}
}
if len(cgroup) == 0 || (!allowRoot && cgroup == "/") {
return "", fmt.Errorf("could not find cgroup mount in %q", procFile)
}
return cgroup, nil
}
// GetOwnCgroup returns the cgroup for the current process.
func GetOwnCgroup() (string, error) {
return getCgroupProcess("/proc/self/cgroup", true)
}
func GetOwnCgroupDisallowRoot() (string, error) {
return getCgroupProcess("/proc/self/cgroup", false)
}
// GetCgroupProcess returns the cgroup for the specified process process.
func GetCgroupProcess(pid int) (string, error) {
return getCgroupProcess(fmt.Sprintf("/proc/%d/cgroup", pid), true)
}
// MoveUnderCgroupSubtree moves the PID under a cgroup subtree.
func MoveUnderCgroupSubtree(subtree string) error {
return MoveUnderCgroup("", subtree, nil)
}
// MoveUnderCgroup moves a group of processes to a new cgroup.
// If cgroup is the empty string, then the current calling process cgroup is used.
// If processes is empty, then the processes from the current cgroup are moved.
func MoveUnderCgroup(cgroup, subtree string, processes []uint32) error {
procFile := "/proc/self/cgroup"
f, err := os.Open(procFile)
if err != nil {
return err
}
defer f.Close()
unifiedMode, err := IsCgroup2UnifiedMode()
if err != nil {
return err
}
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
parts := strings.SplitN(line, ":", 3)
if len(parts) != 3 {
return fmt.Errorf("cannot parse cgroup line %q", line)
}
// root cgroup, skip it
if parts[2] == "/" && !(unifiedMode && parts[1] == "") {
continue
}
cgroupRoot := "/sys/fs/cgroup"
// Special case the unified mount on hybrid cgroup and named hierarchies.
// This works on Fedora 31, but we should really parse the mounts to see
// where the cgroup hierarchy is mounted.
if parts[1] == "" && !unifiedMode {
// If it is not using unified mode, the cgroup v2 hierarchy is
// usually mounted under /sys/fs/cgroup/unified
cgroupRoot = filepath.Join(cgroupRoot, "unified")
// Ignore the unified mount if it doesn't exist
if _, err := os.Stat(cgroupRoot); err != nil && os.IsNotExist(err) {
continue
}
} else if parts[1] != "" {
// Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER.
controller := strings.TrimPrefix(parts[1], "name=")
cgroupRoot = filepath.Join(cgroupRoot, controller)
}
parentCgroup := cgroup
if parentCgroup == "" {
parentCgroup = parts[2]
}
newCgroup := filepath.Join(cgroupRoot, parentCgroup, subtree)
if err := os.MkdirAll(newCgroup, 0o755); err != nil && !os.IsExist(err) {
return err
}
f, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0o755)
if err != nil {
return err
}
defer f.Close()
if len(processes) > 0 {
for _, pid := range processes {
if _, err := f.WriteString(fmt.Sprintf("%d\n", pid)); err != nil {
logrus.Debugf("Cannot move process %d to cgroup %q: %v", pid, newCgroup, err)
}
}
} else {
processesData, err := os.ReadFile(filepath.Join(cgroupRoot, parts[2], "cgroup.procs"))
if err != nil {
return err
}
for _, pid := range bytes.Split(processesData, []byte("\n")) {
if len(pid) == 0 {
continue
}
if _, err := f.Write(pid); err != nil {
logrus.Debugf("Cannot move process %s to cgroup %q: %v", string(pid), newCgroup, err)
}
}
}
}
return nil
}
var (
maybeMoveToSubCgroupSync sync.Once
maybeMoveToSubCgroupSyncErr error
)
// MaybeMoveToSubCgroup moves the current process in a sub cgroup when
// it is running in the root cgroup on a system that uses cgroupv2.
func MaybeMoveToSubCgroup() error {
maybeMoveToSubCgroupSync.Do(func() {
unifiedMode, err := IsCgroup2UnifiedMode()
if err != nil {
maybeMoveToSubCgroupSyncErr = err
return
}
if !unifiedMode {
maybeMoveToSubCgroupSyncErr = nil
return
}
cgroup, err := GetOwnCgroup()
if err != nil {
maybeMoveToSubCgroupSyncErr = err
return
}
if cgroup == "/" {
maybeMoveToSubCgroupSyncErr = MoveUnderCgroupSubtree("init")
}
})
return maybeMoveToSubCgroupSyncErr
}

View File

@ -0,0 +1,23 @@
package cgroups
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestCgroupProcess(t *testing.T) {
val, err := getCgroupProcess("testdata/cgroup.root", true)
assert.Nil(t, err)
assert.Equal(t, "/", val)
_, err = getCgroupProcess("testdata/cgroup.root", false)
assert.NotNil(t, err)
val, err = getCgroupProcess("testdata/cgroup.other", true)
assert.Nil(t, err)
assert.Equal(t, "/other", val)
_, err = getCgroupProcess("testdata/cgroup.empty", true)
assert.NotNil(t, err)
}

View File

@ -0,0 +1,151 @@
package systemd
import (
"context"
"crypto/rand"
"fmt"
"os"
"strconv"
"sync"
"github.com/containers/common/pkg/cgroups"
"github.com/containers/storage/pkg/unshare"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/godbus/dbus/v5"
"github.com/sirupsen/logrus"
)
var (
runsOnSystemdOnce sync.Once
runsOnSystemd bool
)
// RunsOnSystemd returns whether the system is using systemd
func RunsOnSystemd() bool {
runsOnSystemdOnce.Do(func() {
// per sd_booted(3), check for this dir
fd, err := os.Stat("/run/systemd/system")
runsOnSystemd = err == nil && fd.IsDir()
})
return runsOnSystemd
}
func moveProcessPIDFileToScope(pidPath, slice, scope string) error {
data, err := os.ReadFile(pidPath)
if err != nil {
// do not raise an error if the file doesn't exist
if os.IsNotExist(err) {
return nil
}
return fmt.Errorf("cannot read pid file: %w", err)
}
pid, err := strconv.ParseUint(string(data), 10, 0)
if err != nil {
return fmt.Errorf("cannot parse pid file %s: %w", pidPath, err)
}
return moveProcessToScope(int(pid), slice, scope)
}
func moveProcessToScope(pid int, slice, scope string) error {
err := RunUnderSystemdScope(pid, slice, scope)
// If the PID is not valid anymore, do not return an error.
if dbusErr, ok := err.(dbus.Error); ok {
if dbusErr.Name == "org.freedesktop.DBus.Error.UnixProcessIdUnknown" {
return nil
}
}
return err
}
// MoveRootlessNetnsSlirpProcessToUserSlice moves the slirp4netns process for the rootless netns
// into a different scope so that systemd does not kill it with a container.
func MoveRootlessNetnsSlirpProcessToUserSlice(pid int) error {
randBytes := make([]byte, 4)
_, err := rand.Read(randBytes)
if err != nil {
return err
}
return moveProcessToScope(pid, "user.slice", fmt.Sprintf("rootless-netns-%x.scope", randBytes))
}
// MovePauseProcessToScope moves the pause process used for rootless mode to keep the namespaces alive to
// a separate scope.
func MovePauseProcessToScope(pausePidPath string) {
var err error
for i := 0; i < 10; i++ {
randBytes := make([]byte, 4)
_, err = rand.Read(randBytes)
if err != nil {
logrus.Errorf("failed to read random bytes: %v", err)
continue
}
err = moveProcessPIDFileToScope(pausePidPath, "user.slice", fmt.Sprintf("podman-pause-%x.scope", randBytes))
if err == nil {
return
}
}
if err != nil {
unified, err2 := cgroups.IsCgroup2UnifiedMode()
if err2 != nil {
logrus.Warnf("Failed to detect if running with cgroup unified: %v", err)
}
if RunsOnSystemd() && unified {
logrus.Warnf("Failed to add pause process to systemd sandbox cgroup: %v", err)
} else {
logrus.Debugf("Failed to add pause process to systemd sandbox cgroup: %v", err)
}
}
}
// RunUnderSystemdScope adds the specified pid to a systemd scope
func RunUnderSystemdScope(pid int, slice string, unitName string) error {
var properties []systemdDbus.Property
var conn *systemdDbus.Conn
var err error
if unshare.GetRootlessUID() != 0 {
conn, err = cgroups.UserConnection(unshare.GetRootlessUID())
if err != nil {
return err
}
} else {
conn, err = systemdDbus.NewWithContext(context.Background())
if err != nil {
return err
}
}
defer conn.Close()
properties = append(properties, systemdDbus.PropSlice(slice))
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
properties = append(properties, newProp("Delegate", true))
properties = append(properties, newProp("DefaultDependencies", false))
ch := make(chan string)
_, err = conn.StartTransientUnitContext(context.Background(), unitName, "replace", properties, ch)
if err != nil {
// On errors check if the cgroup already exists, if it does move the process there
if props, err := conn.GetUnitTypePropertiesContext(context.Background(), unitName, "Scope"); err == nil {
if cgroup, ok := props["ControlGroup"].(string); ok && cgroup != "" {
if err := cgroups.MoveUnderCgroup(cgroup, "", []uint32{uint32(pid)}); err == nil {
return nil
}
// On errors return the original error message we got from StartTransientUnit.
}
}
return err
}
// Block until job is started
<-ch
return nil
}
func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
Value: dbus.MakeVariant(units),
}
}