podman/vendor/github.com/containers/common/pkg/unshare/unshare_linux.go

581 lines
18 KiB
Go

// +build linux
package unshare
import (
"bufio"
"bytes"
"fmt"
"io"
"os"
"os/exec"
"os/user"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
"github.com/containers/storage/pkg/idtools"
"github.com/containers/storage/pkg/reexec"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/syndtr/gocapability/capability"
)
// Cmd wraps an exec.Cmd created by the reexec package in unshare(), and
// handles setting ID maps and other related settings by triggering
// initialization code in the child.
type Cmd struct {
*exec.Cmd
UnshareFlags int
UseNewuidmap bool
UidMappings []specs.LinuxIDMapping
UseNewgidmap bool
GidMappings []specs.LinuxIDMapping
GidMappingsEnableSetgroups bool
Setsid bool
Setpgrp bool
Ctty *os.File
OOMScoreAdj *int
Hook func(pid int) error
}
// Command creates a new Cmd which can be customized.
func Command(args ...string) *Cmd {
cmd := reexec.Command(args...)
return &Cmd{
Cmd: cmd,
}
}
func (c *Cmd) Start() error {
runtime.LockOSThread()
defer runtime.UnlockOSThread()
// Set an environment variable to tell the child to synchronize its startup.
if c.Env == nil {
c.Env = os.Environ()
}
c.Env = append(c.Env, fmt.Sprintf("_Containers-unshare=%d", c.UnshareFlags))
// Please the libpod "rootless" package to find the expected env variables.
if os.Geteuid() != 0 {
c.Env = append(c.Env, "_CONTAINERS_USERNS_CONFIGURED=done")
c.Env = append(c.Env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%d", os.Geteuid()))
c.Env = append(c.Env, fmt.Sprintf("_CONTAINERS_ROOTLESS_GID=%d", os.Getegid()))
}
// Create the pipe for reading the child's PID.
pidRead, pidWrite, err := os.Pipe()
if err != nil {
return errors.Wrapf(err, "error creating pid pipe")
}
c.Env = append(c.Env, fmt.Sprintf("_Containers-pid-pipe=%d", len(c.ExtraFiles)+3))
c.ExtraFiles = append(c.ExtraFiles, pidWrite)
// Create the pipe for letting the child know to proceed.
continueRead, continueWrite, err := os.Pipe()
if err != nil {
pidRead.Close()
pidWrite.Close()
return errors.Wrapf(err, "error creating pid pipe")
}
c.Env = append(c.Env, fmt.Sprintf("_Containers-continue-pipe=%d", len(c.ExtraFiles)+3))
c.ExtraFiles = append(c.ExtraFiles, continueRead)
// Pass along other instructions.
if c.Setsid {
c.Env = append(c.Env, "_Containers-setsid=1")
}
if c.Setpgrp {
c.Env = append(c.Env, "_Containers-setpgrp=1")
}
if c.Ctty != nil {
c.Env = append(c.Env, fmt.Sprintf("_Containers-ctty=%d", len(c.ExtraFiles)+3))
c.ExtraFiles = append(c.ExtraFiles, c.Ctty)
}
// Make sure we clean up our pipes.
defer func() {
if pidRead != nil {
pidRead.Close()
}
if pidWrite != nil {
pidWrite.Close()
}
if continueRead != nil {
continueRead.Close()
}
if continueWrite != nil {
continueWrite.Close()
}
}()
// Start the new process.
err = c.Cmd.Start()
if err != nil {
return err
}
// Close the ends of the pipes that the parent doesn't need.
continueRead.Close()
continueRead = nil
pidWrite.Close()
pidWrite = nil
// Read the child's PID from the pipe.
pidString := ""
b := new(bytes.Buffer)
if _, err := io.Copy(b, pidRead); err != nil {
return errors.Wrapf(err, "error reading child PID")
}
pidString = b.String()
pid, err := strconv.Atoi(pidString)
if err != nil {
fmt.Fprintf(continueWrite, "error parsing PID %q: %v", pidString, err)
return errors.Wrapf(err, "error parsing PID %q", pidString)
}
pidString = fmt.Sprintf("%d", pid)
// If we created a new user namespace, set any specified mappings.
if c.UnshareFlags&syscall.CLONE_NEWUSER != 0 {
// Always set "setgroups".
setgroups, err := os.OpenFile(fmt.Sprintf("/proc/%s/setgroups", pidString), os.O_TRUNC|os.O_WRONLY, 0)
if err != nil {
fmt.Fprintf(continueWrite, "error opening setgroups: %v", err)
return errors.Wrapf(err, "error opening /proc/%s/setgroups", pidString)
}
defer setgroups.Close()
if c.GidMappingsEnableSetgroups {
if _, err := fmt.Fprintf(setgroups, "allow"); err != nil {
fmt.Fprintf(continueWrite, "error writing \"allow\" to setgroups: %v", err)
return errors.Wrapf(err, "error opening \"allow\" to /proc/%s/setgroups", pidString)
}
} else {
if _, err := fmt.Fprintf(setgroups, "deny"); err != nil {
fmt.Fprintf(continueWrite, "error writing \"deny\" to setgroups: %v", err)
return errors.Wrapf(err, "error writing \"deny\" to /proc/%s/setgroups", pidString)
}
}
if len(c.UidMappings) == 0 || len(c.GidMappings) == 0 {
uidmap, gidmap, err := GetHostIDMappings("")
if err != nil {
fmt.Fprintf(continueWrite, "error reading ID mappings in parent: %v", err)
return errors.Wrapf(err, "error reading ID mappings in parent")
}
if len(c.UidMappings) == 0 {
c.UidMappings = uidmap
for i := range c.UidMappings {
c.UidMappings[i].HostID = c.UidMappings[i].ContainerID
}
}
if len(c.GidMappings) == 0 {
c.GidMappings = gidmap
for i := range c.GidMappings {
c.GidMappings[i].HostID = c.GidMappings[i].ContainerID
}
}
}
if len(c.GidMappings) > 0 {
// Build the GID map, since writing to the proc file has to be done all at once.
g := new(bytes.Buffer)
for _, m := range c.GidMappings {
fmt.Fprintf(g, "%d %d %d\n", m.ContainerID, m.HostID, m.Size)
}
gidmapSet := false
// Set the GID map.
if c.UseNewgidmap {
cmd := exec.Command("newgidmap", append([]string{pidString}, strings.Fields(strings.Replace(g.String(), "\n", " ", -1))...)...)
g.Reset()
cmd.Stdout = g
cmd.Stderr = g
err := cmd.Run()
if err == nil {
gidmapSet = true
} else {
logrus.Warnf("error running newgidmap: %v: %s", err, g.String())
logrus.Warnf("falling back to single mapping")
g.Reset()
g.Write([]byte(fmt.Sprintf("0 %d 1\n", os.Getegid())))
}
}
if !gidmapSet {
if c.UseNewgidmap {
setgroups, err := os.OpenFile(fmt.Sprintf("/proc/%s/setgroups", pidString), os.O_TRUNC|os.O_WRONLY, 0)
if err != nil {
fmt.Fprintf(continueWrite, "error opening /proc/%s/setgroups: %v", pidString, err)
return errors.Wrapf(err, "error opening /proc/%s/setgroups", pidString)
}
defer setgroups.Close()
if _, err := fmt.Fprintf(setgroups, "deny"); err != nil {
fmt.Fprintf(continueWrite, "error writing 'deny' to /proc/%s/setgroups: %v", pidString, err)
return errors.Wrapf(err, "error writing 'deny' to /proc/%s/setgroups", pidString)
}
}
gidmap, err := os.OpenFile(fmt.Sprintf("/proc/%s/gid_map", pidString), os.O_TRUNC|os.O_WRONLY, 0)
if err != nil {
fmt.Fprintf(continueWrite, "error opening /proc/%s/gid_map: %v", pidString, err)
return errors.Wrapf(err, "error opening /proc/%s/gid_map", pidString)
}
defer gidmap.Close()
if _, err := fmt.Fprintf(gidmap, "%s", g.String()); err != nil {
fmt.Fprintf(continueWrite, "error writing %q to /proc/%s/gid_map: %v", g.String(), pidString, err)
return errors.Wrapf(err, "error writing %q to /proc/%s/gid_map", g.String(), pidString)
}
}
}
if len(c.UidMappings) > 0 {
// Build the UID map, since writing to the proc file has to be done all at once.
u := new(bytes.Buffer)
for _, m := range c.UidMappings {
fmt.Fprintf(u, "%d %d %d\n", m.ContainerID, m.HostID, m.Size)
}
uidmapSet := false
// Set the GID map.
if c.UseNewuidmap {
cmd := exec.Command("newuidmap", append([]string{pidString}, strings.Fields(strings.Replace(u.String(), "\n", " ", -1))...)...)
u.Reset()
cmd.Stdout = u
cmd.Stderr = u
err := cmd.Run()
if err == nil {
uidmapSet = true
} else {
logrus.Warnf("error running newuidmap: %v: %s", err, u.String())
logrus.Warnf("falling back to single mapping")
u.Reset()
u.Write([]byte(fmt.Sprintf("0 %d 1\n", os.Geteuid())))
}
}
if !uidmapSet {
uidmap, err := os.OpenFile(fmt.Sprintf("/proc/%s/uid_map", pidString), os.O_TRUNC|os.O_WRONLY, 0)
if err != nil {
fmt.Fprintf(continueWrite, "error opening /proc/%s/uid_map: %v", pidString, err)
return errors.Wrapf(err, "error opening /proc/%s/uid_map", pidString)
}
defer uidmap.Close()
if _, err := fmt.Fprintf(uidmap, "%s", u.String()); err != nil {
fmt.Fprintf(continueWrite, "error writing %q to /proc/%s/uid_map: %v", u.String(), pidString, err)
return errors.Wrapf(err, "error writing %q to /proc/%s/uid_map", u.String(), pidString)
}
}
}
}
if c.OOMScoreAdj != nil {
oomScoreAdj, err := os.OpenFile(fmt.Sprintf("/proc/%s/oom_score_adj", pidString), os.O_TRUNC|os.O_WRONLY, 0)
if err != nil {
fmt.Fprintf(continueWrite, "error opening oom_score_adj: %v", err)
return errors.Wrapf(err, "error opening /proc/%s/oom_score_adj", pidString)
}
defer oomScoreAdj.Close()
if _, err := fmt.Fprintf(oomScoreAdj, "%d\n", *c.OOMScoreAdj); err != nil {
fmt.Fprintf(continueWrite, "error writing \"%d\" to oom_score_adj: %v", c.OOMScoreAdj, err)
return errors.Wrapf(err, "error writing \"%d\" to /proc/%s/oom_score_adj", c.OOMScoreAdj, pidString)
}
}
// Run any additional setup that we want to do before the child starts running proper.
if c.Hook != nil {
if err = c.Hook(pid); err != nil {
fmt.Fprintf(continueWrite, "hook error: %v", err)
return err
}
}
return nil
}
func (c *Cmd) Run() error {
if err := c.Start(); err != nil {
return err
}
return c.Wait()
}
func (c *Cmd) CombinedOutput() ([]byte, error) {
return nil, errors.New("unshare: CombinedOutput() not implemented")
}
func (c *Cmd) Output() ([]byte, error) {
return nil, errors.New("unshare: Output() not implemented")
}
var (
isRootlessOnce sync.Once
isRootless bool
)
const (
// UsernsEnvName is the environment variable, if set indicates in rootless mode
UsernsEnvName = "_CONTAINERS_USERNS_CONFIGURED"
)
// IsRootless tells us if we are running in rootless mode
func IsRootless() bool {
isRootlessOnce.Do(func() {
isRootless = os.Geteuid() != 0 || os.Getenv(UsernsEnvName) != ""
})
return isRootless
}
// GetRootlessUID returns the UID of the user in the parent userNS
func GetRootlessUID() int {
uidEnv := os.Getenv("_CONTAINERS_ROOTLESS_UID")
if uidEnv != "" {
u, _ := strconv.Atoi(uidEnv)
return u
}
return os.Getuid()
}
// RootlessEnv returns the environment settings for the rootless containers
func RootlessEnv() []string {
return append(os.Environ(), UsernsEnvName+"=done")
}
type Runnable interface {
Run() error
}
func bailOnError(err error, format string, a ...interface{}) {
if err != nil {
if format != "" {
logrus.Errorf("%s: %v", fmt.Sprintf(format, a...), err)
} else {
logrus.Errorf("%v", err)
}
os.Exit(1)
}
}
// MaybeReexecUsingUserNamespace re-exec the process in a new namespace
func MaybeReexecUsingUserNamespace(evenForRoot bool) {
// If we've already been through this once, no need to try again.
if os.Geteuid() == 0 && IsRootless() {
return
}
var uidNum, gidNum uint64
// Figure out who we are.
me, err := user.Current()
if !os.IsNotExist(err) {
bailOnError(err, "error determining current user")
uidNum, err = strconv.ParseUint(me.Uid, 10, 32)
bailOnError(err, "error parsing current UID %s", me.Uid)
gidNum, err = strconv.ParseUint(me.Gid, 10, 32)
bailOnError(err, "error parsing current GID %s", me.Gid)
}
runtime.LockOSThread()
defer runtime.UnlockOSThread()
// ID mappings to use to reexec ourselves.
var uidmap, gidmap []specs.LinuxIDMapping
if uidNum != 0 || evenForRoot {
// Read the set of ID mappings that we're allowed to use. Each
// range in /etc/subuid and /etc/subgid file is a starting host
// ID and a range size.
uidmap, gidmap, err = GetSubIDMappings(me.Username, me.Username)
if err != nil {
logrus.Warnf("error reading allowed ID mappings: %v", err)
}
if len(uidmap) == 0 {
logrus.Warnf("Found no UID ranges set aside for user %q in /etc/subuid.", me.Username)
}
if len(gidmap) == 0 {
logrus.Warnf("Found no GID ranges set aside for user %q in /etc/subgid.", me.Username)
}
// Map our UID and GID, then the subuid and subgid ranges,
// consecutively, starting at 0, to get the mappings to use for
// a copy of ourselves.
uidmap = append([]specs.LinuxIDMapping{{HostID: uint32(uidNum), ContainerID: 0, Size: 1}}, uidmap...)
gidmap = append([]specs.LinuxIDMapping{{HostID: uint32(gidNum), ContainerID: 0, Size: 1}}, gidmap...)
var rangeStart uint32
for i := range uidmap {
uidmap[i].ContainerID = rangeStart
rangeStart += uidmap[i].Size
}
rangeStart = 0
for i := range gidmap {
gidmap[i].ContainerID = rangeStart
rangeStart += gidmap[i].Size
}
} else {
// If we have CAP_SYS_ADMIN, then we don't need to create a new namespace in order to be able
// to use unshare(), so don't bother creating a new user namespace at this point.
capabilities, err := capability.NewPid(0)
bailOnError(err, "error reading the current capabilities sets")
if capabilities.Get(capability.EFFECTIVE, capability.CAP_SYS_ADMIN) {
return
}
// Read the set of ID mappings that we're currently using.
uidmap, gidmap, err = GetHostIDMappings("")
bailOnError(err, "error reading current ID mappings")
// Just reuse them.
for i := range uidmap {
uidmap[i].HostID = uidmap[i].ContainerID
}
for i := range gidmap {
gidmap[i].HostID = gidmap[i].ContainerID
}
}
// Unlike most uses of reexec or unshare, we're using a name that
// _won't_ be recognized as a registered reexec handler, since we
// _want_ to fall through reexec.Init() to the normal main().
cmd := Command(append([]string{fmt.Sprintf("%s-in-a-user-namespace", os.Args[0])}, os.Args[1:]...)...)
// If, somehow, we don't become UID 0 in our child, indicate that the child shouldn't try again.
err = os.Setenv(UsernsEnvName, "1")
bailOnError(err, "error setting %s=1 in environment", UsernsEnvName)
// Set the default isolation type to use the "rootless" method.
if _, present := os.LookupEnv("BUILDAH_ISOLATION"); !present {
if err = os.Setenv("BUILDAH_ISOLATION", "rootless"); err != nil {
if err := os.Setenv("BUILDAH_ISOLATION", "rootless"); err != nil {
logrus.Errorf("error setting BUILDAH_ISOLATION=rootless in environment: %v", err)
os.Exit(1)
}
}
}
// Reuse our stdio.
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
// Set up a new user namespace with the ID mapping.
cmd.UnshareFlags = syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS
cmd.UseNewuidmap = uidNum != 0
cmd.UidMappings = uidmap
cmd.UseNewgidmap = uidNum != 0
cmd.GidMappings = gidmap
cmd.GidMappingsEnableSetgroups = true
// Finish up.
logrus.Debugf("running %+v with environment %+v, UID map %+v, and GID map %+v", cmd.Cmd.Args, os.Environ(), cmd.UidMappings, cmd.GidMappings)
ExecRunnable(cmd, nil)
}
// ExecRunnable runs the specified unshare command, captures its exit status,
// and exits with the same status.
func ExecRunnable(cmd Runnable, cleanup func()) {
exit := func(status int) {
if cleanup != nil {
cleanup()
}
os.Exit(status)
}
if err := cmd.Run(); err != nil {
if exitError, ok := errors.Cause(err).(*exec.ExitError); ok {
if exitError.ProcessState.Exited() {
if waitStatus, ok := exitError.ProcessState.Sys().(syscall.WaitStatus); ok {
if waitStatus.Exited() {
logrus.Errorf("%v", exitError)
exit(waitStatus.ExitStatus())
}
if waitStatus.Signaled() {
logrus.Errorf("%v", exitError)
exit(int(waitStatus.Signal()) + 128)
}
}
}
}
logrus.Errorf("%v", err)
logrus.Errorf("(unable to determine exit status)")
exit(1)
}
exit(0)
}
// getHostIDMappings reads mappings from the named node under /proc.
func getHostIDMappings(path string) ([]specs.LinuxIDMapping, error) {
var mappings []specs.LinuxIDMapping
f, err := os.Open(path)
if err != nil {
return nil, errors.Wrapf(err, "error reading ID mappings from %q", path)
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
fields := strings.Fields(line)
if len(fields) != 3 {
return nil, errors.Errorf("line %q from %q has %d fields, not 3", line, path, len(fields))
}
cid, err := strconv.ParseUint(fields[0], 10, 32)
if err != nil {
return nil, errors.Wrapf(err, "error parsing container ID value %q from line %q in %q", fields[0], line, path)
}
hid, err := strconv.ParseUint(fields[1], 10, 32)
if err != nil {
return nil, errors.Wrapf(err, "error parsing host ID value %q from line %q in %q", fields[1], line, path)
}
size, err := strconv.ParseUint(fields[2], 10, 32)
if err != nil {
return nil, errors.Wrapf(err, "error parsing size value %q from line %q in %q", fields[2], line, path)
}
mappings = append(mappings, specs.LinuxIDMapping{ContainerID: uint32(cid), HostID: uint32(hid), Size: uint32(size)})
}
return mappings, nil
}
// GetHostIDMappings reads mappings for the specified process (or the current
// process if pid is "self" or an empty string) from the kernel.
func GetHostIDMappings(pid string) ([]specs.LinuxIDMapping, []specs.LinuxIDMapping, error) {
if pid == "" {
pid = "self"
}
uidmap, err := getHostIDMappings(fmt.Sprintf("/proc/%s/uid_map", pid))
if err != nil {
return nil, nil, err
}
gidmap, err := getHostIDMappings(fmt.Sprintf("/proc/%s/gid_map", pid))
if err != nil {
return nil, nil, err
}
return uidmap, gidmap, nil
}
// GetSubIDMappings reads mappings from /etc/subuid and /etc/subgid.
func GetSubIDMappings(user, group string) ([]specs.LinuxIDMapping, []specs.LinuxIDMapping, error) {
mappings, err := idtools.NewIDMappings(user, group)
if err != nil {
return nil, nil, errors.Wrapf(err, "error reading subuid mappings for user %q and subgid mappings for group %q", user, group)
}
var uidmap, gidmap []specs.LinuxIDMapping
for _, m := range mappings.UIDs() {
uidmap = append(uidmap, specs.LinuxIDMapping{
ContainerID: uint32(m.ContainerID),
HostID: uint32(m.HostID),
Size: uint32(m.Size),
})
}
for _, m := range mappings.GIDs() {
gidmap = append(gidmap, specs.LinuxIDMapping{
ContainerID: uint32(m.ContainerID),
HostID: uint32(m.HostID),
Size: uint32(m.Size),
})
}
return uidmap, gidmap, nil
}
// ParseIDMappings parses mapping triples.
func ParseIDMappings(uidmap, gidmap []string) ([]idtools.IDMap, []idtools.IDMap, error) {
uid, err := idtools.ParseIDMap(uidmap, "userns-uid-map")
if err != nil {
return nil, nil, err
}
gid, err := idtools.ParseIDMap(gidmap, "userns-gid-map")
if err != nil {
return nil, nil, err
}
return uid, gid, nil
}