vendor: update runc/libcontainer

This includes all of v0.0.8 as well as a few bug fixes that popped up
during vendoring.

Signed-off-by: Aleksa Sarai <asarai@suse.com>
This commit is contained in:
Aleksa Sarai 2016-01-26 18:05:13 -05:00
parent 7eed9a642e
commit 093dd39686
41 changed files with 1328 additions and 532 deletions

View File

@ -59,7 +59,7 @@ clone git github.com/miekg/pkcs11 80f102b5cac759de406949c47f0928b99bd64cdf
clone git github.com/docker/go v1.5.1-1-1-gbaf439e clone git github.com/docker/go v1.5.1-1-1-gbaf439e
clone git github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c clone git github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c
clone git github.com/opencontainers/runc 3d8a20bb772defc28c355534d83486416d1719b4 # libcontainer clone git github.com/opencontainers/runc ce72f86a2b54bc114d6ffb51f6500479b2d42154 # libcontainer
clone git github.com/seccomp/libseccomp-golang 1b506fc7c24eec5a3693cdcbed40d9c226cfc6a1 clone git github.com/seccomp/libseccomp-golang 1b506fc7c24eec5a3693cdcbed40d9c226cfc6a1
# libcontainer deps (see src/github.com/opencontainers/runc/Godeps/Godeps.json) # libcontainer deps (see src/github.com/opencontainers/runc/Godeps/Godeps.json)
clone git github.com/coreos/go-systemd v4 clone git github.com/coreos/go-systemd v4

View File

@ -10,80 +10,165 @@ host system and which is (optionally) isolated from other containers in the syst
#### Using libcontainer #### Using libcontainer
To create a container you first have to initialize an instance of a factory Because containers are spawned in a two step process you will need a binary that
that will handle the creation and initialization for a container. will be executed as the init process for the container. In libcontainer, we use
the current binary (/proc/self/exe) to be executed as the init process, and use
Because containers are spawned in a two step process you will need to provide arg "init", we call the first step process "bootstrap", so you always need a "init"
arguments to a binary that will be executed as the init process for the container. function as the entry of "bootstrap".
To use the current binary that is spawning the containers and acting as the parent
you can use `os.Args[0]` and we have a command called `init` setup.
```go ```go
root, err := libcontainer.New("/var/lib/container", libcontainer.InitArgs(os.Args[0], "init")) func init() {
if len(os.Args) > 1 && os.Args[1] == "init" {
runtime.GOMAXPROCS(1)
runtime.LockOSThread()
factory, _ := libcontainer.New("")
if err := factory.StartInitialization(); err != nil {
logrus.Fatal(err)
}
panic("--this line should have never been executed, congratulations--")
}
}
```
Then to create a container you first have to initialize an instance of a factory
that will handle the creation and initialization for a container.
```go
factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
if err != nil { if err != nil {
log.Fatal(err) logrus.Fatal(err)
return
} }
``` ```
Once you have an instance of the factory created we can create a configuration Once you have an instance of the factory created we can create a configuration
struct describing how the container is to be created. A sample would look similar to this: struct describing how the container is to be created. A sample would look similar to this:
```go ```go
defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
config := &configs.Config{ config := &configs.Config{
Rootfs: rootfs, Rootfs: "/your/path/to/rootfs",
Capabilities: []string{ Capabilities: []string{
"CAP_CHOWN", "CAP_CHOWN",
"CAP_DAC_OVERRIDE", "CAP_DAC_OVERRIDE",
"CAP_FSETID", "CAP_FSETID",
"CAP_FOWNER", "CAP_FOWNER",
"CAP_MKNOD", "CAP_MKNOD",
"CAP_NET_RAW", "CAP_NET_RAW",
"CAP_SETGID", "CAP_SETGID",
"CAP_SETUID", "CAP_SETUID",
"CAP_SETFCAP", "CAP_SETFCAP",
"CAP_SETPCAP", "CAP_SETPCAP",
"CAP_NET_BIND_SERVICE", "CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT", "CAP_SYS_CHROOT",
"CAP_KILL", "CAP_KILL",
"CAP_AUDIT_WRITE", "CAP_AUDIT_WRITE",
}, },
Namespaces: configs.Namespaces([]configs.Namespace{ Namespaces: configs.Namespaces([]configs.Namespace{
{Type: configs.NEWNS}, {Type: configs.NEWNS},
{Type: configs.NEWUTS}, {Type: configs.NEWUTS},
{Type: configs.NEWIPC}, {Type: configs.NEWIPC},
{Type: configs.NEWPID}, {Type: configs.NEWPID},
{Type: configs.NEWNET}, {Type: configs.NEWUSER},
}), {Type: configs.NEWNET},
Cgroups: &configs.Cgroup{ }),
Name: "test-container", Cgroups: &configs.Cgroup{
Parent: "system", Name: "test-container",
AllowAllDevices: false, Parent: "system",
AllowedDevices: configs.DefaultAllowedDevices, Resources: &configs.Resources{
}, MemorySwappiness: -1,
AllowAllDevices: false,
Devices: configs.DefaultAutoCreatedDevices, AllowedDevices: configs.DefaultAllowedDevices,
Hostname: "testing", },
Networks: []*configs.Network{ },
{ MaskPaths: []string{
Type: "loopback", "/proc/kcore",
Address: "127.0.0.1/0", },
Gateway: "localhost", ReadonlyPaths: []string{
}, "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
}, },
Rlimits: []configs.Rlimit{ Devices: configs.DefaultAutoCreatedDevices,
{ Hostname: "testing",
Type: syscall.RLIMIT_NOFILE, Mounts: []*configs.Mount{
Hard: uint64(1024), {
Soft: uint64(1024), Source: "proc",
}, Destination: "/proc",
}, Device: "proc",
Flags: defaultMountFlags,
},
{
Source: "tmpfs",
Destination: "/dev",
Device: "tmpfs",
Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME,
Data: "mode=755",
},
{
Source: "devpts",
Destination: "/dev/pts",
Device: "devpts",
Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC,
Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
},
{
Device: "tmpfs",
Source: "shm",
Destination: "/dev/shm",
Data: "mode=1777,size=65536k",
Flags: defaultMountFlags,
},
{
Source: "mqueue",
Destination: "/dev/mqueue",
Device: "mqueue",
Flags: defaultMountFlags,
},
{
Source: "sysfs",
Destination: "/sys",
Device: "sysfs",
Flags: defaultMountFlags | syscall.MS_RDONLY,
},
},
UidMappings: []configs.IDMap{
{
ContainerID: 0,
Host: 1000,
size: 65536,
},
},
GidMappings: []configs.IDMap{
{
ContainerID: 0,
Host: 1000,
size: 65536,
},
},
Networks: []*configs.Network{
{
Type: "loopback",
Address: "127.0.0.1/0",
Gateway: "localhost",
},
},
Rlimits: []configs.Rlimit{
{
Type: syscall.RLIMIT_NOFILE,
Hard: uint64(1025),
Soft: uint64(1025),
},
},
} }
``` ```
Once you have the configuration populated you can create a container: Once you have the configuration populated you can create a container:
```go ```go
container, err := root.Create("container-id", config) container, err := factory.Create("container-id", config)
if err != nil {
logrus.Fatal(err)
return
}
``` ```
To spawn bash as the initial process inside the container and have the To spawn bash as the initial process inside the container and have the
@ -91,23 +176,25 @@ processes pid returned in order to wait, signal, or kill the process:
```go ```go
process := &libcontainer.Process{ process := &libcontainer.Process{
Args: []string{"/bin/bash"}, Args: []string{"/bin/bash"},
Env: []string{"PATH=/bin"}, Env: []string{"PATH=/bin"},
User: "daemon", User: "daemon",
Stdin: os.Stdin, Stdin: os.Stdin,
Stdout: os.Stdout, Stdout: os.Stdout,
Stderr: os.Stderr, Stderr: os.Stderr,
} }
err := container.Start(process) err := container.Start(process)
if err != nil { if err != nil {
log.Fatal(err) logrus.Fatal(err)
container.Destroy()
return
} }
// wait for the process to finish. // wait for the process to finish.
status, err := process.Wait() _, err := process.Wait()
if err != nil { if err != nil {
log.Fatal(err) logrus.Fatal(err)
} }
// destroy the container. // destroy the container.
@ -124,7 +211,6 @@ processes, err := container.Processes()
// its processes. // its processes.
stats, err := container.Stats() stats, err := container.Stats()
// pause all processes inside the container. // pause all processes inside the container.
container.Pause() container.Pause()

View File

@ -60,7 +60,7 @@ are required to be mounted within the rootfs that the runtime will setup.
After a container's filesystems are mounted within the newly created After a container's filesystems are mounted within the newly created
mount namespace `/dev` will need to be populated with a set of device nodes. mount namespace `/dev` will need to be populated with a set of device nodes.
It is expected that a rootfs does not need to have any device nodes specified It is expected that a rootfs does not need to have any device nodes specified
for `/dev` witin the rootfs as the container will setup the correct devices for `/dev` within the rootfs as the container will setup the correct devices
that are required for executing a container's process. that are required for executing a container's process.
| Path | Mode | Access | | Path | Mode | Access |
@ -142,6 +142,7 @@ system resources like cpu, memory, and device access.
| perf_event | 1 | | perf_event | 1 |
| freezer | 1 | | freezer | 1 |
| hugetlb | 1 | | hugetlb | 1 |
| pids | 1 |
All cgroup subsystems are joined so that statistics can be collected from All cgroup subsystems are joined so that statistics can be collected from
@ -199,7 +200,7 @@ provide a good default for security and flexibility for the applications.
| CAP_SYS_BOOT | 0 | | CAP_SYS_BOOT | 0 |
| CAP_LEASE | 0 | | CAP_LEASE | 0 |
| CAP_WAKE_ALARM | 0 | | CAP_WAKE_ALARM | 0 |
| CAP_BLOCK_SUSPE | 0 | | CAP_BLOCK_SUSPEND | 0 |
Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor) Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)

View File

@ -15,6 +15,9 @@ type Manager interface {
// Returns the PIDs inside the cgroup set // Returns the PIDs inside the cgroup set
GetPids() ([]int, error) GetPids() ([]int, error)
// Returns the PIDs inside the cgroup set & all sub-cgroups
GetAllPids() ([]int, error)
// Returns statistics for the cgroup set // Returns statistics for the cgroup set
GetStats() (*Stats, error) GetStats() (*Stats, error)

View File

@ -14,6 +14,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
) )
var ( var (
@ -23,6 +24,7 @@ var (
&MemoryGroup{}, &MemoryGroup{},
&CpuGroup{}, &CpuGroup{},
&CpuacctGroup{}, &CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{}, &BlkioGroup{},
&HugetlbGroup{}, &HugetlbGroup{},
&NetClsGroup{}, &NetClsGroup{},
@ -93,11 +95,10 @@ func getCgroupRoot() (string, error) {
} }
type cgroupData struct { type cgroupData struct {
root string root string
parent string innerPath string
name string config *configs.Cgroup
config *configs.Cgroup pid int
pid int
} }
func (m *Manager) Apply(pid int) (err error) { func (m *Manager) Apply(pid int) (err error) {
@ -112,6 +113,22 @@ func (m *Manager) Apply(pid int) (err error) {
return err return err
} }
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
paths := make(map[string]string) paths := make(map[string]string)
defer func() { defer func() {
if err != nil { if err != nil {
@ -135,17 +152,13 @@ func (m *Manager) Apply(pid int) (err error) {
paths[sys.Name()] = p paths[sys.Name()] = p
} }
m.Paths = paths m.Paths = paths
if paths["cpu"] != "" {
if err := CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil {
return err
}
}
return nil return nil
} }
func (m *Manager) Destroy() error { func (m *Manager) Destroy() error {
if m.Cgroups.Paths != nil {
return nil
}
m.mu.Lock() m.mu.Lock()
defer m.mu.Unlock() defer m.mu.Unlock()
if err := cgroups.RemovePaths(m.Paths); err != nil { if err := cgroups.RemovePaths(m.Paths); err != nil {
@ -179,15 +192,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
} }
func (m *Manager) Set(container *configs.Config) error { func (m *Manager) Set(container *configs.Config) error {
for name, path := range m.Paths { for _, sys := range subsystems {
sys, err := subsystems.Get(name) // Generate fake cgroup data.
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { d, err := getCgroupData(container.Cgroups, -1)
continue if err != nil {
return err
} }
// Get the path, but don't error out if the cgroup wasn't found.
path, err := d.path(sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := sys.Set(path, container.Cgroups); err != nil { if err := sys.Set(path, container.Cgroups); err != nil {
return err return err
} }
} }
if m.Paths["cpu"] != "" {
if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
return err
}
}
return nil return nil
} }
@ -217,41 +243,28 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
} }
func (m *Manager) GetPids() ([]int, error) { func (m *Manager) GetPids() ([]int, error) {
d, err := getCgroupData(m.Cgroups, 0) dir, err := getCgroupPath(m.Cgroups)
if err != nil { if err != nil {
return nil, err return nil, err
} }
dir, err := d.path("devices")
if err != nil {
return nil, err
}
return cgroups.GetPids(dir) return cgroups.GetPids(dir)
} }
// pathClean makes a path safe for use with filepath.Join. This is done by not func (m *Manager) GetAllPids() ([]int, error) {
// only cleaning the path, but also (if the path is relative) adding a leading dir, err := getCgroupPath(m.Cgroups)
// '/' and cleaning it (then removing the leading '/'). This ensures that a if err != nil {
// path resulting from prepending another path will always resolve to lexically return nil, err
// be a subdirectory of the prefixed path. This is all done lexically, so paths }
// that include symlinks won't be safe as a result of using pathClean. return cgroups.GetAllPids(dir)
func pathClean(path string) string { }
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths func getCgroupPath(c *configs.Cgroup) (string, error) {
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute d, err := getCgroupData(c, 0)
// paths to relative ones. if err != nil {
if !filepath.IsAbs(path) { return "", err
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
} }
// Clean the path again for good measure. return d.path("devices")
return filepath.Clean(path)
} }
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
@ -260,15 +273,25 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
return nil, err return nil, err
} }
// Clean the parent slice path. if (c.Name != "" || c.Parent != "") && c.Path != "" {
c.Parent = pathClean(c.Parent) return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove this code. Path safety is important! -- cyphar
cgPath := libcontainerUtils.CleanPath(c.Path)
cgParent := libcontainerUtils.CleanPath(c.Parent)
cgName := libcontainerUtils.CleanPath(c.Name)
innerPath := cgPath
if innerPath == "" {
innerPath = filepath.Join(cgParent, cgName)
}
return &cgroupData{ return &cgroupData{
root: root, root: root,
parent: c.Parent, innerPath: innerPath,
name: c.Name, config: c,
config: c, pid: pid,
pid: pid,
}, nil }, nil
} }
@ -296,11 +319,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
return "", err return "", err
} }
cgPath := filepath.Join(raw.parent, raw.name)
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process. // If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(cgPath) { if filepath.IsAbs(raw.innerPath) {
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), cgPath), nil return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
} }
parentPath, err := raw.parentPath(subsystem, mnt, root) parentPath, err := raw.parentPath(subsystem, mnt, root)
@ -308,7 +330,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
return "", err return "", err
} }
return filepath.Join(parentPath, cgPath), nil return filepath.Join(parentPath, raw.innerPath), nil
} }
func (raw *cgroupData) join(subsystem string) (string, error) { func (raw *cgroupData) join(subsystem string) (string, error) {

View File

@ -22,15 +22,10 @@ func (s *BlkioGroup) Name() string {
} }
func (s *BlkioGroup) Apply(d *cgroupData) error { func (s *BlkioGroup) Apply(d *cgroupData) error {
dir, err := d.join("blkio") _, err := d.join("blkio")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }

View File

@ -22,15 +22,10 @@ func (s *CpuGroup) Name() string {
func (s *CpuGroup) Apply(d *cgroupData) error { func (s *CpuGroup) Apply(d *cgroupData) error {
// We always want to join the cpu group, to allow fair cpu scheduling // We always want to join the cpu group, to allow fair cpu scheduling
// on a container basis // on a container basis
dir, err := d.join("cpu") _, err := d.join("cpu")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }

View File

@ -12,6 +12,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
) )
type CpusetGroup struct { type CpusetGroup struct {
@ -64,11 +65,6 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
if err := s.ensureParent(dir, root); err != nil { if err := s.ensureParent(dir, root); err != nil {
return err return err
} }
// the default values inherit from parent cgroup are already set in
// s.ensureParent, cover these if we have our own
if err := s.Set(dir, cgroup); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file // because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems // unlike the other subsystems
if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil { if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
@ -93,7 +89,7 @@ func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []b
// its parent. // its parent.
func (s *CpusetGroup) ensureParent(current, root string) error { func (s *CpusetGroup) ensureParent(current, root string) error {
parent := filepath.Dir(current) parent := filepath.Dir(current)
if filepath.Clean(parent) == root { if libcontainerUtils.CleanPath(parent) == root {
return nil return nil
} }
// Avoid infinite recursion. // Avoid infinite recursion.

View File

@ -15,21 +15,29 @@ func (s *DevicesGroup) Name() string {
} }
func (s *DevicesGroup) Apply(d *cgroupData) error { func (s *DevicesGroup) Apply(d *cgroupData) error {
dir, err := d.join("devices") _, err := d.join("devices")
if err != nil { if err != nil {
// We will return error even it's `not found` error, devices // We will return error even it's `not found` error, devices
// cgroup is hard requirement for container's security. // cgroup is hard requirement for container's security.
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
devices := cgroup.Resources.Devices
if len(devices) > 0 {
for _, dev := range devices {
file := "devices.deny"
if dev.Allow {
file = "devices.allow"
}
if err := writeFile(path, file, dev.CgroupString()); err != nil {
return err
}
}
return nil
}
if !cgroup.Resources.AllowAllDevices { if !cgroup.Resources.AllowAllDevices {
if err := writeFile(path, "devices.deny", "a"); err != nil { if err := writeFile(path, "devices.deny", "a"); err != nil {
return err return err

View File

@ -19,15 +19,10 @@ func (s *FreezerGroup) Name() string {
} }
func (s *FreezerGroup) Apply(d *cgroupData) error { func (s *FreezerGroup) Apply(d *cgroupData) error {
dir, err := d.join("freezer") _, err := d.join("freezer")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }

View File

@ -19,15 +19,10 @@ func (s *HugetlbGroup) Name() string {
} }
func (s *HugetlbGroup) Apply(d *cgroupData) error { func (s *HugetlbGroup) Apply(d *cgroupData) error {
dir, err := d.join("hugetlb") _, err := d.join("hugetlb")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }

View File

@ -32,8 +32,9 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
return err return err
} }
} }
// We have to set kernel memory here, as we can't change it once
if err := s.Set(path, d.config); err != nil { // processes have been attached.
if err := s.SetKernelMemory(path, d.config); err != nil {
return err return err
} }
} }
@ -50,7 +51,17 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
return nil
}
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
// This has to be done separately because it has special constraints (it
// can't be done after there are processes attached to the cgroup).
if cgroup.Resources.KernelMemory > 0 {
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
return err
}
}
return nil return nil
} }
@ -70,12 +81,6 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
return err return err
} }
} }
if cgroup.Resources.KernelMemory > 0 {
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
return err
}
}
if cgroup.Resources.OomKillDisable { if cgroup.Resources.OomKillDisable {
if err := writeFile(path, "memory.oom_control", "1"); err != nil { if err := writeFile(path, "memory.oom_control", "1"); err != nil {
return err return err
@ -157,6 +162,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".") usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".") maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".") failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
value, err := getCgroupParamUint(path, usage) value, err := getCgroupParamUint(path, usage)
if err != nil { if err != nil {
@ -182,6 +188,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err) return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
} }
memoryData.Failcnt = value memoryData.Failcnt = value
value, err = getCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
}
memoryData.Limit = value
return memoryData, nil return memoryData, nil
} }

View File

@ -15,15 +15,10 @@ func (s *NetClsGroup) Name() string {
} }
func (s *NetClsGroup) Apply(d *cgroupData) error { func (s *NetClsGroup) Apply(d *cgroupData) error {
dir, err := d.join("net_cls") _, err := d.join("net_cls")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }

View File

@ -15,15 +15,10 @@ func (s *NetPrioGroup) Name() string {
} }
func (s *NetPrioGroup) Apply(d *cgroupData) error { func (s *NetPrioGroup) Apply(d *cgroupData) error {
dir, err := d.join("net_prio") _, err := d.join("net_prio")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }

View File

@ -0,0 +1,57 @@
// +build linux
package fs
import (
"fmt"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
// PidsGroup is the handler for the "pids" cgroup subsystem. It carries no
// state of its own; all behaviour lives in its methods.
type PidsGroup struct{}

// Name returns the name of the subsystem handled by PidsGroup, which is
// always "pids".
func (s *PidsGroup) Name() string {
	const subsystem = "pids"
	return subsystem
}
// Apply joins the "pids" subsystem through d.join. An error for which
// cgroups.IsNotFound reports true is ignored, so a missing pids hierarchy
// is not fatal; any other error is returned to the caller unchanged.
func (s *PidsGroup) Apply(d *cgroupData) error {
	if _, joinErr := d.join("pids"); joinErr != nil && !cgroups.IsNotFound(joinErr) {
		return joinErr
	}
	return nil
}
// Set writes the configured pid limit to the "pids.max" file under path.
//
// A PidsLimit of zero leaves the file untouched. A positive PidsLimit is
// written as a decimal number; a negative PidsLimit is written as "max".
// Any error from writeFile is returned.
func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
	pidsLimit := cgroup.Resources.PidsLimit
	if pidsLimit == 0 {
		// Nothing was requested, so do not touch pids.max at all.
		return nil
	}

	// "max" is the fallback value, used for any negative limit.
	value := "max"
	if pidsLimit > 0 {
		value = strconv.FormatInt(pidsLimit, 10)
	}

	if err := writeFile(path, "pids.max", value); err != nil {
		return err
	}
	return nil
}
// Remove resolves the "pids" path via d.path and hands both the path and
// the lookup error to removePath, returning whatever removePath returns.
func (s *PidsGroup) Remove(d *cgroupData) error {
	pidsPath, lookupErr := d.path("pids")
	return removePath(pidsPath, lookupErr)
}
// GetStats reads "pids.current" under path and stores the value in
// stats.PidsStats.Current. If the value cannot be read, stats is left
// unmodified and a descriptive error is returned.
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
	current, err := getCgroupParamUint(path, "pids.current")
	if err != nil {
		return fmt.Errorf("failed to parse pids.current - %s", err)
	}

	stats.PidsStats.Current = current
	return nil
}

View File

@ -36,7 +36,9 @@ type MemoryData struct {
Usage uint64 `json:"usage,omitempty"` Usage uint64 `json:"usage,omitempty"`
MaxUsage uint64 `json:"max_usage,omitempty"` MaxUsage uint64 `json:"max_usage,omitempty"`
Failcnt uint64 `json:"failcnt"` Failcnt uint64 `json:"failcnt"`
Limit uint64 `json:"limit"`
} }
type MemoryStats struct { type MemoryStats struct {
// memory used for cache // memory used for cache
Cache uint64 `json:"cache,omitempty"` Cache uint64 `json:"cache,omitempty"`
@ -49,6 +51,11 @@ type MemoryStats struct {
Stats map[string]uint64 `json:"stats,omitempty"` Stats map[string]uint64 `json:"stats,omitempty"`
} }
// PidsStats holds the statistics reported for the pids cgroup.
type PidsStats struct {
	// Current is the number of pids in the cgroup. It is omitted from the
	// JSON encoding when zero.
	Current uint64 `json:"current,omitempty"`
}
type BlkioStatEntry struct { type BlkioStatEntry struct {
Major uint64 `json:"major,omitempty"` Major uint64 `json:"major,omitempty"`
Minor uint64 `json:"minor,omitempty"` Minor uint64 `json:"minor,omitempty"`
@ -80,6 +87,7 @@ type HugetlbStats struct {
type Stats struct { type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"` CpuStats CpuStats `json:"cpu_stats,omitempty"`
MemoryStats MemoryStats `json:"memory_stats,omitempty"` MemoryStats MemoryStats `json:"memory_stats,omitempty"`
PidsStats PidsStats `json:"pids_stats,omitempty"`
BlkioStats BlkioStats `json:"blkio_stats,omitempty"` BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
// the map is in the format "size of hugepage: stats of the hugepage" // the map is in the format "size of hugepage: stats of the hugepage"
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`

View File

@ -26,6 +26,10 @@ func (m *Manager) GetPids() ([]int, error) {
return nil, fmt.Errorf("Systemd not supported") return nil, fmt.Errorf("Systemd not supported")
} }
// GetAllPids always fails in this implementation: it returns a nil slice
// together with a "Systemd not supported" error.
func (m *Manager) GetAllPids() ([]int, error) {
	var pids []int
	return pids, fmt.Errorf("Systemd not supported")
}
func (m *Manager) Destroy() error { func (m *Manager) Destroy() error {
return fmt.Errorf("Systemd not supported") return fmt.Errorf("Systemd not supported")
} }

View File

@ -55,6 +55,7 @@ var subsystems = subsystemSet{
&fs.MemoryGroup{}, &fs.MemoryGroup{},
&fs.CpuGroup{}, &fs.CpuGroup{},
&fs.CpuacctGroup{}, &fs.CpuacctGroup{},
&fs.PidsGroup{},
&fs.BlkioGroup{}, &fs.BlkioGroup{},
&fs.HugetlbGroup{}, &fs.HugetlbGroup{},
&fs.PerfEventGroup{}, &fs.PerfEventGroup{},
@ -167,6 +168,23 @@ func (m *Manager) Apply(pid int) error {
properties []systemdDbus.Property properties []systemdDbus.Property
) )
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := getSubsystemPath(m.Cgroups, name)
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
if c.Parent != "" { if c.Parent != "" {
slice = c.Parent slice = c.Parent
} }
@ -233,7 +251,7 @@ func (m *Manager) Apply(pid int) error {
return err return err
} }
// we need to manually join the freezer, net_cls, net_prio and cpuset cgroup in systemd // we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
// because it does not currently support it via the dbus api. // because it does not currently support it via the dbus api.
if err := joinFreezer(c, pid); err != nil { if err := joinFreezer(c, pid); err != nil {
return err return err
@ -246,6 +264,10 @@ func (m *Manager) Apply(pid int) error {
return err return err
} }
if err := joinPids(c, pid); err != nil {
return err
}
if err := joinCpuset(c, pid); err != nil { if err := joinCpuset(c, pid); err != nil {
return err return err
} }
@ -277,17 +299,13 @@ func (m *Manager) Apply(pid int) error {
paths[s.Name()] = subsystemPath paths[s.Name()] = subsystemPath
} }
m.Paths = paths m.Paths = paths
if paths["cpu"] != "" {
if err := fs.CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil {
return err
}
}
return nil return nil
} }
func (m *Manager) Destroy() error { func (m *Manager) Destroy() error {
if m.Cgroups.Paths != nil {
return nil
}
m.mu.Lock() m.mu.Lock()
defer m.mu.Unlock() defer m.mu.Unlock()
theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
@ -330,68 +348,74 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
} }
func joinCpu(c *configs.Cgroup, pid int) error { func joinCpu(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "cpu") _, err := join(c, "cpu", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if c.Resources.CpuQuota != 0 {
if err = writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(c.Resources.CpuQuota, 10)); err != nil {
return err
}
}
if c.Resources.CpuPeriod != 0 {
if err = writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(c.Resources.CpuPeriod, 10)); err != nil {
return err
}
}
if c.Resources.CpuRtPeriod != 0 {
if err = writeFile(path, "cpu.rt_period_us", strconv.FormatInt(c.Resources.CpuRtPeriod, 10)); err != nil {
return err
}
}
if c.Resources.CpuRtRuntime != 0 {
if err = writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(c.Resources.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil return nil
} }
func joinFreezer(c *configs.Cgroup, pid int) error { func joinFreezer(c *configs.Cgroup, pid int) error {
path, err := join(c, "freezer", pid) _, err := join(c, "freezer", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
freezer, err := subsystems.Get("freezer") return nil
if err != nil {
return err
}
return freezer.Set(path, c)
} }
func joinNetPrio(c *configs.Cgroup, pid int) error { func joinNetPrio(c *configs.Cgroup, pid int) error {
path, err := join(c, "net_prio", pid) _, err := join(c, "net_prio", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
netPrio, err := subsystems.Get("net_prio") return nil
if err != nil {
return err
}
return netPrio.Set(path, c)
} }
func joinNetCls(c *configs.Cgroup, pid int) error { func joinNetCls(c *configs.Cgroup, pid int) error {
path, err := join(c, "net_cls", pid) _, err := join(c, "net_cls", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
netcls, err := subsystems.Get("net_cls") return nil
if err != nil { }
func joinPids(c *configs.Cgroup, pid int) error {
_, err := join(c, "pids", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
return netcls.Set(path, c) return nil
}
// systemd represents slice hierarchy using `-`, so we need to follow suit when
// generating the path of slice. Essentially, test-a-b.slice becomes
// test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) {
suffix := ".slice"
// Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Path-separators are not allowed.
if strings.Contains(slice, "/") {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix)
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Append the component to the path and to the prefix.
path += prefix + component + suffix + "/"
prefix += component + "-"
}
return path, nil
} }
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
@ -410,6 +434,11 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
slice = c.Parent slice = c.Parent
} }
slice, err = expandSlice(slice)
if err != nil {
return "", err
}
return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
} }
@ -440,6 +469,14 @@ func (m *Manager) GetPids() ([]int, error) {
return cgroups.GetPids(path) return cgroups.GetPids(path)
} }
// GetAllPids resolves this manager's "devices" subsystem path and returns
// whatever cgroups.GetAllPids reports for it. A failure to resolve the path
// is returned unchanged, with a nil pid slice.
func (m *Manager) GetAllPids() ([]int, error) {
	devicesPath, err := getSubsystemPath(m.Cgroups, "devices")
	if err != nil {
		return nil, err
	}
	return cgroups.GetAllPids(devicesPath)
}
func (m *Manager) GetStats() (*cgroups.Stats, error) { func (m *Manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock() m.mu.Lock()
defer m.mu.Unlock() defer m.mu.Unlock()
@ -458,16 +495,23 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
} }
func (m *Manager) Set(container *configs.Config) error { func (m *Manager) Set(container *configs.Config) error {
for name, path := range m.Paths { for _, sys := range subsystems {
sys, err := subsystems.Get(name) // Get the subsystem path, but don't error out for not found cgroups.
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { path, err := getSubsystemPath(container.Cgroups, sys.Name())
continue if err != nil && !cgroups.IsNotFound(err) {
return err
} }
if err := sys.Set(path, container.Cgroups); err != nil { if err := sys.Set(path, container.Cgroups); err != nil {
return err return err
} }
} }
if m.Paths["cpu"] != "" {
if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
return err
}
}
return nil return nil
} }
@ -487,17 +531,13 @@ func getUnitName(c *configs.Cgroup) string {
// because systemd will re-write the device settings if it needs to re-apply the cgroup context. // because systemd will re-write the device settings if it needs to re-apply the cgroup context.
// This happens at least for v208 when any sibling unit is started. // This happens at least for v208 when any sibling unit is started.
func joinDevices(c *configs.Cgroup, pid int) error { func joinDevices(c *configs.Cgroup, pid int) error {
path, err := join(c, "devices", pid) _, err := join(c, "devices", pid)
// Even if it's `not found` error, we'll return err because devices cgroup // Even if it's `not found` error, we'll return err because devices cgroup
// is hard requirement for container security. // is hard requirement for container security.
if err != nil { if err != nil {
return err return err
} }
devices, err := subsystems.Get("devices") return nil
if err != nil {
return err
}
return devices.Set(path, c)
} }
func setKernelMemory(c *configs.Cgroup) error { func setKernelMemory(c *configs.Cgroup) error {
@ -510,52 +550,16 @@ func setKernelMemory(c *configs.Cgroup) error {
return err return err
} }
if c.Resources.KernelMemory > 0 { // This doesn't get called by manager.Set, so we need to do it here.
err = writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.Resources.KernelMemory, 10)) s := &fs.MemoryGroup{}
if err != nil { return s.SetKernelMemory(path, c)
return err
}
}
return nil
} }
func joinMemory(c *configs.Cgroup, pid int) error { func joinMemory(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "memory") _, err := join(c, "memory", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
// -1 disables memoryswap
if c.Resources.MemorySwap > 0 {
err = writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Resources.MemorySwap, 10))
if err != nil {
return err
}
}
if c.Resources.MemoryReservation > 0 {
err = writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Resources.MemoryReservation, 10))
if err != nil {
return err
}
}
if c.Resources.OomKillDisable {
if err := writeFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if c.Resources.MemorySwappiness >= 0 && c.Resources.MemorySwappiness <= 100 {
err = writeFile(path, "memory.swappiness", strconv.FormatInt(c.Resources.MemorySwappiness, 10))
if err != nil {
return err
}
} else if c.Resources.MemorySwappiness == -1 {
return nil
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", c.Resources.MemorySwappiness)
}
return nil return nil
} }
@ -577,68 +581,25 @@ func joinCpuset(c *configs.Cgroup, pid int) error {
// expects device path instead of major minor numbers, which is also confusing // expects device path instead of major minor numbers, which is also confusing
// for users. So we use fs work around for now. // for users. So we use fs work around for now.
func joinBlkio(c *configs.Cgroup, pid int) error { func joinBlkio(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "blkio") _, err := join(c, "blkio", pid)
if err != nil { if err != nil {
return err return err
} }
// systemd doesn't directly support this in the dbus properties
if c.Resources.BlkioLeafWeight != 0 {
if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(c.Resources.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range c.Resources.BlkioWeightDevice {
if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err
}
if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
for _, td := range c.Resources.BlkioThrottleReadBpsDevice {
if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range c.Resources.BlkioThrottleWriteBpsDevice {
if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range c.Resources.BlkioThrottleReadIOPSDevice {
if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range c.Resources.BlkioThrottleWriteIOPSDevice {
if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
return nil return nil
} }
func joinHugetlb(c *configs.Cgroup, pid int) error { func joinHugetlb(c *configs.Cgroup, pid int) error {
path, err := join(c, "hugetlb", pid) _, err := join(c, "hugetlb", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
hugetlb, err := subsystems.Get("hugetlb") return nil
if err != nil {
return err
}
return hugetlb.Set(path, c)
} }
func joinPerfEvent(c *configs.Cgroup, pid int) error { func joinPerfEvent(c *configs.Cgroup, pid int) error {
path, err := join(c, "perf_event", pid) _, err := join(c, "perf_event", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
perfEvent, err := subsystems.Get("perf_event") return nil
if err != nil {
return err
}
return perfEvent.Set(path, c)
} }

View File

@ -5,6 +5,7 @@ package cgroups
import ( import (
"bufio" "bufio"
"fmt" "fmt"
"io"
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
@ -12,7 +13,6 @@ import (
"strings" "strings"
"time" "time"
"github.com/docker/docker/pkg/mount"
"github.com/docker/go-units" "github.com/docker/go-units"
) )
@ -84,10 +84,19 @@ func FindCgroupMountpointDir() (string, error) {
// Safe as mountinfo encodes mountpoints with spaces as \040. // Safe as mountinfo encodes mountpoints with spaces as \040.
index := strings.Index(text, " - ") index := strings.Index(text, " - ")
postSeparatorFields := strings.Fields(text[index+3:]) postSeparatorFields := strings.Fields(text[index+3:])
if len(postSeparatorFields) < 3 { numPostFields := len(postSeparatorFields)
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
// This is an error as we can't detect if the mount is for "cgroup"
if numPostFields == 0 {
return "", fmt.Errorf("Found no fields post '-' in %q", text)
} }
if postSeparatorFields[0] == "cgroup" { if postSeparatorFields[0] == "cgroup" {
// Check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
return filepath.Dir(fields[4]), nil return filepath.Dir(fields[4]), nil
} }
} }
@ -112,11 +121,45 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
return getControllerPath(m.Subsystems[0], cgroups) return getControllerPath(m.Subsystems[0], cgroups)
} }
// getCgroupMountsHelper scans mountinfo-formatted text from mi and returns one
// Mount for every line whose filesystem type is exactly "cgroup".
//
// ss is the set of subsystem names to recognise among the mount's super
// options; options carrying cgroupNamePrefix are recorded with the prefix
// stripped. Lines for other filesystem types are skipped.
//
// An error is returned when a line has no " - " separator, when a cgroup line
// has fewer than five fields before the separator or fewer than three after
// it, or when the scanner itself fails.
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
	// mountinfo separates the per-mount fields from the filesystem fields
	// with a lone hyphen. Searching for the bare '-' byte is not enough:
	// roots and mountpoints may legitimately contain hyphens.
	const separator = " - "

	res := make([]Mount, 0, len(ss))
	scanner := bufio.NewScanner(mi)
	for scanner.Scan() {
		txt := scanner.Text()
		sepIdx := strings.Index(txt, separator)
		if sepIdx == -1 {
			return nil, fmt.Errorf("invalid mountinfo format")
		}
		// Fields after the separator: fstype, mount source, super options.
		post := strings.Fields(txt[sepIdx+len(separator):])
		if len(post) == 0 || post[0] != "cgroup" {
			continue
		}
		// Fields before the separator: id, parent id, major:minor, root,
		// mountpoint, mount options, optional fields...
		fields := strings.Split(txt[:sepIdx], " ")
		if len(fields) < 5 || len(post) < 3 {
			return nil, fmt.Errorf("invalid mountinfo format")
		}
		m := Mount{
			Mountpoint: fields[4],
			Root:       fields[3],
		}
		for _, opt := range strings.Split(post[len(post)-1], ",") {
			if strings.HasPrefix(opt, cgroupNamePrefix) {
				m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
			}
			if ss[opt] {
				m.Subsystems = append(m.Subsystems, opt)
			}
		}
		res = append(res, m)
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return res, nil
}
func GetCgroupMounts() ([]Mount, error) { func GetCgroupMounts() ([]Mount, error) {
mounts, err := mount.GetMounts() f, err := os.Open("/proc/self/mountinfo")
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer f.Close()
all, err := GetAllSubsystems() all, err := GetAllSubsystems()
if err != nil { if err != nil {
@ -127,24 +170,7 @@ func GetCgroupMounts() ([]Mount, error) {
for _, s := range all { for _, s := range all {
allMap[s] = true allMap[s] = true
} }
return getCgroupMountsHelper(allMap, f)
res := []Mount{}
for _, mount := range mounts {
if mount.Fstype == "cgroup" {
m := Mount{Mountpoint: mount.Mountpoint, Root: mount.Root}
for _, opt := range strings.Split(mount.VfsOpts, ",") {
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
}
if allMap[opt] {
m.Subsystems = append(m.Subsystems, opt)
}
}
res = append(res, m)
}
}
return res, nil
} }
// Returns all the cgroup subsystems supported by the kernel // Returns all the cgroup subsystems supported by the kernel
@ -323,9 +349,14 @@ func GetHugePageSize() ([]string, error) {
return pageSizes, nil return pageSizes, nil
} }
// GetPids returns all pids, that were added to cgroup at path and to all its // GetPids returns all pids, that were added to cgroup at path.
// subcgroups.
func GetPids(path string) ([]int, error) { func GetPids(path string) ([]int, error) {
return readProcsFile(path)
}
// GetAllPids returns all pids, that were added to cgroup at path and to all its
// subcgroups.
func GetAllPids(path string) ([]int, error) {
var pids []int var pids []int
// collect pids from all sub-cgroups // collect pids from all sub-cgroups
err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error { err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {

View File

@ -11,25 +11,38 @@ const (
) )
type Cgroup struct { type Cgroup struct {
Name string `json:"name"` // Deprecated, use Path instead
Name string `json:"name,omitempty"`
// name of parent cgroup or slice // name of parent of cgroup or slice
Parent string `json:"parent"` // Deprecated, use Path instead
Parent string `json:"parent,omitempty"`
// Path specifies the path to cgroups that are created and/or joined by the container.
// The path is assumed to be relative to the host system cgroup mountpoint.
Path string `json:"path"`
// ScopePrefix decribes prefix for the scope name // ScopePrefix decribes prefix for the scope name
ScopePrefix string `json:"scope_prefix"` ScopePrefix string `json:"scope_prefix"`
// Paths represent the absolute cgroups paths to join.
// This takes precedence over Path.
Paths map[string]string
// Resources contains various cgroups settings to apply // Resources contains various cgroups settings to apply
*Resources *Resources
} }
type Resources struct { type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
AllowAllDevices bool `json:"allow_all_devices"` // Deprecated
AllowAllDevices bool `json:"allow_all_devices,omitempty"`
// Deprecated
AllowedDevices []*Device `json:"allowed_devices,omitempty"`
// Deprecated
DeniedDevices []*Device `json:"denied_devices,omitempty"`
AllowedDevices []*Device `json:"allowed_devices"` Devices []*Device `json:"devices"`
DeniedDevices []*Device `json:"denied_devices"`
// Memory limit (in bytes) // Memory limit (in bytes)
Memory int64 `json:"memory"` Memory int64 `json:"memory"`
@ -37,7 +50,7 @@ type Resources struct {
// Memory reservation or soft_limit (in bytes) // Memory reservation or soft_limit (in bytes)
MemoryReservation int64 `json:"memory_reservation"` MemoryReservation int64 `json:"memory_reservation"`
// Total memory usage (memory + swap); set `-1' to disable swap // Total memory usage (memory + swap); set `-1` to enable unlimited swap
MemorySwap int64 `json:"memory_swap"` MemorySwap int64 `json:"memory_swap"`
// Kernel memory limit (in bytes) // Kernel memory limit (in bytes)
@ -64,6 +77,9 @@ type Resources struct {
// MEM to use // MEM to use
CpusetMems string `json:"cpuset_mems"` CpusetMems string `json:"cpuset_mems"`
// Process limit; set <= `0` to disable limit.
PidsLimit int64 `json:"pids_limit"`
// Specifies per cgroup weight, range is from 10 to 1000. // Specifies per cgroup weight, range is from 10 to 1000.
BlkioWeight uint16 `json:"blkio_weight"` BlkioWeight uint16 `json:"blkio_weight"`

View File

@ -171,6 +171,9 @@ type Config struct {
// A default action to be taken if no rules match is also given. // A default action to be taken if no rules match is also given.
Seccomp *Seccomp `json:"seccomp"` Seccomp *Seccomp `json:"seccomp"`
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
NoNewPrivileges bool `json:"no_new_privileges"`
// Hooks are a collection of actions to perform at various container lifecycle events. // Hooks are a collection of actions to perform at various container lifecycle events.
// Hooks are not able to be marshaled to json but they are also not needed to. // Hooks are not able to be marshaled to json but they are also not needed to.
Hooks *Hooks `json:"-"` Hooks *Hooks `json:"-"`

View File

@ -35,6 +35,9 @@ type Device struct {
// Gid of the device. // Gid of the device.
Gid uint32 `json:"gid"` Gid uint32 `json:"gid"`
// Write the file to the allowed list
Allow bool `json:"allow"`
} }
func (d *Device) CgroupString() string { func (d *Device) CgroupString() string {

View File

@ -82,20 +82,6 @@ var (
Minor: 1, Minor: 1,
Permissions: "rwm", Permissions: "rwm",
}, },
{
Path: "/dev/tty0",
Type: 'c',
Major: 4,
Minor: 0,
Permissions: "rwm",
},
{
Path: "/dev/tty1",
Type: 'c',
Major: 4,
Minor: 1,
Permissions: "rwm",
},
// /dev/pts/ - pts namespaces are "coming soon" // /dev/pts/ - pts namespaces are "coming soon"
{ {
Path: "", Path: "",

View File

@ -6,6 +6,7 @@ package libcontainer
import ( import (
"os" "os"
"time"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
) )
@ -14,8 +15,11 @@ import (
type Status int type Status int
const ( const (
// The container exists but has not been run yet
Created Status = iota
// The container exists and is running. // The container exists and is running.
Running Status = iota + 1 Running
// The container exists, it is in the process of being paused. // The container exists, it is in the process of being paused.
Pausing Pausing
@ -30,6 +34,25 @@ const (
Destroyed Destroyed
) )
// String returns the lowercase name of the status: "created", "running",
// "pausing", "paused", "checkpointed" or "destroyed". Any other value yields
// "unknown".
func (s Status) String() string {
	name := "unknown"
	switch s {
	case Created:
		name = "created"
	case Running:
		name = "running"
	case Pausing:
		name = "pausing"
	case Paused:
		name = "paused"
	case Checkpointed:
		name = "checkpointed"
	case Destroyed:
		name = "destroyed"
	}
	return name
}
// BaseState represents the platform agnostic pieces relating to a // BaseState represents the platform agnostic pieces relating to a
// running container's state // running container's state
type BaseState struct { type BaseState struct {
@ -39,9 +62,12 @@ type BaseState struct {
// InitProcessPid is the init process id in the parent namespace. // InitProcessPid is the init process id in the parent namespace.
InitProcessPid int `json:"init_process_pid"` InitProcessPid int `json:"init_process_pid"`
// InitProcessStartTime is the init process start time. // InitProcessStartTime is the init process start time in clock cycles since boot time.
InitProcessStartTime string `json:"init_process_start"` InitProcessStartTime string `json:"init_process_start"`
// Created is the creation time of the container, in UTC.
Created time.Time `json:"created"`
// Config is the container's configuration. // Config is the container's configuration.
Config configs.Config `json:"config"` Config configs.Config `json:"config"`
} }

View File

@ -15,6 +15,7 @@ import (
"strings" "strings"
"sync" "sync"
"syscall" "syscall"
"time"
"github.com/Sirupsen/logrus" "github.com/Sirupsen/logrus"
"github.com/golang/protobuf/proto" "github.com/golang/protobuf/proto"
@ -38,6 +39,8 @@ type linuxContainer struct {
criuPath string criuPath string
m sync.Mutex m sync.Mutex
criuVersion int criuVersion int
state containerState
created time.Time
} }
// State represents a running container's state // State represents a running container's state
@ -104,6 +107,12 @@ type Container interface {
// errors: // errors:
// Systemerror - System error. // Systemerror - System error.
NotifyOOM() (<-chan struct{}, error) NotifyOOM() (<-chan struct{}, error)
// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
//
// errors:
// Systemerror - System error.
NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
} }
// ID returns the container's unique ID // ID returns the container's unique ID
@ -129,7 +138,7 @@ func (c *linuxContainer) State() (*State, error) {
} }
func (c *linuxContainer) Processes() ([]int, error) { func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetPids() pids, err := c.cgroupManager.GetAllPids()
if err != nil { if err != nil {
return nil, newSystemError(err) return nil, newSystemError(err)
} }
@ -183,22 +192,30 @@ func (c *linuxContainer) Start(process *Process) error {
} }
return newSystemError(err) return newSystemError(err)
} }
if doInit { // generate a timestamp indicating when the container was started
c.updateState(parent) c.created = time.Now().UTC()
c.state = &runningState{
c: c,
} }
if c.config.Hooks != nil { if doInit {
s := configs.HookState{ if err := c.updateState(parent); err != nil {
Version: c.config.Version, return err
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
} }
for _, hook := range c.config.Hooks.Poststart { if c.config.Hooks != nil {
if err := hook.Run(s); err != nil { s := configs.HookState{
if err := parent.terminate(); err != nil { Version: c.config.Version,
logrus.Warn(err) ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
}
for _, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
} }
return newSystemError(err)
} }
} }
} }
@ -251,7 +268,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
} }
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
t := "_LIBCONTAINER_INITTYPE=standard" t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
cloneFlags := c.config.Namespaces.CloneFlags() cloneFlags := c.config.Namespaces.CloneFlags()
if cloneFlags&syscall.CLONE_NEWUSER != 0 { if cloneFlags&syscall.CLONE_NEWUSER != 0 {
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil { if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
@ -278,7 +295,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
} }
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE=setns") cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
// for setns process, we dont have to set cloneflags as the process namespaces // for setns process, we dont have to set cloneflags as the process namespaces
// will only be set via setns syscall // will only be set via setns syscall
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath) data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
@ -321,54 +338,53 @@ func newPipe() (parent *os.File, child *os.File, err error) {
func (c *linuxContainer) Destroy() error { func (c *linuxContainer) Destroy() error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
status, err := c.currentStatus() return c.state.destroy()
if err != nil {
return err
}
if status != Destroyed {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
}
if !c.config.Namespaces.Contains(configs.NEWPID) {
if err := killCgroupProcesses(c.cgroupManager); err != nil {
logrus.Warn(err)
}
}
err = c.cgroupManager.Destroy()
if rerr := os.RemoveAll(c.root); err == nil {
err = rerr
}
c.initProcess = nil
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
return err
}
}
}
return err
} }
func (c *linuxContainer) Pause() error { func (c *linuxContainer) Pause() error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
return c.cgroupManager.Freeze(configs.Frozen) status, err := c.currentStatus()
if err != nil {
return err
}
if status != Running {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
} }
func (c *linuxContainer) Resume() error { func (c *linuxContainer) Resume() error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
return c.cgroupManager.Freeze(configs.Thawed) status, err := c.currentStatus()
if err != nil {
return err
}
if status != Paused {
return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
}
if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return c.state.transition(&runningState{
c: c,
})
} }
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
return notifyOnOOM(c.cgroupManager.GetPaths()) return notifyOnOOM(c.cgroupManager.GetPaths())
} }
// NotifyMemoryPressure hands the cgroup manager's paths and the requested
// pressure level to notifyMemoryPressure and returns its channel and error
// unchanged.
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
	cgroupPaths := c.cgroupManager.GetPaths()
	return notifyMemoryPressure(cgroupPaths, level)
}
// XXX debug support, remove when debugging done. // XXX debug support, remove when debugging done.
func addArgsFromEnv(evar string, args *[]string) { func addArgsFromEnv(evar string, args *[]string) {
if e := os.Getenv(evar); e != "" { if e := os.Getenv(evar); e != "" {
@ -460,7 +476,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
} }
if criuOpts.ImagesDirectory == "" { if criuOpts.ImagesDirectory == "" {
criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image") return fmt.Errorf("invalid directory to save checkpoint")
} }
// Since a container can be C/R'ed multiple times, // Since a container can be C/R'ed multiple times,
@ -579,11 +595,9 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
if err := c.checkCriuVersion("1.5.2"); err != nil { if err := c.checkCriuVersion("1.5.2"); err != nil {
return err return err
} }
if criuOpts.WorkDirectory == "" { if criuOpts.WorkDirectory == "" {
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
} }
@ -592,22 +606,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) { if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
return err return err
} }
workDir, err := os.Open(criuOpts.WorkDirectory) workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil { if err != nil {
return err return err
} }
defer workDir.Close() defer workDir.Close()
if criuOpts.ImagesDirectory == "" { if criuOpts.ImagesDirectory == "" {
criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image") return fmt.Errorf("invalid directory to restore checkpoint")
} }
imageDir, err := os.Open(criuOpts.ImagesDirectory) imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil { if err != nil {
return err return err
} }
defer imageDir.Close() defer imageDir.Close()
// CRIU has a few requirements for a root directory: // CRIU has a few requirements for a root directory:
// * it must be a mount point // * it must be a mount point
// * its parent must not be overmounted // * its parent must not be overmounted
@ -618,18 +629,15 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
return err return err
} }
defer os.Remove(root) defer os.Remove(root)
root, err = filepath.EvalSymlinks(root) root, err = filepath.EvalSymlinks(root)
if err != nil { if err != nil {
return err return err
} }
err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "") err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
if err != nil { if err != nil {
return err return err
} }
defer syscall.Unmount(root, syscall.MNT_DETACH) defer syscall.Unmount(root, syscall.MNT_DETACH)
t := criurpc.CriuReqType_RESTORE t := criurpc.CriuReqType_RESTORE
req := &criurpc.CriuReq{ req := &criurpc.CriuReq{
Type: &t, Type: &t,
@ -697,15 +705,13 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
fds []string fds []string
fdJSON []byte fdJSON []byte
) )
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
return err return err
} }
if err = json.Unmarshal(fdJSON, &fds); err != nil { if err := json.Unmarshal(fdJSON, &fds); err != nil {
return err return err
} }
for i := range fds { for i := range fds {
if s := fds[i]; strings.Contains(s, "pipe:") { if s := fds[i]; strings.Contains(s, "pipe:") {
inheritFd := new(criurpc.InheritFd) inheritFd := new(criurpc.InheritFd)
@ -714,12 +720,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
} }
} }
return c.criuSwrk(process, req, criuOpts, true)
err = c.criuSwrk(process, req, criuOpts, true)
if err != nil {
return err
}
return nil
} }
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
@ -914,46 +915,43 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
if notify == nil { if notify == nil {
return fmt.Errorf("invalid response: %s", resp.String()) return fmt.Errorf("invalid response: %s", resp.String())
} }
switch { switch {
case notify.GetScript() == "post-dump": case notify.GetScript() == "post-dump":
if !opts.LeaveRunning { f, err := os.Create(filepath.Join(c.root, "checkpoint"))
f, err := os.Create(filepath.Join(c.root, "checkpoint")) if err != nil {
if err != nil { return err
return err
}
f.Close()
} }
break f.Close()
case notify.GetScript() == "network-unlock": case notify.GetScript() == "network-unlock":
if err := unlockNetwork(c.config); err != nil { if err := unlockNetwork(c.config); err != nil {
return err return err
} }
break
case notify.GetScript() == "network-lock": case notify.GetScript() == "network-lock":
if err := lockNetwork(c.config); err != nil { if err := lockNetwork(c.config); err != nil {
return err return err
} }
break
case notify.GetScript() == "post-restore": case notify.GetScript() == "post-restore":
pid := notify.GetPid() pid := notify.GetPid()
r, err := newRestoredProcess(int(pid), fds) r, err := newRestoredProcess(int(pid), fds)
if err != nil { if err != nil {
return err return err
} }
process.ops = r
// TODO: crosbymichael restore previous process information by saving the init process information in if err := c.state.transition(&restoredState{
// the container's state file or separate process state files. imageDir: opts.ImagesDirectory,
c: c,
}); err != nil {
return err
}
if err := c.updateState(r); err != nil { if err := c.updateState(r); err != nil {
return err return err
} }
process.ops = r if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
break if !os.IsNotExist(err) {
logrus.Error(err)
}
}
} }
return nil return nil
} }
@ -963,65 +961,108 @@ func (c *linuxContainer) updateState(process parentProcess) error {
if err != nil { if err != nil {
return err return err
} }
return c.saveState(state)
}
func (c *linuxContainer) saveState(s *State) error {
f, err := os.Create(filepath.Join(c.root, stateFilename)) f, err := os.Create(filepath.Join(c.root, stateFilename))
if err != nil { if err != nil {
return err return err
} }
defer f.Close() defer f.Close()
os.Remove(filepath.Join(c.root, "checkpoint")) return utils.WriteJSON(f, s)
return utils.WriteJSON(f, state) }
func (c *linuxContainer) deleteState() error {
return os.Remove(filepath.Join(c.root, stateFilename))
} }
func (c *linuxContainer) currentStatus() (Status, error) { func (c *linuxContainer) currentStatus() (Status, error) {
if _, err := os.Stat(filepath.Join(c.root, "checkpoint")); err == nil { if err := c.refreshState(); err != nil {
return Checkpointed, nil return -1, err
} }
return c.state.status(), nil
}
// refreshState re-derives the container's state from runtime information
// instead of trusting in-process bookkeeping, because consumers of
// libcontainer may drive the container from a different process.
//
// The checks run in a fixed order: if isPaused reports true the container is
// transitioned to pausedState; otherwise isRunning decides between
// runningState and stoppedState. Any error from either probe, or from the
// transition itself, is returned unchanged.
func (c *linuxContainer) refreshState() error {
	frozen, err := c.isPaused()
	switch {
	case err != nil:
		return err
	case frozen:
		return c.state.transition(&pausedState{c: c})
	}

	// Only probe the init process once we know the container is not paused.
	alive, err := c.isRunning()
	switch {
	case err != nil:
		return err
	case alive:
		return c.state.transition(&runningState{c: c})
	default:
		return c.state.transition(&stoppedState{c: c})
	}
}
func (c *linuxContainer) isRunning() (bool, error) {
if c.initProcess == nil { if c.initProcess == nil {
return Destroyed, nil return false, nil
} }
// return Running if the init process is alive // return Running if the init process is alive
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil { if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
if err == syscall.ESRCH { if err == syscall.ESRCH {
return Destroyed, nil return false, nil
} }
return 0, newSystemError(err) return false, newSystemError(err)
} }
if c.config.Cgroups != nil && c.config.Cgroups.Resources != nil && c.config.Cgroups.Resources.Freezer == configs.Frozen { return true, nil
return Paused, nil }
func (c *linuxContainer) isPaused() (bool, error) {
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
if err != nil {
if os.IsNotExist(err) {
return false, nil
}
return false, newSystemError(err)
} }
return Running, nil return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
} }
func (c *linuxContainer) currentState() (*State, error) { func (c *linuxContainer) currentState() (*State, error) {
status, err := c.currentStatus() var (
if err != nil { startTime string
return nil, err externalDescriptors []string
} pid = -1
if status == Destroyed { )
return nil, newGenericError(fmt.Errorf("container destroyed"), ContainerNotExists) if c.initProcess != nil {
} pid = c.initProcess.pid()
startTime, err := c.initProcess.startTime() startTime, _ = c.initProcess.startTime()
if err != nil { externalDescriptors = c.initProcess.externalDescriptors()
return nil, newSystemError(err)
} }
state := &State{ state := &State{
BaseState: BaseState{ BaseState: BaseState{
ID: c.ID(), ID: c.ID(),
Config: *c.config, Config: *c.config,
InitProcessPid: c.initProcess.pid(), InitProcessPid: pid,
InitProcessStartTime: startTime, InitProcessStartTime: startTime,
Created: c.created,
}, },
CgroupPaths: c.cgroupManager.GetPaths(), CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string), NamespacePaths: make(map[configs.NamespaceType]string),
ExternalDescriptors: c.initProcess.externalDescriptors(), ExternalDescriptors: externalDescriptors,
} }
for _, ns := range c.config.Namespaces { if pid > 0 {
state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid()) for _, ns := range c.config.Namespaces {
} state.NamespacePaths[ns.Type] = ns.GetPath(pid)
for _, nsType := range configs.NamespaceTypes() { }
if _, ok := state.NamespacePaths[nsType]; !ok { for _, nsType := range configs.NamespaceTypes() {
ns := configs.Namespace{Type: nsType} if _, ok := state.NamespacePaths[nsType]; !ok {
state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid()) ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
} }
} }
return state, nil return state, nil

View File

@ -16,9 +16,10 @@ const (
ContainerPaused ContainerPaused
ContainerNotStopped ContainerNotStopped
ContainerNotRunning ContainerNotRunning
ContainerNotPaused
// Process errors // Process errors
ProcessNotExecuted NoProcessOps
// Common errors // Common errors
ConfigInvalid ConfigInvalid
@ -46,6 +47,10 @@ func (c ErrorCode) String() string {
return "Container is not running" return "Container is not running"
case ConsoleExists: case ConsoleExists:
return "Console exists for process" return "Console exists for process"
case ContainerNotPaused:
return "Container is not paused"
case NoProcessOps:
return "No process operations"
default: default:
return "Unknown error" return "Unknown error"
} }

View File

@ -166,7 +166,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := os.MkdirAll(containerRoot, 0700); err != nil { if err := os.MkdirAll(containerRoot, 0700); err != nil {
return nil, newGenericError(err, SystemError) return nil, newGenericError(err, SystemError)
} }
return &linuxContainer{ c := &linuxContainer{
id: id, id: id,
root: containerRoot, root: containerRoot,
config: config, config: config,
@ -174,7 +174,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
initArgs: l.InitArgs, initArgs: l.InitArgs,
criuPath: l.CriuPath, criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
}, nil }
c.state = &stoppedState{c: c}
return c, nil
} }
func (l *LinuxFactory) Load(id string) (Container, error) { func (l *LinuxFactory) Load(id string) (Container, error) {
@ -191,7 +193,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
processStartTime: state.InitProcessStartTime, processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors, fds: state.ExternalDescriptors,
} }
return &linuxContainer{ c := &linuxContainer{
initProcess: r, initProcess: r,
id: id, id: id,
config: &state.Config, config: &state.Config,
@ -200,7 +202,13 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
criuPath: l.CriuPath, criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot, root: containerRoot,
}, nil created: state.Created,
}
c.state = &createdState{c: c, s: Created}
if err := c.refreshState(); err != nil {
return nil, err
}
return c, nil
} }
func (l *LinuxFactory) Type() string { func (l *LinuxFactory) Type() string {
@ -222,18 +230,25 @@ func (l *LinuxFactory) StartInitialization() (err error) {
// clear the current process's environment to clean any libcontainer // clear the current process's environment to clean any libcontainer
// specific env vars. // specific env vars.
os.Clearenv() os.Clearenv()
var i initer
defer func() { defer func() {
// if we have an error during the initialization of the container's init then send it back to the // We have an error during the initialization of the container's init,
// parent process in the form of an initError. // send it back to the parent process in the form of an initError.
if err != nil { // If container's init successed, syscall.Exec will not return, hence
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil { // this defer function will never be called.
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
panic(err) panic(err)
} }
} }
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
panic(err)
}
// ensure that this pipe is always closed // ensure that this pipe is always closed
pipe.Close() pipe.Close()
}() }()
i, err := newContainerInit(it, pipe) i, err = newContainerInit(it, pipe)
if err != nil { if err != nil {
return err return err
} }

View File

@ -9,6 +9,18 @@ import (
"github.com/opencontainers/runc/libcontainer/stacktrace" "github.com/opencontainers/runc/libcontainer/stacktrace"
) )
// syncType identifies the kind of synchronisation message exchanged as JSON
// over the pipe between the parent and the container's init process.
type syncType uint8
const (
// procReady is written by the init process (syncParentReady) to announce
// that it is ready to exec the container process.
procReady syncType = iota
// procError is written by the standard init before it sends an error
// payload; the parent then decodes the error that follows.
procError
// procRun is the parent's reply to procReady, clearing the init to proceed.
procRun
)
// syncT is the JSON envelope carrying a single syncType value.
type syncT struct {
Type syncType `json:"type"`
}
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}} Code: {{.ECode}}
{{if .Message }} {{if .Message }}

View File

@ -5,6 +5,7 @@ package libcontainer
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"io"
"io/ioutil" "io/ioutil"
"net" "net"
"os" "os"
@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
}, nil }, nil
case initStandard: case initStandard:
return &linuxStandardInit{ return &linuxStandardInit{
pipe: pipe,
parentPid: syscall.Getppid(), parentPid: syscall.Getppid(),
config: config, config: config,
}, nil }, nil
@ -140,6 +142,27 @@ func finalizeNamespace(config *initConfig) error {
return nil return nil
} }
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
//
// An error is returned when the ready message cannot be written, when the
// parent closes the pipe before answering, when the answer cannot be decoded,
// or when the decoded answer is anything other than procRun.
func syncParentReady(pipe io.ReadWriter) error {
	// Tell parent.
	if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
		return err
	}

	// Wait for parent to give the all-clear.
	var procSync syncT
	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
		if err == io.EOF {
			return fmt.Errorf("parent closed synchronisation channel")
		}
		// Any other decode failure means we never received a valid
		// go-ahead, so it must not be treated as success.
		return err
	}

	// The flag can only be validated after a successful decode; checking it
	// inside the error branch would let any well-formed message through.
	if procSync.Type != procRun {
		return fmt.Errorf("invalid synchronisation flag from parent")
	}
	return nil
}
// joinExistingNamespaces gets all the namespace paths specified for the container and // joinExistingNamespaces gets all the namespace paths specified for the container and
// does a setns on the namespace fd so that the current process joins the namespace. // does a setns on the namespace fd so that the current process joins the namespace.
func joinExistingNamespaces(namespaces []configs.Namespace) error { func joinExistingNamespaces(namespaces []configs.Namespace) error {
@ -309,7 +332,7 @@ func killCgroupProcesses(m cgroups.Manager) error {
if err := m.Freeze(configs.Frozen); err != nil { if err := m.Freeze(configs.Frozen); err != nil {
logrus.Warn(err) logrus.Warn(err)
} }
pids, err := m.GetPids() pids, err := m.GetAllPids()
if err != nil { if err != nil {
m.Freeze(configs.Thawed) m.Freeze(configs.Thawed)
return err return err

View File

@ -0,0 +1,67 @@
// +build linux
package keyctl
import (
"fmt"
"syscall"
"strings"
"strconv"
"unsafe"
)
// Operation codes passed as the first argument of the keyctl system call.
const (
	KEYCTL_JOIN_SESSION_KEYRING = 1
	KEYCTL_SETPERM              = 5
	KEYCTL_DESCRIBE             = 6
)

// KeySerial is the identifier of a keyring as returned by the keyctl join
// operation and accepted by the other keyctl operations in this package.
type KeySerial uint32

// JoinSessionKeyring issues the KEYCTL_JOIN_SESSION_KEYRING keyctl operation
// and returns the serial of the resulting session keyring.
//
// An empty name is handed to the kernel as a NULL pointer; a non-empty name
// is converted to a NUL-terminated byte string first, and that conversion's
// error (e.g. for a name containing a NUL byte) is returned as-is with a zero
// serial. A failing system call is reported as a formatted error.
func JoinSessionKeyring(name string) (KeySerial, error) {
	var namePtr *byte
	if name != "" {
		p, err := syscall.BytePtrFromString(name)
		if err != nil {
			return 0, err
		}
		namePtr = p
	}

	serial, _, errno := syscall.Syscall(
		syscall.SYS_KEYCTL,
		KEYCTL_JOIN_SESSION_KEYRING,
		uintptr(unsafe.Pointer(namePtr)),
		0,
	)
	if errno != 0 {
		return 0, fmt.Errorf("could not create session key: %v", errno)
	}
	return KeySerial(serial), nil
}
// ModKeyringPerm modifies the permissions on a keyring: it reads the current
// permissions, ANDs them with mask (clearing bits) and then ORs in setbits
// (setting additional bits).
//
// The current permissions are obtained with KEYCTL_DESCRIBE into a 1024-byte
// buffer; the result is split on ';' and the fourth field is parsed as a
// hexadecimal 32-bit value. Fewer than five fields is reported as the buffer
// being too small. The new value is applied with KEYCTL_SETPERM. System call
// failures are returned as the raw errno.
func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
	buf := make([]byte, 1024)
	_, _, errno := syscall.Syscall6(
		syscall.SYS_KEYCTL,
		uintptr(KEYCTL_DESCRIBE),
		uintptr(ringId),
		uintptr(unsafe.Pointer(&buf[0])),
		uintptr(len(buf)),
		0, 0,
	)
	if errno != 0 {
		return errno
	}

	fields := strings.Split(string(buf), ";")
	if len(fields) < 5 {
		return fmt.Errorf("Destination buffer for key description is too small")
	}

	// parse permissions
	current, err := strconv.ParseUint(fields[3], 16, 32)
	if err != nil {
		return err
	}

	updated := (uint32(current) & mask) | setbits
	_, _, errno = syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(updated))
	if errno != 0 {
		return errno
	}
	return nil
}

View File

@ -12,31 +12,32 @@ import (
const oomCgroupName = "memory" const oomCgroupName = "memory"
// notifyOnOOM returns channel on which you can expect event about OOM, type PressureLevel uint
// if process died without OOM this channel will be closed.
// s is current *libcontainer.State for container. const (
func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) { LowPressure PressureLevel = iota
dir := paths[oomCgroupName] MediumPressure
if dir == "" { CriticalPressure
return nil, fmt.Errorf("There is no path for %q in state", oomCgroupName) )
}
oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control")) func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
evFile, err := os.Open(filepath.Join(cgDir, evName))
if err != nil { if err != nil {
return nil, err return nil, err
} }
fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0) fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0)
if syserr != 0 { if syserr != 0 {
oomControl.Close() evFile.Close()
return nil, syserr return nil, syserr
} }
eventfd := os.NewFile(fd, "eventfd") eventfd := os.NewFile(fd, "eventfd")
eventControlPath := filepath.Join(dir, "cgroup.event_control") eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
data := fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd()) data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil { if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
eventfd.Close() eventfd.Close()
oomControl.Close() evFile.Close()
return nil, err return nil, err
} }
ch := make(chan struct{}) ch := make(chan struct{})
@ -44,7 +45,7 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
defer func() { defer func() {
close(ch) close(ch)
eventfd.Close() eventfd.Close()
oomControl.Close() evFile.Close()
}() }()
buf := make([]byte, 8) buf := make([]byte, 8)
for { for {
@ -61,3 +62,28 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
}() }()
return ch, nil return ch, nil
} }
// notifyOnOOM returns a channel on which OOM events for the container can be
// received; if the process died without OOM this channel will be closed.
//
// paths maps cgroup subsystem names to their directories. The entry for the
// memory subsystem must be present and non-empty, otherwise an error is
// returned. The registration itself is delegated to registerMemoryEvent on
// the "memory.oom_control" file with an empty argument.
func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
	if memDir := paths[oomCgroupName]; memDir != "" {
		return registerMemoryEvent(memDir, "memory.oom_control", "")
	}
	return nil, fmt.Errorf("path %q missing", oomCgroupName)
}
// notifyMemoryPressure registers for memory pressure events of the given
// level on the container's memory cgroup and returns the event channel
// produced by registerMemoryEvent.
//
// paths maps cgroup subsystem names to their directories; a missing or empty
// memory entry is an error. level must be one of LowPressure, MediumPressure
// or CriticalPressure, which are passed to the "memory.pressure_level" file
// as "low", "medium" and "critical" respectively; any other value is
// rejected.
func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
	memDir := paths[oomCgroupName]
	if memDir == "" {
		return nil, fmt.Errorf("path %q missing", oomCgroupName)
	}

	var levelName string
	switch level {
	case LowPressure:
		levelName = "low"
	case MediumPressure:
		levelName = "medium"
	case CriticalPressure:
		levelName = "critical"
	default:
		return nil, fmt.Errorf("invalid pressure level %d", level)
	}
	return registerMemoryEvent(memDir, "memory.pressure_level", levelName)
}

View File

@ -17,6 +17,7 @@
#include <sched.h> #include <sched.h>
#include <signal.h> #include <signal.h>
#include <bits/sockaddr.h>
#include <linux/netlink.h> #include <linux/netlink.h>
#include <linux/types.h> #include <linux/types.h>
#include <stdint.h> #include <stdint.h>

View File

@ -55,7 +55,7 @@ type Process struct {
// Wait releases any resources associated with the Process // Wait releases any resources associated with the Process
func (p Process) Wait() (*os.ProcessState, error) { func (p Process) Wait() (*os.ProcessState, error) {
if p.ops == nil { if p.ops == nil {
return nil, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted) return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
} }
return p.ops.wait() return p.ops.wait()
} }
@ -65,7 +65,7 @@ func (p Process) Pid() (int, error) {
// math.MinInt32 is returned here, because it's invalid value // math.MinInt32 is returned here, because it's invalid value
// for the kill() system call. // for the kill() system call.
if p.ops == nil { if p.ops == nil {
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted) return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
} }
return p.ops.pid(), nil return p.ops.pid(), nil
} }
@ -73,7 +73,7 @@ func (p Process) Pid() (int, error) {
// Signal sends a signal to the Process. // Signal sends a signal to the Process.
func (p Process) Signal(sig os.Signal) error { func (p Process) Signal(sig os.Signal) error {
if p.ops == nil { if p.ops == nil {
return newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted) return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
} }
return p.ops.signal(sig) return p.ops.signal(sig)
} }

View File

@ -5,6 +5,7 @@ package libcontainer
import ( import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"io" "io"
"os" "os"
"os/exec" "os/exec"
@ -87,6 +88,7 @@ func (p *setnsProcess) start() (err error) {
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemError(err) return newSystemError(err)
} }
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err) return newSystemError(err)
} }
@ -96,6 +98,7 @@ func (p *setnsProcess) start() (err error) {
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err) return newSystemError(err)
} }
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil { if ierr != nil {
p.wait() p.wait()
return newSystemError(ierr) return newSystemError(ierr)
@ -199,7 +202,6 @@ func (p *initProcess) start() (err error) {
return newSystemError(err) return newSystemError(err)
} }
p.setExternalDescriptors(fds) p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children // Do this before syncing with child so that no children
// can escape the cgroup // can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil { if err := p.manager.Apply(p.pid()); err != nil {
@ -230,13 +232,54 @@ func (p *initProcess) start() (err error) {
if err := p.sendConfig(); err != nil { if err := p.sendConfig(); err != nil {
return newSystemError(err) return newSystemError(err)
} }
// wait for the child process to fully complete and receive an error message var (
// if one was encoutered procSync syncT
var ierr *genericError sentRun bool
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { ierr *genericError
)
loop:
for {
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemError(err)
}
switch procSync.Type {
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
return newSystemError(err)
}
sentRun = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encountered
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
if ierr != nil {
break loop
}
// Programmer error.
panic("No error following JSON procError payload.")
default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
}
}
if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process"))
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err) return newSystemError(err)
} }
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil { if ierr != nil {
p.wait()
return newSystemError(ierr) return newSystemError(ierr)
} }
return nil return nil
@ -270,12 +313,10 @@ func (p *initProcess) startTime() (string, error) {
} }
func (p *initProcess) sendConfig() error { func (p *initProcess) sendConfig() error {
// send the state to the container's init process then shutdown writes for the parent // send the config to the container's init process, we don't use JSON Encode
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { // here because there might be a problem in JSON decoder in some cases, see:
return err // https://github.com/docker/docker/issues/14203#issuecomment-174177790
} return utils.WriteJSON(p.parentPipe, p.config)
// shutdown writes for the parent side of the pipe
return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR)
} }
func (p *initProcess) createNetworkInterfaces() error { func (p *initProcess) createNetworkInterfaces() error {

View File

@ -18,6 +18,8 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/system"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
) )
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
@ -293,12 +295,31 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc. // checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
// dest is required to be an abs path and have any symlinks resolved before calling this function. // dest is required to be an abs path and have any symlinks resolved before calling this function.
func checkMountDestination(rootfs, dest string) error { func checkMountDestination(rootfs, dest string) error {
if filepath.Clean(rootfs) == filepath.Clean(dest) { if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
return fmt.Errorf("mounting into / is prohibited") return fmt.Errorf("mounting into / is prohibited")
} }
invalidDestinations := []string{ invalidDestinations := []string{
"/proc", "/proc",
} }
// White list, it should be sub directories of invalid destinations
validDestinations := []string{
// These entries can be bind mounted by files emulated by fuse,
// so commands like top, free displays stats in container.
"/proc/cpuinfo",
"/proc/diskstats",
"/proc/meminfo",
"/proc/stat",
"/proc/net/dev",
}
for _, valid := range validDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
if err != nil {
return err
}
if path == "." {
return nil
}
}
for _, invalid := range invalidDestinations { for _, invalid := range invalidDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest) path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest)
if err != nil { if err != nil {
@ -321,7 +342,7 @@ func setupDevSymlinks(rootfs string) error {
// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
// in /dev if it exists in /proc. // in /dev if it exists in /proc.
if _, err := os.Stat("/proc/kcore"); err == nil { if _, err := os.Stat("/proc/kcore"); err == nil {
links = append(links, [2]string{"/proc/kcore", "/dev/kcore"}) links = append(links, [2]string{"/proc/kcore", "/dev/core"})
} }
for _, link := range links { for _, link := range links {
var ( var (
@ -365,11 +386,12 @@ func reOpenDevNull() error {
// Create the device nodes in the container. // Create the device nodes in the container.
func createDevices(config *configs.Config) error { func createDevices(config *configs.Config) error {
useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
oldMask := syscall.Umask(0000) oldMask := syscall.Umask(0000)
for _, node := range config.Devices { for _, node := range config.Devices {
// containers running in a user namespace are not allowed to mknod // containers running in a user namespace are not allowed to mknod
// devices so we can just bind mount it from the host. // devices so we can just bind mount it from the host.
if err := createDeviceNode(config.Rootfs, node, config.Namespaces.Contains(configs.NEWUSER)); err != nil { if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
syscall.Umask(oldMask) syscall.Umask(oldMask)
return err return err
} }

View File

@ -231,10 +231,14 @@ func ReserveLabel(scon string) {
} }
} }
// selinuxEnforcePath returns the path of the "enforce" file located directly
// under selinuxPath.
func selinuxEnforcePath() string {
return fmt.Sprintf("%s/enforce", selinuxPath)
}
func SelinuxGetEnforce() int { func SelinuxGetEnforce() int {
var enforce int var enforce int
enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath)) enforceS, err := readCon(selinuxEnforcePath())
if err != nil { if err != nil {
return -1 return -1
} }
@ -246,6 +250,10 @@ func SelinuxGetEnforce() int {
return enforce return enforce
} }
// SelinuxSetEnforce writes mode, formatted as a decimal integer, to the file
// named by selinuxEnforcePath using writeCon and returns writeCon's error.
func SelinuxSetEnforce(mode int) error {
return writeCon(selinuxEnforcePath(), fmt.Sprintf("%d", mode))
}
func SelinuxGetEnforceMode() int { func SelinuxGetEnforceMode() int {
switch readConfig(selinuxTag) { switch readConfig(selinuxTag) {
case "enforcing": case "enforcing":

View File

@ -6,6 +6,7 @@ import (
"os" "os"
"github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
@ -18,12 +19,21 @@ type linuxSetnsInit struct {
} }
func (l *linuxSetnsInit) Init() error { func (l *linuxSetnsInit) Init() error {
// do not inherit the parent's session keyring
if _, err := keyctl.JoinSessionKeyring("_ses"); err != nil {
return err
}
if err := setupRlimits(l.config.Config); err != nil { if err := setupRlimits(l.config.Config); err != nil {
return err return err
} }
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err return err
} }
if l.config.Config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
if l.config.Config.Seccomp != nil { if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err return err

View File

@ -3,22 +3,41 @@
package libcontainer package libcontainer
import ( import (
"io"
"os" "os"
"syscall" "syscall"
"github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
) )
type linuxStandardInit struct { type linuxStandardInit struct {
pipe io.ReadWriter
parentPid int parentPid int
config *initConfig config *initConfig
} }
// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves,
// copying the value from the kernel.
const PR_SET_NO_NEW_PRIVS = 0x26
func (l *linuxStandardInit) Init() error { func (l *linuxStandardInit) Init() error {
// do not inherit the parent's session keyring
sessKeyId, err := keyctl.JoinSessionKeyring("")
if err != nil {
return err
}
// make session keyring searchable
// without user ns we need 'UID' search permissions
// with user ns we need 'other' search permissions
if err := keyctl.ModKeyringPerm(sessKeyId, 0xffffffff, 0x080008); err != nil {
return err
}
// join any namespaces via a path to the namespace fd if provided // join any namespaces via a path to the namespace fd if provided
if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil { if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
return err return err
@ -50,7 +69,6 @@ func (l *linuxStandardInit) Init() error {
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err return err
} }
label.Init() label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace // InitializeMountNamespace() can be executed only for a new mount namespace
if l.config.Config.Namespaces.Contains(configs.NEWNS) { if l.config.Config.Namespaces.Contains(configs.NEWNS) {
@ -75,7 +93,6 @@ func (l *linuxStandardInit) Init() error {
return err return err
} }
} }
for _, path := range l.config.Config.ReadonlyPaths { for _, path := range l.config.Config.ReadonlyPaths {
if err := remountReadonly(path); err != nil { if err := remountReadonly(path); err != nil {
return err return err
@ -90,6 +107,17 @@ func (l *linuxStandardInit) Init() error {
if err != nil { if err != nil {
return err return err
} }
if l.config.Config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
}
if l.config.Config.Seccomp != nil { if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err return err
@ -109,5 +137,6 @@ func (l *linuxStandardInit) Init() error {
if syscall.Getppid() != l.parentPid { if syscall.Getppid() != l.parentPid {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
} }
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
} }

View File

@ -0,0 +1,226 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"path/filepath"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/configs"
)
// newStateTransitionError builds a stateTransitionError describing a rejected
// move between two container states. The From and To fields are filled with
// the string form of each state's status.
func newStateTransitionError(from, to containerState) error {
	e := new(stateTransitionError)
	e.From = from.status().String()
	e.To = to.status().String()
	return e
}
// stateTransitionError reports an attempt to move a container between two
// states for which no transition is permitted. From and To hold the names of
// the source and destination states.
type stateTransitionError struct {
	From string
	To   string
}

// Error implements the error interface, naming both states involved.
func (e *stateTransitionError) Error() string {
	const format = "invalid state transition from %s to %s"
	return fmt.Sprintf(format, e.From, e.To)
}
// containerState is implemented by each state a container can be in
// (e.g. stoppedState, runningState).
type containerState interface {
// transition attempts to move the container into the given state and
// returns an error when that move is not permitted from this state.
transition(containerState) error
// destroy tears the container down if this state permits it.
destroy() error
// status reports the Status value corresponding to this state.
status() Status
}
// destroy tears down the container c and leaves it in the stopped state.
//
// The steps run in a fixed order: when the configuration has no NEWPID
// namespace the processes in the cgroup are killed (a failure there is only
// logged as a warning), the cgroup is destroyed, the container root directory
// is removed, the init process reference is cleared and the poststop hooks
// are run. Every step is attempted even if an earlier one failed; the first
// error encountered among cgroup destruction, directory removal and hooks is
// the one returned.
func destroy(c *linuxContainer) error {
	hasPidNamespace := c.config.Namespaces.Contains(configs.NEWPID)
	if !hasPidNamespace {
		if killErr := killCgroupProcesses(c.cgroupManager); killErr != nil {
			logrus.Warn(killErr)
		}
	}

	result := c.cgroupManager.Destroy()

	// Always try to remove the root directory; only report its error when
	// nothing failed before it.
	removeErr := os.RemoveAll(c.root)
	if result == nil {
		result = removeErr
	}

	c.initProcess = nil

	// Hooks run regardless of earlier failures, with the same reporting rule.
	hookErr := runPoststopHooks(c)
	if result == nil {
		result = hookErr
	}

	c.state = &stoppedState{c: c}
	return result
}
// runPoststopHooks runs the configured Poststop hooks of c in order, passing
// each one a HookState built from the container's version, id and rootfs.
// It stops at, and returns, the first hook error. When no hooks are
// configured it does nothing and returns nil.
func runPoststopHooks(c *linuxContainer) error {
	if c.config.Hooks == nil {
		return nil
	}

	state := configs.HookState{
		Version: c.config.Version,
		ID:      c.id,
		Root:    c.config.Rootfs,
	}
	for _, hook := range c.config.Hooks.Poststop {
		err := hook.Run(state)
		if err != nil {
			return err
		}
	}
	return nil
}
// stoppedState represents a container in a stopped/destroyed state.
type stoppedState struct {
// c is the container this state belongs to.
c *linuxContainer
}
// status reports Destroyed for a stopped container.
func (b *stoppedState) status() Status {
return Destroyed
}
// transition moves a stopped container to the running or restored state by
// installing s as the container's state. A request to move to the stopped
// state is accepted without changing anything; any other target yields a
// state transition error.
func (b *stoppedState) transition(s containerState) error {
	switch s.(type) {
	case *stoppedState:
		// Already stopped: nothing to update.
		return nil
	case *runningState, *restoredState:
		b.c.state = s
		return nil
	default:
		return newStateTransitionError(b, s)
	}
}
// destroy tears down the container unconditionally by delegating to the
// package-level destroy function.
func (b *stoppedState) destroy() error {
return destroy(b.c)
}
// runningState represents a container that is currently running.
type runningState struct {
// c is the container this state belongs to.
c *linuxContainer
}
// status reports Running.
func (r *runningState) status() Status {
return Running
}
// transition handles a state change requested of a running container.
//
// Moving to the paused state installs s directly. Moving to the stopped state
// is only allowed once isRunning reports that the container is no longer
// running; otherwise a ContainerNotStopped error is returned. A request to
// move to the running state is accepted without changing anything, and any
// other target yields a state transition error.
func (r *runningState) transition(s containerState) error {
	switch s.(type) {
	case *runningState:
		return nil
	case *pausedState:
		r.c.state = s
		return nil
	case *stoppedState:
		alive, err := r.c.isRunning()
		if err != nil {
			return err
		}
		if alive {
			return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
		}
		r.c.state = s
		return nil
	default:
		return newStateTransitionError(r, s)
	}
}
// destroy tears the container down, but only when isRunning reports that it
// has actually stopped. If it is still running a ContainerNotStopped error is
// returned; an error from isRunning itself is returned unchanged.
func (r *runningState) destroy() error {
	switch alive, err := r.c.isRunning(); {
	case err != nil:
		return err
	case alive:
		return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
	default:
		return destroy(r.c)
	}
}
// pausedState represents a container that is currently paused. It cannot be
// destroyed while its processes are still running; destroy only proceeds once
// isRunning reports that the container is no longer running.
type pausedState struct {
// c is the container this state belongs to.
c *linuxContainer
}
// status reports Paused.
func (p *pausedState) status() Status {
return Paused
}
// transition moves a paused container to the running or stopped state by
// installing s as the container's state. A request to move to the paused
// state is accepted without changing anything; any other target yields a
// state transition error.
func (p *pausedState) transition(s containerState) error {
	switch s.(type) {
	case *pausedState:
		// Already paused: nothing to update.
		return nil
	case *runningState, *stoppedState:
		p.c.state = s
		return nil
	default:
		return newStateTransitionError(p, s)
	}
}
// destroy tears down a paused container only when isRunning reports that it
// is no longer running. In that case the cgroup is thawed first and the
// container is then destroyed. If the container is still running a
// ContainerPaused error is returned instead.
func (p *pausedState) destroy() error {
	alive, err := p.c.isRunning()
	if err != nil {
		return err
	}
	if alive {
		return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
	}

	// Thaw the cgroup before tearing the container down.
	if thawErr := p.c.cgroupManager.Freeze(configs.Thawed); thawErr != nil {
		return thawErr
	}
	return destroy(p.c)
}
// restoredState is the same as the running state but also has associated
// checkpoint information that may need to be destroyed when the container is
// stopped and destroy is called.
type restoredState struct {
// imageDir is not read by any method visible in this file.
// NOTE(review): presumably the checkpoint image directory - confirm against callers.
imageDir string
// c is the container this state belongs to.
c *linuxContainer
}
// status reports Running, the same value as runningState.
func (r *restoredState) status() Status {
return Running
}
// transition accepts a move to the stopped or running state and rejects every
// other target with a state transition error.
//
// NOTE(review): unlike the other state types, an accepted transition here
// does not assign the new state to the container; confirm this is intended.
func (r *restoredState) transition(s containerState) error {
	switch s.(type) {
	case *stoppedState, *runningState:
		return nil
	default:
		return newStateTransitionError(r, s)
	}
}
// destroy stats the "checkpoint" entry under the container root and then
// tears the container down. A missing entry is not an error; any other stat
// failure aborts the destroy and is returned.
func (r *restoredState) destroy() error {
	checkpoint := filepath.Join(r.c.root, "checkpoint")
	if _, err := os.Stat(checkpoint); err != nil && !os.IsNotExist(err) {
		return err
	}
	return destroy(r.c)
}
// createdState is used whenever a container is restored, loaded, or has
// additional processes set up inside it, and it should not be destroyed when
// it is exiting.
type createdState struct {
// c is the container this state belongs to.
c *linuxContainer
// s is the status value reported by status().
s Status
}
// status reports the Status stored in the state.
func (n *createdState) status() Status {
return n.s
}
// transition installs s as the container's state. Every target state is
// accepted, so this never returns an error.
func (n *createdState) transition(s containerState) error {
n.c.state = s
return nil
}
// destroy first refreshes the container's state and then delegates to the
// destroy method of whatever state the container holds afterwards. An error
// from the refresh is returned without attempting the destroy.
func (n *createdState) destroy() error {
	err := n.c.refreshState()
	if err != nil {
		return err
	}
	current := n.c.state
	return current.destroy()
}

View File

@ -3,6 +3,9 @@
package system package system
import ( import (
"bufio"
"fmt"
"os"
"os/exec" "os/exec"
"syscall" "syscall"
"unsafe" "unsafe"
@ -75,3 +78,45 @@ func Setctty() error {
} }
return nil return nil
} }
// RunningInUserNS detects whether the current process is running in a user
// namespace by inspecting the first line of /proc/self/uid_map.
//
// It returns false when the file cannot be opened (the kernel only provides
// it when user namespaces are supported), when its first line cannot be read,
// or when that line describes the full range of 4294967295 uids starting at
// uid 0, which is taken to mean the initial user namespace. Any other
// mapping yields true.
//
// Copied from github.com/lxc/lxd/shared/util.go
func RunningInUserNS() bool {
	uidMap, err := os.Open("/proc/self/uid_map")
	if err != nil {
		return false
	}
	defer uidMap.Close()

	first, _, err := bufio.NewReader(uidMap).ReadLine()
	if err != nil {
		return false
	}

	// The scan result is not checked: fields that fail to parse stay zero, so
	// a malformed line never matches the full-range mapping below and is
	// reported as being inside a user namespace.
	var inside, outside, length int64
	fmt.Sscanf(string(first), "%d %d %d", &inside, &outside, &length)

	fullRange := inside == 0 && outside == 0 && length == 4294967295
	return !fullRange
}
// Prctl issues the prctl system call with the given option and four
// arguments. It returns nil when the call reports no error and the resulting
// syscall.Errno otherwise. The value returned by the system call itself is
// discarded.
func Prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
	if _, _, errno := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0); errno != 0 {
		return errno
	}
	return nil
}

View File

@ -5,6 +5,7 @@ import (
"encoding/hex" "encoding/hex"
"encoding/json" "encoding/json"
"io" "io"
"os"
"path/filepath" "path/filepath"
"syscall" "syscall"
) )
@ -54,3 +55,32 @@ func WriteJSON(w io.Writer, v interface{}) error {
_, err = w.Write(data) _, err = w.Write(data)
return err return err
} }
// CleanPath makes a path safe for use with filepath.Join.
//
// Absolute paths are cleaned and stay absolute. Relative paths are cleaned,
// then resolved as if they were rooted at '/', and finally made relative
// again, so that leading ".." components cannot climb above whatever prefix
// the result is later joined to. An empty path is returned unchanged.
//
// All of this is purely lexical: a path that traverses symlinks is not made
// safe by CleanPath.
func CleanPath(path string) string {
	if len(path) == 0 {
		return ""
	}

	// Clean first so that inputs such as "/../../../../../" are collapsed.
	cleaned := filepath.Clean(path)
	if filepath.IsAbs(cleaned) {
		// Absolute paths must not be turned into relative ones.
		return filepath.Clean(cleaned)
	}

	// Anchor the relative path at the root so that any "../" prefix is
	// absorbed, then strip the root again.
	root := string(os.PathSeparator)
	anchored := filepath.Clean(root + cleaned)
	// Rel cannot fail here: anchored is always located under root.
	relative, _ := filepath.Rel(root, anchored)

	return filepath.Clean(relative)
}