mirror of https://github.com/docker/docs.git
vendor: update runc/libcontainer
This includes all of v0.0.8 as well as a few bug fixes that popped up during vendoring. Signed-off-by: Aleksa Sarai <asarai@suse.com>
This commit is contained in:
parent
7eed9a642e
commit
093dd39686
|
@ -59,7 +59,7 @@ clone git github.com/miekg/pkcs11 80f102b5cac759de406949c47f0928b99bd64cdf
|
||||||
clone git github.com/docker/go v1.5.1-1-1-gbaf439e
|
clone git github.com/docker/go v1.5.1-1-1-gbaf439e
|
||||||
clone git github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c
|
clone git github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c
|
||||||
|
|
||||||
clone git github.com/opencontainers/runc 3d8a20bb772defc28c355534d83486416d1719b4 # libcontainer
|
clone git github.com/opencontainers/runc ce72f86a2b54bc114d6ffb51f6500479b2d42154 # libcontainer
|
||||||
clone git github.com/seccomp/libseccomp-golang 1b506fc7c24eec5a3693cdcbed40d9c226cfc6a1
|
clone git github.com/seccomp/libseccomp-golang 1b506fc7c24eec5a3693cdcbed40d9c226cfc6a1
|
||||||
# libcontainer deps (see src/github.com/opencontainers/runc/Godeps/Godeps.json)
|
# libcontainer deps (see src/github.com/opencontainers/runc/Godeps/Godeps.json)
|
||||||
clone git github.com/coreos/go-systemd v4
|
clone git github.com/coreos/go-systemd v4
|
||||||
|
|
|
@ -10,80 +10,165 @@ host system and which is (optionally) isolated from other containers in the syst
|
||||||
|
|
||||||
#### Using libcontainer
|
#### Using libcontainer
|
||||||
|
|
||||||
To create a container you first have to initialize an instance of a factory
|
Because containers are spawned in a two step process you will need a binary that
|
||||||
that will handle the creation and initialization for a container.
|
will be executed as the init process for the container. In libcontainer, we use
|
||||||
|
the current binary (/proc/self/exe) to be executed as the init process, and use
|
||||||
Because containers are spawned in a two step process you will need to provide
|
arg "init". We call the first step process "bootstrap", so you always need an "init"
|
||||||
arguments to a binary that will be executed as the init process for the container.
|
function as the entry of "bootstrap".
|
||||||
To use the current binary that is spawning the containers and acting as the parent
|
|
||||||
you can use `os.Args[0]` and we have a command called `init` setup.
|
|
||||||
|
|
||||||
```go
|
```go
|
||||||
root, err := libcontainer.New("/var/lib/container", libcontainer.InitArgs(os.Args[0], "init"))
|
func init() {
|
||||||
|
if len(os.Args) > 1 && os.Args[1] == "init" {
|
||||||
|
runtime.GOMAXPROCS(1)
|
||||||
|
runtime.LockOSThread()
|
||||||
|
factory, _ := libcontainer.New("")
|
||||||
|
if err := factory.StartInitialization(); err != nil {
|
||||||
|
logrus.Fatal(err)
|
||||||
|
}
|
||||||
|
panic("--this line should have never been executed, congratulations--")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Then to create a container you first have to initialize an instance of a factory
|
||||||
|
that will handle the creation and initialization for a container.
|
||||||
|
|
||||||
|
```go
|
||||||
|
factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
logrus.Fatal(err)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Once you have an instance of the factory created we can create a configuration
|
Once you have an instance of the factory created we can create a configuration
|
||||||
struct describing how the container is to be created. A sample would look similar to this:
|
struct describing how the container is to be created. A sample would look similar to this:
|
||||||
|
|
||||||
```go
|
```go
|
||||||
|
defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
|
||||||
config := &configs.Config{
|
config := &configs.Config{
|
||||||
Rootfs: rootfs,
|
Rootfs: "/your/path/to/rootfs",
|
||||||
Capabilities: []string{
|
Capabilities: []string{
|
||||||
"CAP_CHOWN",
|
"CAP_CHOWN",
|
||||||
"CAP_DAC_OVERRIDE",
|
"CAP_DAC_OVERRIDE",
|
||||||
"CAP_FSETID",
|
"CAP_FSETID",
|
||||||
"CAP_FOWNER",
|
"CAP_FOWNER",
|
||||||
"CAP_MKNOD",
|
"CAP_MKNOD",
|
||||||
"CAP_NET_RAW",
|
"CAP_NET_RAW",
|
||||||
"CAP_SETGID",
|
"CAP_SETGID",
|
||||||
"CAP_SETUID",
|
"CAP_SETUID",
|
||||||
"CAP_SETFCAP",
|
"CAP_SETFCAP",
|
||||||
"CAP_SETPCAP",
|
"CAP_SETPCAP",
|
||||||
"CAP_NET_BIND_SERVICE",
|
"CAP_NET_BIND_SERVICE",
|
||||||
"CAP_SYS_CHROOT",
|
"CAP_SYS_CHROOT",
|
||||||
"CAP_KILL",
|
"CAP_KILL",
|
||||||
"CAP_AUDIT_WRITE",
|
"CAP_AUDIT_WRITE",
|
||||||
},
|
},
|
||||||
Namespaces: configs.Namespaces([]configs.Namespace{
|
Namespaces: configs.Namespaces([]configs.Namespace{
|
||||||
{Type: configs.NEWNS},
|
{Type: configs.NEWNS},
|
||||||
{Type: configs.NEWUTS},
|
{Type: configs.NEWUTS},
|
||||||
{Type: configs.NEWIPC},
|
{Type: configs.NEWIPC},
|
||||||
{Type: configs.NEWPID},
|
{Type: configs.NEWPID},
|
||||||
{Type: configs.NEWNET},
|
{Type: configs.NEWUSER},
|
||||||
}),
|
{Type: configs.NEWNET},
|
||||||
Cgroups: &configs.Cgroup{
|
}),
|
||||||
Name: "test-container",
|
Cgroups: &configs.Cgroup{
|
||||||
Parent: "system",
|
Name: "test-container",
|
||||||
AllowAllDevices: false,
|
Parent: "system",
|
||||||
AllowedDevices: configs.DefaultAllowedDevices,
|
Resources: &configs.Resources{
|
||||||
},
|
MemorySwappiness: -1,
|
||||||
|
AllowAllDevices: false,
|
||||||
Devices: configs.DefaultAutoCreatedDevices,
|
AllowedDevices: configs.DefaultAllowedDevices,
|
||||||
Hostname: "testing",
|
},
|
||||||
Networks: []*configs.Network{
|
},
|
||||||
{
|
MaskPaths: []string{
|
||||||
Type: "loopback",
|
"/proc/kcore",
|
||||||
Address: "127.0.0.1/0",
|
},
|
||||||
Gateway: "localhost",
|
ReadonlyPaths: []string{
|
||||||
},
|
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
|
||||||
},
|
},
|
||||||
Rlimits: []configs.Rlimit{
|
Devices: configs.DefaultAutoCreatedDevices,
|
||||||
{
|
Hostname: "testing",
|
||||||
Type: syscall.RLIMIT_NOFILE,
|
Mounts: []*configs.Mount{
|
||||||
Hard: uint64(1024),
|
{
|
||||||
Soft: uint64(1024),
|
Source: "proc",
|
||||||
},
|
Destination: "/proc",
|
||||||
},
|
Device: "proc",
|
||||||
|
Flags: defaultMountFlags,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Source: "tmpfs",
|
||||||
|
Destination: "/dev",
|
||||||
|
Device: "tmpfs",
|
||||||
|
Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME,
|
||||||
|
Data: "mode=755",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Source: "devpts",
|
||||||
|
Destination: "/dev/pts",
|
||||||
|
Device: "devpts",
|
||||||
|
Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC,
|
||||||
|
Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Device: "tmpfs",
|
||||||
|
Source: "shm",
|
||||||
|
Destination: "/dev/shm",
|
||||||
|
Data: "mode=1777,size=65536k",
|
||||||
|
Flags: defaultMountFlags,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Source: "mqueue",
|
||||||
|
Destination: "/dev/mqueue",
|
||||||
|
Device: "mqueue",
|
||||||
|
Flags: defaultMountFlags,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Source: "sysfs",
|
||||||
|
Destination: "/sys",
|
||||||
|
Device: "sysfs",
|
||||||
|
Flags: defaultMountFlags | syscall.MS_RDONLY,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
UidMappings: []configs.IDMap{
|
||||||
|
{
|
||||||
|
ContainerID: 0,
|
||||||
|
Host: 1000,
|
||||||
|
size: 65536,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
GidMappings: []configs.IDMap{
|
||||||
|
{
|
||||||
|
ContainerID: 0,
|
||||||
|
Host: 1000,
|
||||||
|
size: 65536,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
Networks: []*configs.Network{
|
||||||
|
{
|
||||||
|
Type: "loopback",
|
||||||
|
Address: "127.0.0.1/0",
|
||||||
|
Gateway: "localhost",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
Rlimits: []configs.Rlimit{
|
||||||
|
{
|
||||||
|
Type: syscall.RLIMIT_NOFILE,
|
||||||
|
Hard: uint64(1025),
|
||||||
|
Soft: uint64(1025),
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Once you have the configuration populated you can create a container:
|
Once you have the configuration populated you can create a container:
|
||||||
|
|
||||||
```go
|
```go
|
||||||
container, err := root.Create("container-id", config)
|
container, err := factory.Create("container-id", config)
|
||||||
|
if err != nil {
|
||||||
|
logrus.Fatal(err)
|
||||||
|
return
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
To spawn bash as the initial process inside the container and have the
|
To spawn bash as the initial process inside the container and have the
|
||||||
|
@ -91,23 +176,25 @@ processes pid returned in order to wait, signal, or kill the process:
|
||||||
|
|
||||||
```go
|
```go
|
||||||
process := &libcontainer.Process{
|
process := &libcontainer.Process{
|
||||||
Args: []string{"/bin/bash"},
|
Args: []string{"/bin/bash"},
|
||||||
Env: []string{"PATH=/bin"},
|
Env: []string{"PATH=/bin"},
|
||||||
User: "daemon",
|
User: "daemon",
|
||||||
Stdin: os.Stdin,
|
Stdin: os.Stdin,
|
||||||
Stdout: os.Stdout,
|
Stdout: os.Stdout,
|
||||||
Stderr: os.Stderr,
|
Stderr: os.Stderr,
|
||||||
}
|
}
|
||||||
|
|
||||||
err := container.Start(process)
|
err := container.Start(process)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
logrus.Fatal(err)
|
||||||
|
container.Destroy()
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// wait for the process to finish.
|
// wait for the process to finish.
|
||||||
status, err := process.Wait()
|
_, err := process.Wait()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
logrus.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// destroy the container.
|
// destroy the container.
|
||||||
|
@ -124,7 +211,6 @@ processes, err := container.Processes()
|
||||||
// its processes.
|
// its processes.
|
||||||
stats, err := container.Stats()
|
stats, err := container.Stats()
|
||||||
|
|
||||||
|
|
||||||
// pause all processes inside the container.
|
// pause all processes inside the container.
|
||||||
container.Pause()
|
container.Pause()
|
||||||
|
|
||||||
|
|
|
@ -60,7 +60,7 @@ are required to be mounted within the rootfs that the runtime will setup.
|
||||||
After a container's filesystems are mounted within the newly created
|
After a container's filesystems are mounted within the newly created
|
||||||
mount namespace `/dev` will need to be populated with a set of device nodes.
|
mount namespace `/dev` will need to be populated with a set of device nodes.
|
||||||
It is expected that a rootfs does not need to have any device nodes specified
|
It is expected that a rootfs does not need to have any device nodes specified
|
||||||
for `/dev` witin the rootfs as the container will setup the correct devices
|
for `/dev` within the rootfs as the container will setup the correct devices
|
||||||
that are required for executing a container's process.
|
that are required for executing a container's process.
|
||||||
|
|
||||||
| Path | Mode | Access |
|
| Path | Mode | Access |
|
||||||
|
@ -142,6 +142,7 @@ system resources like cpu, memory, and device access.
|
||||||
| perf_event | 1 |
|
| perf_event | 1 |
|
||||||
| freezer | 1 |
|
| freezer | 1 |
|
||||||
| hugetlb | 1 |
|
| hugetlb | 1 |
|
||||||
|
| pids | 1 |
|
||||||
|
|
||||||
|
|
||||||
All cgroup subsystems are joined so that statistics can be collected from
|
All cgroup subsystems are joined so that statistics can be collected from
|
||||||
|
@ -199,7 +200,7 @@ provide a good default for security and flexibility for the applications.
|
||||||
| CAP_SYS_BOOT | 0 |
|
| CAP_SYS_BOOT | 0 |
|
||||||
| CAP_LEASE | 0 |
|
| CAP_LEASE | 0 |
|
||||||
| CAP_WAKE_ALARM | 0 |
|
| CAP_WAKE_ALARM | 0 |
|
||||||
| CAP_BLOCK_SUSPE | 0 |
|
| CAP_BLOCK_SUSPEND | 0 |
|
||||||
|
|
||||||
|
|
||||||
Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
|
Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
|
||||||
|
|
|
@ -15,6 +15,9 @@ type Manager interface {
|
||||||
// Returns the PIDs inside the cgroup set
|
// Returns the PIDs inside the cgroup set
|
||||||
GetPids() ([]int, error)
|
GetPids() ([]int, error)
|
||||||
|
|
||||||
|
// Returns the PIDs inside the cgroup set & all sub-cgroups
|
||||||
|
GetAllPids() ([]int, error)
|
||||||
|
|
||||||
// Returns statistics for the cgroup set
|
// Returns statistics for the cgroup set
|
||||||
GetStats() (*Stats, error)
|
GetStats() (*Stats, error)
|
||||||
|
|
||||||
|
|
|
@ -14,6 +14,7 @@ import (
|
||||||
|
|
||||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||||
"github.com/opencontainers/runc/libcontainer/configs"
|
"github.com/opencontainers/runc/libcontainer/configs"
|
||||||
|
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -23,6 +24,7 @@ var (
|
||||||
&MemoryGroup{},
|
&MemoryGroup{},
|
||||||
&CpuGroup{},
|
&CpuGroup{},
|
||||||
&CpuacctGroup{},
|
&CpuacctGroup{},
|
||||||
|
&PidsGroup{},
|
||||||
&BlkioGroup{},
|
&BlkioGroup{},
|
||||||
&HugetlbGroup{},
|
&HugetlbGroup{},
|
||||||
&NetClsGroup{},
|
&NetClsGroup{},
|
||||||
|
@ -93,11 +95,10 @@ func getCgroupRoot() (string, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
type cgroupData struct {
|
type cgroupData struct {
|
||||||
root string
|
root string
|
||||||
parent string
|
innerPath string
|
||||||
name string
|
config *configs.Cgroup
|
||||||
config *configs.Cgroup
|
pid int
|
||||||
pid int
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) Apply(pid int) (err error) {
|
func (m *Manager) Apply(pid int) (err error) {
|
||||||
|
@ -112,6 +113,22 @@ func (m *Manager) Apply(pid int) (err error) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if c.Paths != nil {
|
||||||
|
paths := make(map[string]string)
|
||||||
|
for name, path := range c.Paths {
|
||||||
|
_, err := d.path(name)
|
||||||
|
if err != nil {
|
||||||
|
if cgroups.IsNotFound(err) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
paths[name] = path
|
||||||
|
}
|
||||||
|
m.Paths = paths
|
||||||
|
return cgroups.EnterPid(m.Paths, pid)
|
||||||
|
}
|
||||||
|
|
||||||
paths := make(map[string]string)
|
paths := make(map[string]string)
|
||||||
defer func() {
|
defer func() {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -135,17 +152,13 @@ func (m *Manager) Apply(pid int) (err error) {
|
||||||
paths[sys.Name()] = p
|
paths[sys.Name()] = p
|
||||||
}
|
}
|
||||||
m.Paths = paths
|
m.Paths = paths
|
||||||
|
|
||||||
if paths["cpu"] != "" {
|
|
||||||
if err := CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) Destroy() error {
|
func (m *Manager) Destroy() error {
|
||||||
|
if m.Cgroups.Paths != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
m.mu.Lock()
|
m.mu.Lock()
|
||||||
defer m.mu.Unlock()
|
defer m.mu.Unlock()
|
||||||
if err := cgroups.RemovePaths(m.Paths); err != nil {
|
if err := cgroups.RemovePaths(m.Paths); err != nil {
|
||||||
|
@ -179,15 +192,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) Set(container *configs.Config) error {
|
func (m *Manager) Set(container *configs.Config) error {
|
||||||
for name, path := range m.Paths {
|
for _, sys := range subsystems {
|
||||||
sys, err := subsystems.Get(name)
|
// Generate fake cgroup data.
|
||||||
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
|
d, err := getCgroupData(container.Cgroups, -1)
|
||||||
continue
|
if err != nil {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
// Get the path, but don't error out if the cgroup wasn't found.
|
||||||
|
path, err := d.path(sys.Name())
|
||||||
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
if err := sys.Set(path, container.Cgroups); err != nil {
|
if err := sys.Set(path, container.Cgroups); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if m.Paths["cpu"] != "" {
|
||||||
|
if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -217,41 +243,28 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) GetPids() ([]int, error) {
|
func (m *Manager) GetPids() ([]int, error) {
|
||||||
d, err := getCgroupData(m.Cgroups, 0)
|
dir, err := getCgroupPath(m.Cgroups)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
dir, err := d.path("devices")
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return cgroups.GetPids(dir)
|
return cgroups.GetPids(dir)
|
||||||
}
|
}
|
||||||
|
|
||||||
// pathClean makes a path safe for use with filepath.Join. This is done by not
|
func (m *Manager) GetAllPids() ([]int, error) {
|
||||||
// only cleaning the path, but also (if the path is relative) adding a leading
|
dir, err := getCgroupPath(m.Cgroups)
|
||||||
// '/' and cleaning it (then removing the leading '/'). This ensures that a
|
if err != nil {
|
||||||
// path resulting from prepending another path will always resolve to lexically
|
return nil, err
|
||||||
// be a subdirectory of the prefixed path. This is all done lexically, so paths
|
}
|
||||||
// that include symlinks won't be safe as a result of using pathClean.
|
return cgroups.GetAllPids(dir)
|
||||||
func pathClean(path string) string {
|
}
|
||||||
// Ensure that all paths are cleaned (especially problematic ones like
|
|
||||||
// "/../../../../../" which can cause lots of issues).
|
|
||||||
path = filepath.Clean(path)
|
|
||||||
|
|
||||||
// If the path isn't absolute, we need to do more processing to fix paths
|
func getCgroupPath(c *configs.Cgroup) (string, error) {
|
||||||
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
|
d, err := getCgroupData(c, 0)
|
||||||
// paths to relative ones.
|
if err != nil {
|
||||||
if !filepath.IsAbs(path) {
|
return "", err
|
||||||
path = filepath.Clean(string(os.PathSeparator) + path)
|
|
||||||
// This can't fail, as (by definition) all paths are relative to root.
|
|
||||||
path, _ = filepath.Rel(string(os.PathSeparator), path)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clean the path again for good measure.
|
return d.path("devices")
|
||||||
return filepath.Clean(path)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
|
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
|
||||||
|
@ -260,15 +273,25 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clean the parent slice path.
|
if (c.Name != "" || c.Parent != "") && c.Path != "" {
|
||||||
c.Parent = pathClean(c.Parent)
|
return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
|
||||||
|
}
|
||||||
|
|
||||||
|
// XXX: Do not remove this code. Path safety is important! -- cyphar
|
||||||
|
cgPath := libcontainerUtils.CleanPath(c.Path)
|
||||||
|
cgParent := libcontainerUtils.CleanPath(c.Parent)
|
||||||
|
cgName := libcontainerUtils.CleanPath(c.Name)
|
||||||
|
|
||||||
|
innerPath := cgPath
|
||||||
|
if innerPath == "" {
|
||||||
|
innerPath = filepath.Join(cgParent, cgName)
|
||||||
|
}
|
||||||
|
|
||||||
return &cgroupData{
|
return &cgroupData{
|
||||||
root: root,
|
root: root,
|
||||||
parent: c.Parent,
|
innerPath: innerPath,
|
||||||
name: c.Name,
|
config: c,
|
||||||
config: c,
|
pid: pid,
|
||||||
pid: pid,
|
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -296,11 +319,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
cgPath := filepath.Join(raw.parent, raw.name)
|
|
||||||
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
|
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
|
||||||
if filepath.IsAbs(cgPath) {
|
if filepath.IsAbs(raw.innerPath) {
|
||||||
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
|
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
|
||||||
return filepath.Join(raw.root, filepath.Base(mnt), cgPath), nil
|
return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
parentPath, err := raw.parentPath(subsystem, mnt, root)
|
parentPath, err := raw.parentPath(subsystem, mnt, root)
|
||||||
|
@ -308,7 +330,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
return filepath.Join(parentPath, cgPath), nil
|
return filepath.Join(parentPath, raw.innerPath), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (raw *cgroupData) join(subsystem string) (string, error) {
|
func (raw *cgroupData) join(subsystem string) (string, error) {
|
||||||
|
|
|
@ -22,15 +22,10 @@ func (s *BlkioGroup) Name() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *BlkioGroup) Apply(d *cgroupData) error {
|
func (s *BlkioGroup) Apply(d *cgroupData) error {
|
||||||
dir, err := d.join("blkio")
|
_, err := d.join("blkio")
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := s.Set(dir, d.config); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -22,15 +22,10 @@ func (s *CpuGroup) Name() string {
|
||||||
func (s *CpuGroup) Apply(d *cgroupData) error {
|
func (s *CpuGroup) Apply(d *cgroupData) error {
|
||||||
// We always want to join the cpu group, to allow fair cpu scheduling
|
// We always want to join the cpu group, to allow fair cpu scheduling
|
||||||
// on a container basis
|
// on a container basis
|
||||||
dir, err := d.join("cpu")
|
_, err := d.join("cpu")
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := s.Set(dir, d.config); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,7 @@ import (
|
||||||
|
|
||||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||||
"github.com/opencontainers/runc/libcontainer/configs"
|
"github.com/opencontainers/runc/libcontainer/configs"
|
||||||
|
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
|
||||||
)
|
)
|
||||||
|
|
||||||
type CpusetGroup struct {
|
type CpusetGroup struct {
|
||||||
|
@ -64,11 +65,6 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
|
||||||
if err := s.ensureParent(dir, root); err != nil {
|
if err := s.ensureParent(dir, root); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
// the default values inherit from parent cgroup are already set in
|
|
||||||
// s.ensureParent, cover these if we have our own
|
|
||||||
if err := s.Set(dir, cgroup); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
// because we are not using d.join we need to place the pid into the procs file
|
// because we are not using d.join we need to place the pid into the procs file
|
||||||
// unlike the other subsystems
|
// unlike the other subsystems
|
||||||
if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
|
if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
|
||||||
|
@ -93,7 +89,7 @@ func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []b
|
||||||
// its parent.
|
// its parent.
|
||||||
func (s *CpusetGroup) ensureParent(current, root string) error {
|
func (s *CpusetGroup) ensureParent(current, root string) error {
|
||||||
parent := filepath.Dir(current)
|
parent := filepath.Dir(current)
|
||||||
if filepath.Clean(parent) == root {
|
if libcontainerUtils.CleanPath(parent) == root {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
// Avoid infinite recursion.
|
// Avoid infinite recursion.
|
||||||
|
|
|
@ -15,21 +15,29 @@ func (s *DevicesGroup) Name() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *DevicesGroup) Apply(d *cgroupData) error {
|
func (s *DevicesGroup) Apply(d *cgroupData) error {
|
||||||
dir, err := d.join("devices")
|
_, err := d.join("devices")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// We return the error even if it is a `not found` error, because the devices
|
// We return the error even if it is a `not found` error, because the devices
|
||||||
// cgroup is a hard requirement for the container's security.
|
// cgroup is a hard requirement for the container's security.
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := s.Set(dir, d.config); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
|
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
|
||||||
|
devices := cgroup.Resources.Devices
|
||||||
|
if len(devices) > 0 {
|
||||||
|
for _, dev := range devices {
|
||||||
|
file := "devices.deny"
|
||||||
|
if dev.Allow {
|
||||||
|
file = "devices.allow"
|
||||||
|
}
|
||||||
|
if err := writeFile(path, file, dev.CgroupString()); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
if !cgroup.Resources.AllowAllDevices {
|
if !cgroup.Resources.AllowAllDevices {
|
||||||
if err := writeFile(path, "devices.deny", "a"); err != nil {
|
if err := writeFile(path, "devices.deny", "a"); err != nil {
|
||||||
return err
|
return err
|
||||||
|
|
|
@ -19,15 +19,10 @@ func (s *FreezerGroup) Name() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *FreezerGroup) Apply(d *cgroupData) error {
|
func (s *FreezerGroup) Apply(d *cgroupData) error {
|
||||||
dir, err := d.join("freezer")
|
_, err := d.join("freezer")
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := s.Set(dir, d.config); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,15 +19,10 @@ func (s *HugetlbGroup) Name() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *HugetlbGroup) Apply(d *cgroupData) error {
|
func (s *HugetlbGroup) Apply(d *cgroupData) error {
|
||||||
dir, err := d.join("hugetlb")
|
_, err := d.join("hugetlb")
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := s.Set(dir, d.config); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -32,8 +32,9 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// We have to set kernel memory here, as we can't change it once
|
||||||
if err := s.Set(path, d.config); err != nil {
|
// processes have been attached.
|
||||||
|
if err := s.SetKernelMemory(path, d.config); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -50,7 +51,17 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
|
||||||
|
// This has to be done separately because it has special constraints (it
|
||||||
|
// can't be done after there are processes attached to the cgroup).
|
||||||
|
if cgroup.Resources.KernelMemory > 0 {
|
||||||
|
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -70,12 +81,6 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if cgroup.Resources.KernelMemory > 0 {
|
|
||||||
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if cgroup.Resources.OomKillDisable {
|
if cgroup.Resources.OomKillDisable {
|
||||||
if err := writeFile(path, "memory.oom_control", "1"); err != nil {
|
if err := writeFile(path, "memory.oom_control", "1"); err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -157,6 +162,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
|
||||||
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
|
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
|
||||||
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
|
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
|
||||||
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
|
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
|
||||||
|
limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
|
||||||
|
|
||||||
value, err := getCgroupParamUint(path, usage)
|
value, err := getCgroupParamUint(path, usage)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -182,6 +188,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
|
||||||
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
|
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
|
||||||
}
|
}
|
||||||
memoryData.Failcnt = value
|
memoryData.Failcnt = value
|
||||||
|
value, err = getCgroupParamUint(path, limit)
|
||||||
|
if err != nil {
|
||||||
|
if moduleName != "memory" && os.IsNotExist(err) {
|
||||||
|
return cgroups.MemoryData{}, nil
|
||||||
|
}
|
||||||
|
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
|
||||||
|
}
|
||||||
|
memoryData.Limit = value
|
||||||
|
|
||||||
return memoryData, nil
|
return memoryData, nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,15 +15,10 @@ func (s *NetClsGroup) Name() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *NetClsGroup) Apply(d *cgroupData) error {
|
func (s *NetClsGroup) Apply(d *cgroupData) error {
|
||||||
dir, err := d.join("net_cls")
|
_, err := d.join("net_cls")
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := s.Set(dir, d.config); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,15 +15,10 @@ func (s *NetPrioGroup) Name() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *NetPrioGroup) Apply(d *cgroupData) error {
|
func (s *NetPrioGroup) Apply(d *cgroupData) error {
|
||||||
dir, err := d.join("net_prio")
|
_, err := d.join("net_prio")
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := s.Set(dir, d.config); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,57 @@
|
||||||
|
// +build linux
|
||||||
|
|
||||||
|
package fs
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||||
|
"github.com/opencontainers/runc/libcontainer/configs"
|
||||||
|
)
|
||||||
|
|
||||||
|
// PidsGroup is the handler for the "pids" cgroup subsystem. It holds no
// state of its own.
type PidsGroup struct{}

// Name reports the name of the subsystem this group handles.
func (s *PidsGroup) Name() string {
	const subsystem = "pids"
	return subsystem
}
|
||||||
|
|
||||||
|
func (s *PidsGroup) Apply(d *cgroupData) error {
|
||||||
|
_, err := d.join("pids")
|
||||||
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
|
||||||
|
if cgroup.Resources.PidsLimit != 0 {
|
||||||
|
// "max" is the fallback value.
|
||||||
|
limit := "max"
|
||||||
|
|
||||||
|
if cgroup.Resources.PidsLimit > 0 {
|
||||||
|
limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := writeFile(path, "pids.max", limit); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove forwards the result of d.path("pids") to removePath and returns
// whatever removePath returns.
// NOTE(review): presumably this deletes the container's pids cgroup
// directory — confirm against removePath.
func (s *PidsGroup) Remove(d *cgroupData) error {
	return removePath(d.path("pids"))
}
|
||||||
|
|
||||||
|
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
|
||||||
|
value, err := getCgroupParamUint(path, "pids.current")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to parse pids.current - %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
stats.PidsStats.Current = value
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -36,7 +36,9 @@ type MemoryData struct {
|
||||||
Usage uint64 `json:"usage,omitempty"`
|
Usage uint64 `json:"usage,omitempty"`
|
||||||
MaxUsage uint64 `json:"max_usage,omitempty"`
|
MaxUsage uint64 `json:"max_usage,omitempty"`
|
||||||
Failcnt uint64 `json:"failcnt"`
|
Failcnt uint64 `json:"failcnt"`
|
||||||
|
Limit uint64 `json:"limit"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type MemoryStats struct {
|
type MemoryStats struct {
|
||||||
// memory used for cache
|
// memory used for cache
|
||||||
Cache uint64 `json:"cache,omitempty"`
|
Cache uint64 `json:"cache,omitempty"`
|
||||||
|
@ -49,6 +51,11 @@ type MemoryStats struct {
|
||||||
Stats map[string]uint64 `json:"stats,omitempty"`
|
Stats map[string]uint64 `json:"stats,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PidsStats holds the statistics reported for the pids cgroup.
type PidsStats struct {
	// Current is the number of pids in the cgroup. It is omitted from the
	// JSON encoding when zero.
	Current uint64 `json:"current,omitempty"`
}
|
||||||
|
|
||||||
type BlkioStatEntry struct {
|
type BlkioStatEntry struct {
|
||||||
Major uint64 `json:"major,omitempty"`
|
Major uint64 `json:"major,omitempty"`
|
||||||
Minor uint64 `json:"minor,omitempty"`
|
Minor uint64 `json:"minor,omitempty"`
|
||||||
|
@ -80,6 +87,7 @@ type HugetlbStats struct {
|
||||||
type Stats struct {
|
type Stats struct {
|
||||||
CpuStats CpuStats `json:"cpu_stats,omitempty"`
|
CpuStats CpuStats `json:"cpu_stats,omitempty"`
|
||||||
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
|
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
|
||||||
|
PidsStats PidsStats `json:"pids_stats,omitempty"`
|
||||||
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
|
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
|
||||||
// the map is in the format "size of hugepage: stats of the hugepage"
|
// the map is in the format "size of hugepage: stats of the hugepage"
|
||||||
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
|
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
|
||||||
|
|
|
@ -26,6 +26,10 @@ func (m *Manager) GetPids() ([]int, error) {
|
||||||
return nil, fmt.Errorf("Systemd not supported")
|
return nil, fmt.Errorf("Systemd not supported")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *Manager) GetAllPids() ([]int, error) {
|
||||||
|
return nil, fmt.Errorf("Systemd not supported")
|
||||||
|
}
|
||||||
|
|
||||||
func (m *Manager) Destroy() error {
|
func (m *Manager) Destroy() error {
|
||||||
return fmt.Errorf("Systemd not supported")
|
return fmt.Errorf("Systemd not supported")
|
||||||
}
|
}
|
||||||
|
|
|
@ -55,6 +55,7 @@ var subsystems = subsystemSet{
|
||||||
&fs.MemoryGroup{},
|
&fs.MemoryGroup{},
|
||||||
&fs.CpuGroup{},
|
&fs.CpuGroup{},
|
||||||
&fs.CpuacctGroup{},
|
&fs.CpuacctGroup{},
|
||||||
|
&fs.PidsGroup{},
|
||||||
&fs.BlkioGroup{},
|
&fs.BlkioGroup{},
|
||||||
&fs.HugetlbGroup{},
|
&fs.HugetlbGroup{},
|
||||||
&fs.PerfEventGroup{},
|
&fs.PerfEventGroup{},
|
||||||
|
@ -167,6 +168,23 @@ func (m *Manager) Apply(pid int) error {
|
||||||
properties []systemdDbus.Property
|
properties []systemdDbus.Property
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if c.Paths != nil {
|
||||||
|
paths := make(map[string]string)
|
||||||
|
for name, path := range c.Paths {
|
||||||
|
_, err := getSubsystemPath(m.Cgroups, name)
|
||||||
|
if err != nil {
|
||||||
|
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
|
||||||
|
if cgroups.IsNotFound(err) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
paths[name] = path
|
||||||
|
}
|
||||||
|
m.Paths = paths
|
||||||
|
return cgroups.EnterPid(m.Paths, pid)
|
||||||
|
}
|
||||||
|
|
||||||
if c.Parent != "" {
|
if c.Parent != "" {
|
||||||
slice = c.Parent
|
slice = c.Parent
|
||||||
}
|
}
|
||||||
|
@ -233,7 +251,7 @@ func (m *Manager) Apply(pid int) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// we need to manually join the freezer, net_cls, net_prio and cpuset cgroup in systemd
|
// we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
|
||||||
// because it does not currently support it via the dbus api.
|
// because it does not currently support it via the dbus api.
|
||||||
if err := joinFreezer(c, pid); err != nil {
|
if err := joinFreezer(c, pid); err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -246,6 +264,10 @@ func (m *Manager) Apply(pid int) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := joinPids(c, pid); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
if err := joinCpuset(c, pid); err != nil {
|
if err := joinCpuset(c, pid); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -277,17 +299,13 @@ func (m *Manager) Apply(pid int) error {
|
||||||
paths[s.Name()] = subsystemPath
|
paths[s.Name()] = subsystemPath
|
||||||
}
|
}
|
||||||
m.Paths = paths
|
m.Paths = paths
|
||||||
|
|
||||||
if paths["cpu"] != "" {
|
|
||||||
if err := fs.CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) Destroy() error {
|
func (m *Manager) Destroy() error {
|
||||||
|
if m.Cgroups.Paths != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
m.mu.Lock()
|
m.mu.Lock()
|
||||||
defer m.mu.Unlock()
|
defer m.mu.Unlock()
|
||||||
theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
|
theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
|
||||||
|
@ -330,68 +348,74 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func joinCpu(c *configs.Cgroup, pid int) error {
|
func joinCpu(c *configs.Cgroup, pid int) error {
|
||||||
path, err := getSubsystemPath(c, "cpu")
|
_, err := join(c, "cpu", pid)
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if c.Resources.CpuQuota != 0 {
|
|
||||||
if err = writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(c.Resources.CpuQuota, 10)); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if c.Resources.CpuPeriod != 0 {
|
|
||||||
if err = writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(c.Resources.CpuPeriod, 10)); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if c.Resources.CpuRtPeriod != 0 {
|
|
||||||
if err = writeFile(path, "cpu.rt_period_us", strconv.FormatInt(c.Resources.CpuRtPeriod, 10)); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if c.Resources.CpuRtRuntime != 0 {
|
|
||||||
if err = writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(c.Resources.CpuRtRuntime, 10)); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func joinFreezer(c *configs.Cgroup, pid int) error {
|
func joinFreezer(c *configs.Cgroup, pid int) error {
|
||||||
path, err := join(c, "freezer", pid)
|
_, err := join(c, "freezer", pid)
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
freezer, err := subsystems.Get("freezer")
|
return nil
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return freezer.Set(path, c)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func joinNetPrio(c *configs.Cgroup, pid int) error {
|
func joinNetPrio(c *configs.Cgroup, pid int) error {
|
||||||
path, err := join(c, "net_prio", pid)
|
_, err := join(c, "net_prio", pid)
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
netPrio, err := subsystems.Get("net_prio")
|
return nil
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return netPrio.Set(path, c)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func joinNetCls(c *configs.Cgroup, pid int) error {
|
func joinNetCls(c *configs.Cgroup, pid int) error {
|
||||||
path, err := join(c, "net_cls", pid)
|
_, err := join(c, "net_cls", pid)
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
netcls, err := subsystems.Get("net_cls")
|
return nil
|
||||||
if err != nil {
|
}
|
||||||
|
|
||||||
|
func joinPids(c *configs.Cgroup, pid int) error {
|
||||||
|
_, err := join(c, "pids", pid)
|
||||||
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return netcls.Set(path, c)
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// expandSlice converts a systemd slice unit name into the nested path that
// corresponds to it. systemd represents the slice hierarchy using "-", so
// "test-a-b.slice" becomes "test.slice/test-a.slice/test-a-b.slice/". The
// trailing "/" is kept for backward compatibility.
//
// The root slice "-.slice" expands to "/". An error is returned when the name
// does not end in ".slice", is just ".slice", contains a path separator, or
// has an empty component (for example "test--a.slice" or "-test.slice").
func expandSlice(slice string) (string, error) {
	const suffix = ".slice"

	// Name has to end with ".slice", but can't be just ".slice".
	if len(slice) <= len(suffix) || !strings.HasSuffix(slice, suffix) {
		return "", fmt.Errorf("invalid slice name: %s", slice)
	}

	// Path-separators are not allowed.
	if strings.Contains(slice, "/") {
		return "", fmt.Errorf("invalid slice name: %s", slice)
	}

	sliceName := strings.TrimSuffix(slice, suffix)

	// "-.slice" is systemd's root slice; it maps to the top of the hierarchy
	// rather than to a component named "".
	if sliceName == "-" {
		return "/", nil
	}

	var path, prefix string
	for _, component := range strings.Split(sliceName, "-") {
		// test--a.slice isn't permitted, nor is -test.slice.
		if component == "" {
			return "", fmt.Errorf("invalid slice name: %s", slice)
		}

		// Append the component to the path and to the prefix, so each level
		// carries the full dashed name of its ancestors.
		path += prefix + component + suffix + "/"
		prefix += component + "-"
	}

	return path, nil
}
|
||||||
|
|
||||||
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
|
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
|
||||||
|
@ -410,6 +434,11 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
|
||||||
slice = c.Parent
|
slice = c.Parent
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slice, err = expandSlice(slice)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
|
return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -440,6 +469,14 @@ func (m *Manager) GetPids() ([]int, error) {
|
||||||
return cgroups.GetPids(path)
|
return cgroups.GetPids(path)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *Manager) GetAllPids() ([]int, error) {
|
||||||
|
path, err := getSubsystemPath(m.Cgroups, "devices")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return cgroups.GetAllPids(path)
|
||||||
|
}
|
||||||
|
|
||||||
func (m *Manager) GetStats() (*cgroups.Stats, error) {
|
func (m *Manager) GetStats() (*cgroups.Stats, error) {
|
||||||
m.mu.Lock()
|
m.mu.Lock()
|
||||||
defer m.mu.Unlock()
|
defer m.mu.Unlock()
|
||||||
|
@ -458,16 +495,23 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) Set(container *configs.Config) error {
|
func (m *Manager) Set(container *configs.Config) error {
|
||||||
for name, path := range m.Paths {
|
for _, sys := range subsystems {
|
||||||
sys, err := subsystems.Get(name)
|
// Get the subsystem path, but don't error out for not found cgroups.
|
||||||
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
|
path, err := getSubsystemPath(container.Cgroups, sys.Name())
|
||||||
continue
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := sys.Set(path, container.Cgroups); err != nil {
|
if err := sys.Set(path, container.Cgroups); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if m.Paths["cpu"] != "" {
|
||||||
|
if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -487,17 +531,13 @@ func getUnitName(c *configs.Cgroup) string {
|
||||||
// because systemd will re-write the device settings if it needs to re-apply the cgroup context.
|
// because systemd will re-write the device settings if it needs to re-apply the cgroup context.
|
||||||
// This happens at least for v208 when any sibling unit is started.
|
// This happens at least for v208 when any sibling unit is started.
|
||||||
func joinDevices(c *configs.Cgroup, pid int) error {
|
func joinDevices(c *configs.Cgroup, pid int) error {
|
||||||
path, err := join(c, "devices", pid)
|
_, err := join(c, "devices", pid)
|
||||||
// Even if it's `not found` error, we'll return err because devices cgroup
|
// Even if it's `not found` error, we'll return err because devices cgroup
|
||||||
// is hard requirement for container security.
|
// is hard requirement for container security.
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
devices, err := subsystems.Get("devices")
|
return nil
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return devices.Set(path, c)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func setKernelMemory(c *configs.Cgroup) error {
|
func setKernelMemory(c *configs.Cgroup) error {
|
||||||
|
@ -510,52 +550,16 @@ func setKernelMemory(c *configs.Cgroup) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if c.Resources.KernelMemory > 0 {
|
// This doesn't get called by manager.Set, so we need to do it here.
|
||||||
err = writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.Resources.KernelMemory, 10))
|
s := &fs.MemoryGroup{}
|
||||||
if err != nil {
|
return s.SetKernelMemory(path, c)
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func joinMemory(c *configs.Cgroup, pid int) error {
|
func joinMemory(c *configs.Cgroup, pid int) error {
|
||||||
path, err := getSubsystemPath(c, "memory")
|
_, err := join(c, "memory", pid)
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// -1 disables memoryswap
|
|
||||||
if c.Resources.MemorySwap > 0 {
|
|
||||||
err = writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Resources.MemorySwap, 10))
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if c.Resources.MemoryReservation > 0 {
|
|
||||||
err = writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Resources.MemoryReservation, 10))
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if c.Resources.OomKillDisable {
|
|
||||||
if err := writeFile(path, "memory.oom_control", "1"); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if c.Resources.MemorySwappiness >= 0 && c.Resources.MemorySwappiness <= 100 {
|
|
||||||
err = writeFile(path, "memory.swappiness", strconv.FormatInt(c.Resources.MemorySwappiness, 10))
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
} else if c.Resources.MemorySwappiness == -1 {
|
|
||||||
return nil
|
|
||||||
} else {
|
|
||||||
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", c.Resources.MemorySwappiness)
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -577,68 +581,25 @@ func joinCpuset(c *configs.Cgroup, pid int) error {
|
||||||
// expects device path instead of major minor numbers, which is also confusing
|
// expects device path instead of major minor numbers, which is also confusing
|
||||||
// for users. So we use fs work around for now.
|
// for users. So we use fs work around for now.
|
||||||
func joinBlkio(c *configs.Cgroup, pid int) error {
|
func joinBlkio(c *configs.Cgroup, pid int) error {
|
||||||
path, err := getSubsystemPath(c, "blkio")
|
_, err := join(c, "blkio", pid)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
// systemd doesn't directly support this in the dbus properties
|
|
||||||
if c.Resources.BlkioLeafWeight != 0 {
|
|
||||||
if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(c.Resources.BlkioLeafWeight), 10)); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, wd := range c.Resources.BlkioWeightDevice {
|
|
||||||
if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, td := range c.Resources.BlkioThrottleReadBpsDevice {
|
|
||||||
if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, td := range c.Resources.BlkioThrottleWriteBpsDevice {
|
|
||||||
if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, td := range c.Resources.BlkioThrottleReadIOPSDevice {
|
|
||||||
if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, td := range c.Resources.BlkioThrottleWriteIOPSDevice {
|
|
||||||
if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func joinHugetlb(c *configs.Cgroup, pid int) error {
|
func joinHugetlb(c *configs.Cgroup, pid int) error {
|
||||||
path, err := join(c, "hugetlb", pid)
|
_, err := join(c, "hugetlb", pid)
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
hugetlb, err := subsystems.Get("hugetlb")
|
return nil
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return hugetlb.Set(path, c)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func joinPerfEvent(c *configs.Cgroup, pid int) error {
|
func joinPerfEvent(c *configs.Cgroup, pid int) error {
|
||||||
path, err := join(c, "perf_event", pid)
|
_, err := join(c, "perf_event", pid)
|
||||||
if err != nil && !cgroups.IsNotFound(err) {
|
if err != nil && !cgroups.IsNotFound(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
perfEvent, err := subsystems.Get("perf_event")
|
return nil
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return perfEvent.Set(path, c)
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,6 +5,7 @@ package cgroups
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
@ -12,7 +13,6 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/docker/docker/pkg/mount"
|
|
||||||
"github.com/docker/go-units"
|
"github.com/docker/go-units"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -84,10 +84,19 @@ func FindCgroupMountpointDir() (string, error) {
|
||||||
// Safe as mountinfo encodes mountpoints with spaces as \040.
|
// Safe as mountinfo encodes mountpoints with spaces as \040.
|
||||||
index := strings.Index(text, " - ")
|
index := strings.Index(text, " - ")
|
||||||
postSeparatorFields := strings.Fields(text[index+3:])
|
postSeparatorFields := strings.Fields(text[index+3:])
|
||||||
if len(postSeparatorFields) < 3 {
|
numPostFields := len(postSeparatorFields)
|
||||||
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
|
|
||||||
|
// This is an error as we can't detect if the mount is for "cgroup"
|
||||||
|
if numPostFields == 0 {
|
||||||
|
return "", fmt.Errorf("Found no fields post '-' in %q", text)
|
||||||
}
|
}
|
||||||
|
|
||||||
if postSeparatorFields[0] == "cgroup" {
|
if postSeparatorFields[0] == "cgroup" {
|
||||||
|
// Check that the mount is properly formated.
|
||||||
|
if numPostFields < 3 {
|
||||||
|
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
|
||||||
|
}
|
||||||
|
|
||||||
return filepath.Dir(fields[4]), nil
|
return filepath.Dir(fields[4]), nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -112,11 +121,45 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
|
||||||
return getControllerPath(m.Subsystems[0], cgroups)
|
return getControllerPath(m.Subsystems[0], cgroups)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
|
||||||
|
res := make([]Mount, 0, len(ss))
|
||||||
|
scanner := bufio.NewScanner(mi)
|
||||||
|
for scanner.Scan() {
|
||||||
|
txt := scanner.Text()
|
||||||
|
sepIdx := strings.IndexByte(txt, '-')
|
||||||
|
if sepIdx == -1 {
|
||||||
|
return nil, fmt.Errorf("invalid mountinfo format")
|
||||||
|
}
|
||||||
|
if txt[sepIdx+2:sepIdx+8] != "cgroup" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fields := strings.Split(txt, " ")
|
||||||
|
m := Mount{
|
||||||
|
Mountpoint: fields[4],
|
||||||
|
Root: fields[3],
|
||||||
|
}
|
||||||
|
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
|
||||||
|
if strings.HasPrefix(opt, cgroupNamePrefix) {
|
||||||
|
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
|
||||||
|
}
|
||||||
|
if ss[opt] {
|
||||||
|
m.Subsystems = append(m.Subsystems, opt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
res = append(res, m)
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return res, nil
|
||||||
|
}
|
||||||
|
|
||||||
func GetCgroupMounts() ([]Mount, error) {
|
func GetCgroupMounts() ([]Mount, error) {
|
||||||
mounts, err := mount.GetMounts()
|
f, err := os.Open("/proc/self/mountinfo")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
all, err := GetAllSubsystems()
|
all, err := GetAllSubsystems()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -127,24 +170,7 @@ func GetCgroupMounts() ([]Mount, error) {
|
||||||
for _, s := range all {
|
for _, s := range all {
|
||||||
allMap[s] = true
|
allMap[s] = true
|
||||||
}
|
}
|
||||||
|
return getCgroupMountsHelper(allMap, f)
|
||||||
res := []Mount{}
|
|
||||||
for _, mount := range mounts {
|
|
||||||
if mount.Fstype == "cgroup" {
|
|
||||||
m := Mount{Mountpoint: mount.Mountpoint, Root: mount.Root}
|
|
||||||
|
|
||||||
for _, opt := range strings.Split(mount.VfsOpts, ",") {
|
|
||||||
if strings.HasPrefix(opt, cgroupNamePrefix) {
|
|
||||||
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
|
|
||||||
}
|
|
||||||
if allMap[opt] {
|
|
||||||
m.Subsystems = append(m.Subsystems, opt)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
res = append(res, m)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return res, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns all the cgroup subsystems supported by the kernel
|
// Returns all the cgroup subsystems supported by the kernel
|
||||||
|
@ -323,9 +349,14 @@ func GetHugePageSize() ([]string, error) {
|
||||||
return pageSizes, nil
|
return pageSizes, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetPids returns all pids, that were added to cgroup at path and to all its
|
// GetPids returns all pids, that were added to cgroup at path.
|
||||||
// subcgroups.
|
|
||||||
func GetPids(path string) ([]int, error) {
|
func GetPids(path string) ([]int, error) {
|
||||||
|
return readProcsFile(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAllPids returns all pids, that were added to cgroup at path and to all its
|
||||||
|
// subcgroups.
|
||||||
|
func GetAllPids(path string) ([]int, error) {
|
||||||
var pids []int
|
var pids []int
|
||||||
// collect pids from all sub-cgroups
|
// collect pids from all sub-cgroups
|
||||||
err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
|
err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
|
||||||
|
|
|
@ -11,25 +11,38 @@ const (
|
||||||
)
|
)
|
||||||
|
|
||||||
type Cgroup struct {
|
type Cgroup struct {
|
||||||
Name string `json:"name"`
|
// Deprecated, use Path instead
|
||||||
|
Name string `json:"name,omitempty"`
|
||||||
|
|
||||||
// name of parent cgroup or slice
|
// name of parent of cgroup or slice
|
||||||
Parent string `json:"parent"`
|
// Deprecated, use Path instead
|
||||||
|
Parent string `json:"parent,omitempty"`
|
||||||
|
|
||||||
|
// Path specifies the path to cgroups that are created and/or joined by the container.
|
||||||
|
// The path is assumed to be relative to the host system cgroup mountpoint.
|
||||||
|
Path string `json:"path"`
|
||||||
|
|
||||||
// ScopePrefix decribes prefix for the scope name
|
// ScopePrefix decribes prefix for the scope name
|
||||||
ScopePrefix string `json:"scope_prefix"`
|
ScopePrefix string `json:"scope_prefix"`
|
||||||
|
|
||||||
|
// Paths represent the absolute cgroups paths to join.
|
||||||
|
// This takes precedence over Path.
|
||||||
|
Paths map[string]string
|
||||||
|
|
||||||
// Resources contains various cgroups settings to apply
|
// Resources contains various cgroups settings to apply
|
||||||
*Resources
|
*Resources
|
||||||
}
|
}
|
||||||
|
|
||||||
type Resources struct {
|
type Resources struct {
|
||||||
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
|
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
|
||||||
AllowAllDevices bool `json:"allow_all_devices"`
|
// Deprecated
|
||||||
|
AllowAllDevices bool `json:"allow_all_devices,omitempty"`
|
||||||
|
// Deprecated
|
||||||
|
AllowedDevices []*Device `json:"allowed_devices,omitempty"`
|
||||||
|
// Deprecated
|
||||||
|
DeniedDevices []*Device `json:"denied_devices,omitempty"`
|
||||||
|
|
||||||
AllowedDevices []*Device `json:"allowed_devices"`
|
Devices []*Device `json:"devices"`
|
||||||
|
|
||||||
DeniedDevices []*Device `json:"denied_devices"`
|
|
||||||
|
|
||||||
// Memory limit (in bytes)
|
// Memory limit (in bytes)
|
||||||
Memory int64 `json:"memory"`
|
Memory int64 `json:"memory"`
|
||||||
|
@ -37,7 +50,7 @@ type Resources struct {
|
||||||
// Memory reservation or soft_limit (in bytes)
|
// Memory reservation or soft_limit (in bytes)
|
||||||
MemoryReservation int64 `json:"memory_reservation"`
|
MemoryReservation int64 `json:"memory_reservation"`
|
||||||
|
|
||||||
// Total memory usage (memory + swap); set `-1' to disable swap
|
// Total memory usage (memory + swap); set `-1` to enable unlimited swap
|
||||||
MemorySwap int64 `json:"memory_swap"`
|
MemorySwap int64 `json:"memory_swap"`
|
||||||
|
|
||||||
// Kernel memory limit (in bytes)
|
// Kernel memory limit (in bytes)
|
||||||
|
@ -64,6 +77,9 @@ type Resources struct {
|
||||||
// MEM to use
|
// MEM to use
|
||||||
CpusetMems string `json:"cpuset_mems"`
|
CpusetMems string `json:"cpuset_mems"`
|
||||||
|
|
||||||
|
// Process limit; set <= `0' to disable limit.
|
||||||
|
PidsLimit int64 `json:"pids_limit"`
|
||||||
|
|
||||||
// Specifies per cgroup weight, range is from 10 to 1000.
|
// Specifies per cgroup weight, range is from 10 to 1000.
|
||||||
BlkioWeight uint16 `json:"blkio_weight"`
|
BlkioWeight uint16 `json:"blkio_weight"`
|
||||||
|
|
||||||
|
|
|
@ -171,6 +171,9 @@ type Config struct {
|
||||||
// A default action to be taken if no rules match is also given.
|
// A default action to be taken if no rules match is also given.
|
||||||
Seccomp *Seccomp `json:"seccomp"`
|
Seccomp *Seccomp `json:"seccomp"`
|
||||||
|
|
||||||
|
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
|
||||||
|
NoNewPrivileges bool `json:"no_new_privileges"`
|
||||||
|
|
||||||
// Hooks are a collection of actions to perform at various container lifecycle events.
|
// Hooks are a collection of actions to perform at various container lifecycle events.
|
||||||
// Hooks are not able to be marshaled to json but they are also not needed to.
|
// Hooks are not able to be marshaled to json but they are also not needed to.
|
||||||
Hooks *Hooks `json:"-"`
|
Hooks *Hooks `json:"-"`
|
||||||
|
|
|
@ -35,6 +35,9 @@ type Device struct {
|
||||||
|
|
||||||
// Gid of the device.
|
// Gid of the device.
|
||||||
Gid uint32 `json:"gid"`
|
Gid uint32 `json:"gid"`
|
||||||
|
|
||||||
|
// Write the file to the allowed list
|
||||||
|
Allow bool `json:"allow"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Device) CgroupString() string {
|
func (d *Device) CgroupString() string {
|
||||||
|
|
|
@ -82,20 +82,6 @@ var (
|
||||||
Minor: 1,
|
Minor: 1,
|
||||||
Permissions: "rwm",
|
Permissions: "rwm",
|
||||||
},
|
},
|
||||||
{
|
|
||||||
Path: "/dev/tty0",
|
|
||||||
Type: 'c',
|
|
||||||
Major: 4,
|
|
||||||
Minor: 0,
|
|
||||||
Permissions: "rwm",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
Path: "/dev/tty1",
|
|
||||||
Type: 'c',
|
|
||||||
Major: 4,
|
|
||||||
Minor: 1,
|
|
||||||
Permissions: "rwm",
|
|
||||||
},
|
|
||||||
// /dev/pts/ - pts namespaces are "coming soon"
|
// /dev/pts/ - pts namespaces are "coming soon"
|
||||||
{
|
{
|
||||||
Path: "",
|
Path: "",
|
||||||
|
|
|
@ -6,6 +6,7 @@ package libcontainer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/opencontainers/runc/libcontainer/configs"
|
"github.com/opencontainers/runc/libcontainer/configs"
|
||||||
)
|
)
|
||||||
|
@ -14,8 +15,11 @@ import (
|
||||||
type Status int
|
type Status int
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
// The container exists but has not been run yet
|
||||||
|
Created Status = iota
|
||||||
|
|
||||||
// The container exists and is running.
|
// The container exists and is running.
|
||||||
Running Status = iota + 1
|
Running
|
||||||
|
|
||||||
// The container exists, it is in the process of being paused.
|
// The container exists, it is in the process of being paused.
|
||||||
Pausing
|
Pausing
|
||||||
|
@ -30,6 +34,25 @@ const (
|
||||||
Destroyed
|
Destroyed
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func (s Status) String() string {
|
||||||
|
switch s {
|
||||||
|
case Created:
|
||||||
|
return "created"
|
||||||
|
case Running:
|
||||||
|
return "running"
|
||||||
|
case Pausing:
|
||||||
|
return "pausing"
|
||||||
|
case Paused:
|
||||||
|
return "paused"
|
||||||
|
case Checkpointed:
|
||||||
|
return "checkpointed"
|
||||||
|
case Destroyed:
|
||||||
|
return "destroyed"
|
||||||
|
default:
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// BaseState represents the platform agnostic pieces relating to a
|
// BaseState represents the platform agnostic pieces relating to a
|
||||||
// running container's state
|
// running container's state
|
||||||
type BaseState struct {
|
type BaseState struct {
|
||||||
|
@ -39,9 +62,12 @@ type BaseState struct {
|
||||||
// InitProcessPid is the init process id in the parent namespace.
|
// InitProcessPid is the init process id in the parent namespace.
|
||||||
InitProcessPid int `json:"init_process_pid"`
|
InitProcessPid int `json:"init_process_pid"`
|
||||||
|
|
||||||
// InitProcessStartTime is the init process start time.
|
// InitProcessStartTime is the init process start time in clock cycles since boot time.
|
||||||
InitProcessStartTime string `json:"init_process_start"`
|
InitProcessStartTime string `json:"init_process_start"`
|
||||||
|
|
||||||
|
// Created is the unix timestamp for the creation time of the container in UTC
|
||||||
|
Created time.Time `json:"created"`
|
||||||
|
|
||||||
// Config is the container's configuration.
|
// Config is the container's configuration.
|
||||||
Config configs.Config `json:"config"`
|
Config configs.Config `json:"config"`
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,6 +15,7 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/Sirupsen/logrus"
|
"github.com/Sirupsen/logrus"
|
||||||
"github.com/golang/protobuf/proto"
|
"github.com/golang/protobuf/proto"
|
||||||
|
@ -38,6 +39,8 @@ type linuxContainer struct {
|
||||||
criuPath string
|
criuPath string
|
||||||
m sync.Mutex
|
m sync.Mutex
|
||||||
criuVersion int
|
criuVersion int
|
||||||
|
state containerState
|
||||||
|
created time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
// State represents a running container's state
|
// State represents a running container's state
|
||||||
|
@ -104,6 +107,12 @@ type Container interface {
|
||||||
// errors:
|
// errors:
|
||||||
// Systemerror - System error.
|
// Systemerror - System error.
|
||||||
NotifyOOM() (<-chan struct{}, error)
|
NotifyOOM() (<-chan struct{}, error)
|
||||||
|
|
||||||
|
// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
|
||||||
|
//
|
||||||
|
// errors:
|
||||||
|
// Systemerror - System error.
|
||||||
|
NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ID returns the container's unique ID
|
// ID returns the container's unique ID
|
||||||
|
@ -129,7 +138,7 @@ func (c *linuxContainer) State() (*State, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *linuxContainer) Processes() ([]int, error) {
|
func (c *linuxContainer) Processes() ([]int, error) {
|
||||||
pids, err := c.cgroupManager.GetPids()
|
pids, err := c.cgroupManager.GetAllPids()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, newSystemError(err)
|
return nil, newSystemError(err)
|
||||||
}
|
}
|
||||||
|
@ -183,22 +192,30 @@ func (c *linuxContainer) Start(process *Process) error {
|
||||||
}
|
}
|
||||||
return newSystemError(err)
|
return newSystemError(err)
|
||||||
}
|
}
|
||||||
if doInit {
|
// generate a timestamp indicating when the container was started
|
||||||
c.updateState(parent)
|
c.created = time.Now().UTC()
|
||||||
|
|
||||||
|
c.state = &runningState{
|
||||||
|
c: c,
|
||||||
}
|
}
|
||||||
if c.config.Hooks != nil {
|
if doInit {
|
||||||
s := configs.HookState{
|
if err := c.updateState(parent); err != nil {
|
||||||
Version: c.config.Version,
|
return err
|
||||||
ID: c.id,
|
|
||||||
Pid: parent.pid(),
|
|
||||||
Root: c.config.Rootfs,
|
|
||||||
}
|
}
|
||||||
for _, hook := range c.config.Hooks.Poststart {
|
if c.config.Hooks != nil {
|
||||||
if err := hook.Run(s); err != nil {
|
s := configs.HookState{
|
||||||
if err := parent.terminate(); err != nil {
|
Version: c.config.Version,
|
||||||
logrus.Warn(err)
|
ID: c.id,
|
||||||
|
Pid: parent.pid(),
|
||||||
|
Root: c.config.Rootfs,
|
||||||
|
}
|
||||||
|
for _, hook := range c.config.Hooks.Poststart {
|
||||||
|
if err := hook.Run(s); err != nil {
|
||||||
|
if err := parent.terminate(); err != nil {
|
||||||
|
logrus.Warn(err)
|
||||||
|
}
|
||||||
|
return newSystemError(err)
|
||||||
}
|
}
|
||||||
return newSystemError(err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -251,7 +268,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
|
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
|
||||||
t := "_LIBCONTAINER_INITTYPE=standard"
|
t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
|
||||||
cloneFlags := c.config.Namespaces.CloneFlags()
|
cloneFlags := c.config.Namespaces.CloneFlags()
|
||||||
if cloneFlags&syscall.CLONE_NEWUSER != 0 {
|
if cloneFlags&syscall.CLONE_NEWUSER != 0 {
|
||||||
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
|
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
|
||||||
|
@ -278,7 +295,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
|
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
|
||||||
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE=setns")
|
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
|
||||||
// for setns process, we dont have to set cloneflags as the process namespaces
|
// for setns process, we dont have to set cloneflags as the process namespaces
|
||||||
// will only be set via setns syscall
|
// will only be set via setns syscall
|
||||||
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
|
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
|
||||||
|
@ -321,54 +338,53 @@ func newPipe() (parent *os.File, child *os.File, err error) {
|
||||||
func (c *linuxContainer) Destroy() error {
|
func (c *linuxContainer) Destroy() error {
|
||||||
c.m.Lock()
|
c.m.Lock()
|
||||||
defer c.m.Unlock()
|
defer c.m.Unlock()
|
||||||
status, err := c.currentStatus()
|
return c.state.destroy()
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if status != Destroyed {
|
|
||||||
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
|
|
||||||
}
|
|
||||||
if !c.config.Namespaces.Contains(configs.NEWPID) {
|
|
||||||
if err := killCgroupProcesses(c.cgroupManager); err != nil {
|
|
||||||
logrus.Warn(err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
err = c.cgroupManager.Destroy()
|
|
||||||
if rerr := os.RemoveAll(c.root); err == nil {
|
|
||||||
err = rerr
|
|
||||||
}
|
|
||||||
c.initProcess = nil
|
|
||||||
if c.config.Hooks != nil {
|
|
||||||
s := configs.HookState{
|
|
||||||
Version: c.config.Version,
|
|
||||||
ID: c.id,
|
|
||||||
Root: c.config.Rootfs,
|
|
||||||
}
|
|
||||||
for _, hook := range c.config.Hooks.Poststop {
|
|
||||||
if err := hook.Run(s); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *linuxContainer) Pause() error {
|
func (c *linuxContainer) Pause() error {
|
||||||
c.m.Lock()
|
c.m.Lock()
|
||||||
defer c.m.Unlock()
|
defer c.m.Unlock()
|
||||||
return c.cgroupManager.Freeze(configs.Frozen)
|
status, err := c.currentStatus()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if status != Running {
|
||||||
|
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
|
||||||
|
}
|
||||||
|
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return c.state.transition(&pausedState{
|
||||||
|
c: c,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *linuxContainer) Resume() error {
|
func (c *linuxContainer) Resume() error {
|
||||||
c.m.Lock()
|
c.m.Lock()
|
||||||
defer c.m.Unlock()
|
defer c.m.Unlock()
|
||||||
return c.cgroupManager.Freeze(configs.Thawed)
|
status, err := c.currentStatus()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if status != Paused {
|
||||||
|
return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
|
||||||
|
}
|
||||||
|
if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return c.state.transition(&runningState{
|
||||||
|
c: c,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
|
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
|
||||||
return notifyOnOOM(c.cgroupManager.GetPaths())
|
return notifyOnOOM(c.cgroupManager.GetPaths())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
|
||||||
|
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
|
||||||
|
}
|
||||||
|
|
||||||
// XXX debug support, remove when debugging done.
|
// XXX debug support, remove when debugging done.
|
||||||
func addArgsFromEnv(evar string, args *[]string) {
|
func addArgsFromEnv(evar string, args *[]string) {
|
||||||
if e := os.Getenv(evar); e != "" {
|
if e := os.Getenv(evar); e != "" {
|
||||||
|
@ -460,7 +476,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
if criuOpts.ImagesDirectory == "" {
|
if criuOpts.ImagesDirectory == "" {
|
||||||
criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image")
|
return fmt.Errorf("invalid directory to save checkpoint")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Since a container can be C/R'ed multiple times,
|
// Since a container can be C/R'ed multiple times,
|
||||||
|
@ -579,11 +595,9 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
|
||||||
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
||||||
c.m.Lock()
|
c.m.Lock()
|
||||||
defer c.m.Unlock()
|
defer c.m.Unlock()
|
||||||
|
|
||||||
if err := c.checkCriuVersion("1.5.2"); err != nil {
|
if err := c.checkCriuVersion("1.5.2"); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if criuOpts.WorkDirectory == "" {
|
if criuOpts.WorkDirectory == "" {
|
||||||
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
|
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
|
||||||
}
|
}
|
||||||
|
@ -592,22 +606,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
||||||
if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
|
if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
workDir, err := os.Open(criuOpts.WorkDirectory)
|
workDir, err := os.Open(criuOpts.WorkDirectory)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer workDir.Close()
|
defer workDir.Close()
|
||||||
|
|
||||||
if criuOpts.ImagesDirectory == "" {
|
if criuOpts.ImagesDirectory == "" {
|
||||||
criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image")
|
return fmt.Errorf("invalid directory to restore checkpoint")
|
||||||
}
|
}
|
||||||
imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer imageDir.Close()
|
defer imageDir.Close()
|
||||||
|
|
||||||
// CRIU has a few requirements for a root directory:
|
// CRIU has a few requirements for a root directory:
|
||||||
// * it must be a mount point
|
// * it must be a mount point
|
||||||
// * its parent must not be overmounted
|
// * its parent must not be overmounted
|
||||||
|
@ -618,18 +629,15 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer os.Remove(root)
|
defer os.Remove(root)
|
||||||
|
|
||||||
root, err = filepath.EvalSymlinks(root)
|
root, err = filepath.EvalSymlinks(root)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
|
err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer syscall.Unmount(root, syscall.MNT_DETACH)
|
defer syscall.Unmount(root, syscall.MNT_DETACH)
|
||||||
|
|
||||||
t := criurpc.CriuReqType_RESTORE
|
t := criurpc.CriuReqType_RESTORE
|
||||||
req := &criurpc.CriuReq{
|
req := &criurpc.CriuReq{
|
||||||
Type: &t,
|
Type: &t,
|
||||||
|
@ -697,15 +705,13 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
||||||
fds []string
|
fds []string
|
||||||
fdJSON []byte
|
fdJSON []byte
|
||||||
)
|
)
|
||||||
|
|
||||||
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
|
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err = json.Unmarshal(fdJSON, &fds); err != nil {
|
if err := json.Unmarshal(fdJSON, &fds); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := range fds {
|
for i := range fds {
|
||||||
if s := fds[i]; strings.Contains(s, "pipe:") {
|
if s := fds[i]; strings.Contains(s, "pipe:") {
|
||||||
inheritFd := new(criurpc.InheritFd)
|
inheritFd := new(criurpc.InheritFd)
|
||||||
|
@ -714,12 +720,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
||||||
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
|
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return c.criuSwrk(process, req, criuOpts, true)
|
||||||
err = c.criuSwrk(process, req, criuOpts, true)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
|
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
|
||||||
|
@ -914,46 +915,43 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
|
||||||
if notify == nil {
|
if notify == nil {
|
||||||
return fmt.Errorf("invalid response: %s", resp.String())
|
return fmt.Errorf("invalid response: %s", resp.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case notify.GetScript() == "post-dump":
|
case notify.GetScript() == "post-dump":
|
||||||
if !opts.LeaveRunning {
|
f, err := os.Create(filepath.Join(c.root, "checkpoint"))
|
||||||
f, err := os.Create(filepath.Join(c.root, "checkpoint"))
|
if err != nil {
|
||||||
if err != nil {
|
return err
|
||||||
return err
|
|
||||||
}
|
|
||||||
f.Close()
|
|
||||||
}
|
}
|
||||||
break
|
f.Close()
|
||||||
|
|
||||||
case notify.GetScript() == "network-unlock":
|
case notify.GetScript() == "network-unlock":
|
||||||
if err := unlockNetwork(c.config); err != nil {
|
if err := unlockNetwork(c.config); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
break
|
|
||||||
|
|
||||||
case notify.GetScript() == "network-lock":
|
case notify.GetScript() == "network-lock":
|
||||||
if err := lockNetwork(c.config); err != nil {
|
if err := lockNetwork(c.config); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
break
|
|
||||||
|
|
||||||
case notify.GetScript() == "post-restore":
|
case notify.GetScript() == "post-restore":
|
||||||
pid := notify.GetPid()
|
pid := notify.GetPid()
|
||||||
r, err := newRestoredProcess(int(pid), fds)
|
r, err := newRestoredProcess(int(pid), fds)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
process.ops = r
|
||||||
// TODO: crosbymichael restore previous process information by saving the init process information in
|
if err := c.state.transition(&restoredState{
|
||||||
// the container's state file or separate process state files.
|
imageDir: opts.ImagesDirectory,
|
||||||
|
c: c,
|
||||||
|
}); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
if err := c.updateState(r); err != nil {
|
if err := c.updateState(r); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
process.ops = r
|
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
|
||||||
break
|
if !os.IsNotExist(err) {
|
||||||
|
logrus.Error(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -963,65 +961,108 @@ func (c *linuxContainer) updateState(process parentProcess) error {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
return c.saveState(state)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *linuxContainer) saveState(s *State) error {
|
||||||
f, err := os.Create(filepath.Join(c.root, stateFilename))
|
f, err := os.Create(filepath.Join(c.root, stateFilename))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
os.Remove(filepath.Join(c.root, "checkpoint"))
|
return utils.WriteJSON(f, s)
|
||||||
return utils.WriteJSON(f, state)
|
}
|
||||||
|
|
||||||
|
func (c *linuxContainer) deleteState() error {
|
||||||
|
return os.Remove(filepath.Join(c.root, stateFilename))
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *linuxContainer) currentStatus() (Status, error) {
|
func (c *linuxContainer) currentStatus() (Status, error) {
|
||||||
if _, err := os.Stat(filepath.Join(c.root, "checkpoint")); err == nil {
|
if err := c.refreshState(); err != nil {
|
||||||
return Checkpointed, nil
|
return -1, err
|
||||||
}
|
}
|
||||||
|
return c.state.status(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// refreshState needs to be called to verify that the current state on the
|
||||||
|
// container is what is true. Because consumers of libcontainer can use it
|
||||||
|
// out of process we need to verify the container's status based on runtime
|
||||||
|
// information and not rely on our in process info.
|
||||||
|
func (c *linuxContainer) refreshState() error {
|
||||||
|
paused, err := c.isPaused()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if paused {
|
||||||
|
return c.state.transition(&pausedState{c: c})
|
||||||
|
}
|
||||||
|
running, err := c.isRunning()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if running {
|
||||||
|
return c.state.transition(&runningState{c: c})
|
||||||
|
}
|
||||||
|
return c.state.transition(&stoppedState{c: c})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *linuxContainer) isRunning() (bool, error) {
|
||||||
if c.initProcess == nil {
|
if c.initProcess == nil {
|
||||||
return Destroyed, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
// return Running if the init process is alive
|
// return Running if the init process is alive
|
||||||
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
|
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
|
||||||
if err == syscall.ESRCH {
|
if err == syscall.ESRCH {
|
||||||
return Destroyed, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
return 0, newSystemError(err)
|
return false, newSystemError(err)
|
||||||
}
|
}
|
||||||
if c.config.Cgroups != nil && c.config.Cgroups.Resources != nil && c.config.Cgroups.Resources.Freezer == configs.Frozen {
|
return true, nil
|
||||||
return Paused, nil
|
}
|
||||||
|
|
||||||
|
func (c *linuxContainer) isPaused() (bool, error) {
|
||||||
|
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
return false, newSystemError(err)
|
||||||
}
|
}
|
||||||
return Running, nil
|
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *linuxContainer) currentState() (*State, error) {
|
func (c *linuxContainer) currentState() (*State, error) {
|
||||||
status, err := c.currentStatus()
|
var (
|
||||||
if err != nil {
|
startTime string
|
||||||
return nil, err
|
externalDescriptors []string
|
||||||
}
|
pid = -1
|
||||||
if status == Destroyed {
|
)
|
||||||
return nil, newGenericError(fmt.Errorf("container destroyed"), ContainerNotExists)
|
if c.initProcess != nil {
|
||||||
}
|
pid = c.initProcess.pid()
|
||||||
startTime, err := c.initProcess.startTime()
|
startTime, _ = c.initProcess.startTime()
|
||||||
if err != nil {
|
externalDescriptors = c.initProcess.externalDescriptors()
|
||||||
return nil, newSystemError(err)
|
|
||||||
}
|
}
|
||||||
state := &State{
|
state := &State{
|
||||||
BaseState: BaseState{
|
BaseState: BaseState{
|
||||||
ID: c.ID(),
|
ID: c.ID(),
|
||||||
Config: *c.config,
|
Config: *c.config,
|
||||||
InitProcessPid: c.initProcess.pid(),
|
InitProcessPid: pid,
|
||||||
InitProcessStartTime: startTime,
|
InitProcessStartTime: startTime,
|
||||||
|
Created: c.created,
|
||||||
},
|
},
|
||||||
CgroupPaths: c.cgroupManager.GetPaths(),
|
CgroupPaths: c.cgroupManager.GetPaths(),
|
||||||
NamespacePaths: make(map[configs.NamespaceType]string),
|
NamespacePaths: make(map[configs.NamespaceType]string),
|
||||||
ExternalDescriptors: c.initProcess.externalDescriptors(),
|
ExternalDescriptors: externalDescriptors,
|
||||||
}
|
}
|
||||||
for _, ns := range c.config.Namespaces {
|
if pid > 0 {
|
||||||
state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid())
|
for _, ns := range c.config.Namespaces {
|
||||||
}
|
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
|
||||||
for _, nsType := range configs.NamespaceTypes() {
|
}
|
||||||
if _, ok := state.NamespacePaths[nsType]; !ok {
|
for _, nsType := range configs.NamespaceTypes() {
|
||||||
ns := configs.Namespace{Type: nsType}
|
if _, ok := state.NamespacePaths[nsType]; !ok {
|
||||||
state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid())
|
ns := configs.Namespace{Type: nsType}
|
||||||
|
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return state, nil
|
return state, nil
|
||||||
|
|
|
@ -16,9 +16,10 @@ const (
|
||||||
ContainerPaused
|
ContainerPaused
|
||||||
ContainerNotStopped
|
ContainerNotStopped
|
||||||
ContainerNotRunning
|
ContainerNotRunning
|
||||||
|
ContainerNotPaused
|
||||||
|
|
||||||
// Process errors
|
// Process errors
|
||||||
ProcessNotExecuted
|
NoProcessOps
|
||||||
|
|
||||||
// Common errors
|
// Common errors
|
||||||
ConfigInvalid
|
ConfigInvalid
|
||||||
|
@ -46,6 +47,10 @@ func (c ErrorCode) String() string {
|
||||||
return "Container is not running"
|
return "Container is not running"
|
||||||
case ConsoleExists:
|
case ConsoleExists:
|
||||||
return "Console exists for process"
|
return "Console exists for process"
|
||||||
|
case ContainerNotPaused:
|
||||||
|
return "Container is not paused"
|
||||||
|
case NoProcessOps:
|
||||||
|
return "No process operations"
|
||||||
default:
|
default:
|
||||||
return "Unknown error"
|
return "Unknown error"
|
||||||
}
|
}
|
||||||
|
|
|
@ -166,7 +166,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
|
||||||
if err := os.MkdirAll(containerRoot, 0700); err != nil {
|
if err := os.MkdirAll(containerRoot, 0700); err != nil {
|
||||||
return nil, newGenericError(err, SystemError)
|
return nil, newGenericError(err, SystemError)
|
||||||
}
|
}
|
||||||
return &linuxContainer{
|
c := &linuxContainer{
|
||||||
id: id,
|
id: id,
|
||||||
root: containerRoot,
|
root: containerRoot,
|
||||||
config: config,
|
config: config,
|
||||||
|
@ -174,7 +174,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
|
||||||
initArgs: l.InitArgs,
|
initArgs: l.InitArgs,
|
||||||
criuPath: l.CriuPath,
|
criuPath: l.CriuPath,
|
||||||
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
|
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
|
||||||
}, nil
|
}
|
||||||
|
c.state = &stoppedState{c: c}
|
||||||
|
return c, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l *LinuxFactory) Load(id string) (Container, error) {
|
func (l *LinuxFactory) Load(id string) (Container, error) {
|
||||||
|
@ -191,7 +193,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
|
||||||
processStartTime: state.InitProcessStartTime,
|
processStartTime: state.InitProcessStartTime,
|
||||||
fds: state.ExternalDescriptors,
|
fds: state.ExternalDescriptors,
|
||||||
}
|
}
|
||||||
return &linuxContainer{
|
c := &linuxContainer{
|
||||||
initProcess: r,
|
initProcess: r,
|
||||||
id: id,
|
id: id,
|
||||||
config: &state.Config,
|
config: &state.Config,
|
||||||
|
@ -200,7 +202,13 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
|
||||||
criuPath: l.CriuPath,
|
criuPath: l.CriuPath,
|
||||||
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
|
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
|
||||||
root: containerRoot,
|
root: containerRoot,
|
||||||
}, nil
|
created: state.Created,
|
||||||
|
}
|
||||||
|
c.state = &createdState{c: c, s: Created}
|
||||||
|
if err := c.refreshState(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return c, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l *LinuxFactory) Type() string {
|
func (l *LinuxFactory) Type() string {
|
||||||
|
@ -222,18 +230,25 @@ func (l *LinuxFactory) StartInitialization() (err error) {
|
||||||
// clear the current process's environment to clean any libcontainer
|
// clear the current process's environment to clean any libcontainer
|
||||||
// specific env vars.
|
// specific env vars.
|
||||||
os.Clearenv()
|
os.Clearenv()
|
||||||
|
var i initer
|
||||||
defer func() {
|
defer func() {
|
||||||
// if we have an error during the initialization of the container's init then send it back to the
|
// We have an error during the initialization of the container's init,
|
||||||
// parent process in the form of an initError.
|
// send it back to the parent process in the form of an initError.
|
||||||
if err != nil {
|
// If container's init successed, syscall.Exec will not return, hence
|
||||||
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
|
// this defer function will never be called.
|
||||||
|
if _, ok := i.(*linuxStandardInit); ok {
|
||||||
|
// Synchronisation only necessary for standard init.
|
||||||
|
if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
// ensure that this pipe is always closed
|
// ensure that this pipe is always closed
|
||||||
pipe.Close()
|
pipe.Close()
|
||||||
}()
|
}()
|
||||||
i, err := newContainerInit(it, pipe)
|
i, err = newContainerInit(it, pipe)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,6 +9,18 @@ import (
|
||||||
"github.com/opencontainers/runc/libcontainer/stacktrace"
|
"github.com/opencontainers/runc/libcontainer/stacktrace"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// syncType identifies the kind of message exchanged over the synchronisation
// pipe between a container's init process and its parent.
type syncType uint8

const (
	// procReady is sent by the init process to tell the parent that it is
	// ready to exec the user process.
	procReady syncType = iota
	// procError is sent by a standard init, ahead of the serialized error,
	// when initialization fails.
	procError
	// procRun is the flag the init process expects back from the parent
	// before it goes ahead with the exec.
	procRun
)

// syncT is the JSON payload that carries a syncType across the pipe.
type syncT struct {
	Type syncType `json:"type"`
}
|
||||||
|
|
||||||
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
|
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
|
||||||
Code: {{.ECode}}
|
Code: {{.ECode}}
|
||||||
{{if .Message }}
|
{{if .Message }}
|
||||||
|
|
|
@ -5,6 +5,7 @@ package libcontainer
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"net"
|
"net"
|
||||||
"os"
|
"os"
|
||||||
|
@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
|
||||||
}, nil
|
}, nil
|
||||||
case initStandard:
|
case initStandard:
|
||||||
return &linuxStandardInit{
|
return &linuxStandardInit{
|
||||||
|
pipe: pipe,
|
||||||
parentPid: syscall.Getppid(),
|
parentPid: syscall.Getppid(),
|
||||||
config: config,
|
config: config,
|
||||||
}, nil
|
}, nil
|
||||||
|
@ -140,6 +142,27 @@ func finalizeNamespace(config *initConfig) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// syncParentReady sends to the given pipe a JSON payload which indicates that
|
||||||
|
// the init is ready to Exec the child process. It then waits for the parent to
|
||||||
|
// indicate that it is cleared to Exec.
|
||||||
|
func syncParentReady(pipe io.ReadWriter) error {
|
||||||
|
// Tell parent.
|
||||||
|
if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Wait for parent to give the all-clear.
|
||||||
|
var procSync syncT
|
||||||
|
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
|
||||||
|
if err == io.EOF {
|
||||||
|
return fmt.Errorf("parent closed synchronisation channel")
|
||||||
|
}
|
||||||
|
if procSync.Type != procRun {
|
||||||
|
return fmt.Errorf("invalid synchronisation flag from parent")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// joinExistingNamespaces gets all the namespace paths specified for the container and
|
// joinExistingNamespaces gets all the namespace paths specified for the container and
|
||||||
// does a setns on the namespace fd so that the current process joins the namespace.
|
// does a setns on the namespace fd so that the current process joins the namespace.
|
||||||
func joinExistingNamespaces(namespaces []configs.Namespace) error {
|
func joinExistingNamespaces(namespaces []configs.Namespace) error {
|
||||||
|
@ -309,7 +332,7 @@ func killCgroupProcesses(m cgroups.Manager) error {
|
||||||
if err := m.Freeze(configs.Frozen); err != nil {
|
if err := m.Freeze(configs.Frozen); err != nil {
|
||||||
logrus.Warn(err)
|
logrus.Warn(err)
|
||||||
}
|
}
|
||||||
pids, err := m.GetPids()
|
pids, err := m.GetAllPids()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
m.Freeze(configs.Thawed)
|
m.Freeze(configs.Thawed)
|
||||||
return err
|
return err
|
||||||
|
|
|
@ -0,0 +1,67 @@
|
||||||
|
// +build linux
|
||||||
|
|
||||||
|
package keyctl
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"syscall"
|
||||||
|
"strings"
|
||||||
|
"strconv"
|
||||||
|
"unsafe"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Operation codes passed as the first argument of the keyctl syscall by the
// functions in this package.
const (
	KEYCTL_JOIN_SESSION_KEYRING = 1
	KEYCTL_SETPERM              = 5
	KEYCTL_DESCRIBE             = 6
)

// KeySerial is the serial number the kernel uses to identify a key or keyring.
type KeySerial uint32
|
||||||
|
|
||||||
|
func JoinSessionKeyring(name string) (KeySerial, error) {
|
||||||
|
var _name *byte = nil
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if len(name) > 0 {
|
||||||
|
_name, err = syscall.BytePtrFromString(name)
|
||||||
|
if err != nil {
|
||||||
|
return KeySerial(0), err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sessKeyId, _, errn := syscall.Syscall(syscall.SYS_KEYCTL, KEYCTL_JOIN_SESSION_KEYRING, uintptr(unsafe.Pointer(_name)), 0)
|
||||||
|
if errn != 0 {
|
||||||
|
return 0, fmt.Errorf("could not create session key: %v", errn)
|
||||||
|
}
|
||||||
|
return KeySerial(sessKeyId), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// modify permissions on a keyring by reading the current permissions,
|
||||||
|
// anding the bits with the given mask (clearing permissions) and setting
|
||||||
|
// additional permission bits
|
||||||
|
func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
|
||||||
|
dest := make([]byte, 1024)
|
||||||
|
destBytes := unsafe.Pointer(&dest[0])
|
||||||
|
|
||||||
|
if _, _, err := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(destBytes), uintptr(len(dest)), 0, 0); err != 0 {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
res := strings.Split(string(dest), ";")
|
||||||
|
if len(res) < 5 {
|
||||||
|
return fmt.Errorf("Destination buffer for key description is too small")
|
||||||
|
}
|
||||||
|
|
||||||
|
// parse permissions
|
||||||
|
perm64, err := strconv.ParseUint(res[3], 16, 32)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
perm := (uint32(perm64) & mask) | setbits
|
||||||
|
|
||||||
|
if _, _, err := syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(perm)); err != 0 {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
|
@ -12,31 +12,32 @@ import (
|
||||||
|
|
||||||
const oomCgroupName = "memory"
|
const oomCgroupName = "memory"
|
||||||
|
|
||||||
// notifyOnOOM returns channel on which you can expect event about OOM,
|
type PressureLevel uint
|
||||||
// if process died without OOM this channel will be closed.
|
|
||||||
// s is current *libcontainer.State for container.
|
const (
|
||||||
func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
|
LowPressure PressureLevel = iota
|
||||||
dir := paths[oomCgroupName]
|
MediumPressure
|
||||||
if dir == "" {
|
CriticalPressure
|
||||||
return nil, fmt.Errorf("There is no path for %q in state", oomCgroupName)
|
)
|
||||||
}
|
|
||||||
oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control"))
|
func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
|
||||||
|
evFile, err := os.Open(filepath.Join(cgDir, evName))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0)
|
fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0)
|
||||||
if syserr != 0 {
|
if syserr != 0 {
|
||||||
oomControl.Close()
|
evFile.Close()
|
||||||
return nil, syserr
|
return nil, syserr
|
||||||
}
|
}
|
||||||
|
|
||||||
eventfd := os.NewFile(fd, "eventfd")
|
eventfd := os.NewFile(fd, "eventfd")
|
||||||
|
|
||||||
eventControlPath := filepath.Join(dir, "cgroup.event_control")
|
eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
|
||||||
data := fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd())
|
data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
|
||||||
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
|
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
|
||||||
eventfd.Close()
|
eventfd.Close()
|
||||||
oomControl.Close()
|
evFile.Close()
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
ch := make(chan struct{})
|
ch := make(chan struct{})
|
||||||
|
@ -44,7 +45,7 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
|
||||||
defer func() {
|
defer func() {
|
||||||
close(ch)
|
close(ch)
|
||||||
eventfd.Close()
|
eventfd.Close()
|
||||||
oomControl.Close()
|
evFile.Close()
|
||||||
}()
|
}()
|
||||||
buf := make([]byte, 8)
|
buf := make([]byte, 8)
|
||||||
for {
|
for {
|
||||||
|
@ -61,3 +62,28 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
|
||||||
}()
|
}()
|
||||||
return ch, nil
|
return ch, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// notifyOnOOM returns channel on which you can expect event about OOM,
|
||||||
|
// if process died without OOM this channel will be closed.
|
||||||
|
func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
|
||||||
|
dir := paths[oomCgroupName]
|
||||||
|
if dir == "" {
|
||||||
|
return nil, fmt.Errorf("path %q missing", oomCgroupName)
|
||||||
|
}
|
||||||
|
|
||||||
|
return registerMemoryEvent(dir, "memory.oom_control", "")
|
||||||
|
}
|
||||||
|
|
||||||
|
func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
|
||||||
|
dir := paths[oomCgroupName]
|
||||||
|
if dir == "" {
|
||||||
|
return nil, fmt.Errorf("path %q missing", oomCgroupName)
|
||||||
|
}
|
||||||
|
|
||||||
|
if level > CriticalPressure {
|
||||||
|
return nil, fmt.Errorf("invalid pressure level %d", level)
|
||||||
|
}
|
||||||
|
|
||||||
|
levelStr := []string{"low", "medium", "critical"}[level]
|
||||||
|
return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
|
||||||
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
|
|
||||||
|
#include <bits/sockaddr.h>
|
||||||
#include <linux/netlink.h>
|
#include <linux/netlink.h>
|
||||||
#include <linux/types.h>
|
#include <linux/types.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
|
@ -55,7 +55,7 @@ type Process struct {
|
||||||
// Wait releases any resources associated with the Process
|
// Wait releases any resources associated with the Process
|
||||||
func (p Process) Wait() (*os.ProcessState, error) {
|
func (p Process) Wait() (*os.ProcessState, error) {
|
||||||
if p.ops == nil {
|
if p.ops == nil {
|
||||||
return nil, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
|
return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
|
||||||
}
|
}
|
||||||
return p.ops.wait()
|
return p.ops.wait()
|
||||||
}
|
}
|
||||||
|
@ -65,7 +65,7 @@ func (p Process) Pid() (int, error) {
|
||||||
// math.MinInt32 is returned here, because it's invalid value
|
// math.MinInt32 is returned here, because it's invalid value
|
||||||
// for the kill() system call.
|
// for the kill() system call.
|
||||||
if p.ops == nil {
|
if p.ops == nil {
|
||||||
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
|
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
|
||||||
}
|
}
|
||||||
return p.ops.pid(), nil
|
return p.ops.pid(), nil
|
||||||
}
|
}
|
||||||
|
@ -73,7 +73,7 @@ func (p Process) Pid() (int, error) {
|
||||||
// Signal sends a signal to the Process.
|
// Signal sends a signal to the Process.
|
||||||
func (p Process) Signal(sig os.Signal) error {
|
func (p Process) Signal(sig os.Signal) error {
|
||||||
if p.ops == nil {
|
if p.ops == nil {
|
||||||
return newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
|
return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
|
||||||
}
|
}
|
||||||
return p.ops.signal(sig)
|
return p.ops.signal(sig)
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,6 +5,7 @@ package libcontainer
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
@ -87,6 +88,7 @@ func (p *setnsProcess) start() (err error) {
|
||||||
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
|
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
|
||||||
return newSystemError(err)
|
return newSystemError(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
|
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
|
||||||
return newSystemError(err)
|
return newSystemError(err)
|
||||||
}
|
}
|
||||||
|
@ -96,6 +98,7 @@ func (p *setnsProcess) start() (err error) {
|
||||||
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
|
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
|
||||||
return newSystemError(err)
|
return newSystemError(err)
|
||||||
}
|
}
|
||||||
|
// Must be done after Shutdown so the child will exit and we can wait for it.
|
||||||
if ierr != nil {
|
if ierr != nil {
|
||||||
p.wait()
|
p.wait()
|
||||||
return newSystemError(ierr)
|
return newSystemError(ierr)
|
||||||
|
@ -199,7 +202,6 @@ func (p *initProcess) start() (err error) {
|
||||||
return newSystemError(err)
|
return newSystemError(err)
|
||||||
}
|
}
|
||||||
p.setExternalDescriptors(fds)
|
p.setExternalDescriptors(fds)
|
||||||
|
|
||||||
// Do this before syncing with child so that no children
|
// Do this before syncing with child so that no children
|
||||||
// can escape the cgroup
|
// can escape the cgroup
|
||||||
if err := p.manager.Apply(p.pid()); err != nil {
|
if err := p.manager.Apply(p.pid()); err != nil {
|
||||||
|
@ -230,13 +232,54 @@ func (p *initProcess) start() (err error) {
|
||||||
if err := p.sendConfig(); err != nil {
|
if err := p.sendConfig(); err != nil {
|
||||||
return newSystemError(err)
|
return newSystemError(err)
|
||||||
}
|
}
|
||||||
// wait for the child process to fully complete and receive an error message
|
var (
|
||||||
// if one was encoutered
|
procSync syncT
|
||||||
var ierr *genericError
|
sentRun bool
|
||||||
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
|
ierr *genericError
|
||||||
|
)
|
||||||
|
|
||||||
|
loop:
|
||||||
|
for {
|
||||||
|
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
|
||||||
|
if err == io.EOF {
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
return newSystemError(err)
|
||||||
|
}
|
||||||
|
switch procSync.Type {
|
||||||
|
case procReady:
|
||||||
|
if err := p.manager.Set(p.config.Config); err != nil {
|
||||||
|
return newSystemError(err)
|
||||||
|
}
|
||||||
|
// Sync with child.
|
||||||
|
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
|
||||||
|
return newSystemError(err)
|
||||||
|
}
|
||||||
|
sentRun = true
|
||||||
|
case procError:
|
||||||
|
// wait for the child process to fully complete and receive an error message
|
||||||
|
// if one was encountered
|
||||||
|
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
|
||||||
|
return newSystemError(err)
|
||||||
|
}
|
||||||
|
if ierr != nil {
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
// Programmer error.
|
||||||
|
panic("No error following JSON procError payload.")
|
||||||
|
default:
|
||||||
|
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !sentRun {
|
||||||
|
return newSystemError(fmt.Errorf("could not synchronise with container process"))
|
||||||
|
}
|
||||||
|
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
|
||||||
return newSystemError(err)
|
return newSystemError(err)
|
||||||
}
|
}
|
||||||
|
// Must be done after Shutdown so the child will exit and we can wait for it.
|
||||||
if ierr != nil {
|
if ierr != nil {
|
||||||
|
p.wait()
|
||||||
return newSystemError(ierr)
|
return newSystemError(ierr)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
@ -270,12 +313,10 @@ func (p *initProcess) startTime() (string, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *initProcess) sendConfig() error {
|
func (p *initProcess) sendConfig() error {
|
||||||
// send the state to the container's init process then shutdown writes for the parent
|
// send the config to the container's init process, we don't use JSON Encode
|
||||||
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
|
// here because there might be a problem in JSON decoder in some cases, see:
|
||||||
return err
|
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
|
||||||
}
|
return utils.WriteJSON(p.parentPipe, p.config)
|
||||||
// shutdown writes for the parent side of the pipe
|
|
||||||
return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *initProcess) createNetworkInterfaces() error {
|
func (p *initProcess) createNetworkInterfaces() error {
|
||||||
|
|
|
@ -18,6 +18,8 @@ import (
|
||||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||||
"github.com/opencontainers/runc/libcontainer/configs"
|
"github.com/opencontainers/runc/libcontainer/configs"
|
||||||
"github.com/opencontainers/runc/libcontainer/label"
|
"github.com/opencontainers/runc/libcontainer/label"
|
||||||
|
"github.com/opencontainers/runc/libcontainer/system"
|
||||||
|
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
|
||||||
)
|
)
|
||||||
|
|
||||||
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
|
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
|
||||||
|
@ -293,12 +295,31 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
|
||||||
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
|
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
|
||||||
// dest is required to be an abs path and have any symlinks resolved before calling this function.
|
// dest is required to be an abs path and have any symlinks resolved before calling this function.
|
||||||
func checkMountDestination(rootfs, dest string) error {
|
func checkMountDestination(rootfs, dest string) error {
|
||||||
if filepath.Clean(rootfs) == filepath.Clean(dest) {
|
if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
|
||||||
return fmt.Errorf("mounting into / is prohibited")
|
return fmt.Errorf("mounting into / is prohibited")
|
||||||
}
|
}
|
||||||
invalidDestinations := []string{
|
invalidDestinations := []string{
|
||||||
"/proc",
|
"/proc",
|
||||||
}
|
}
|
||||||
|
// White list, it should be sub directories of invalid destinations
|
||||||
|
validDestinations := []string{
|
||||||
|
// These entries can be bind mounted by files emulated by fuse,
|
||||||
|
// so commands like top, free displays stats in container.
|
||||||
|
"/proc/cpuinfo",
|
||||||
|
"/proc/diskstats",
|
||||||
|
"/proc/meminfo",
|
||||||
|
"/proc/stat",
|
||||||
|
"/proc/net/dev",
|
||||||
|
}
|
||||||
|
for _, valid := range validDestinations {
|
||||||
|
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if path == "." {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
for _, invalid := range invalidDestinations {
|
for _, invalid := range invalidDestinations {
|
||||||
path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest)
|
path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -321,7 +342,7 @@ func setupDevSymlinks(rootfs string) error {
|
||||||
// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
|
// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
|
||||||
// in /dev if it exists in /proc.
|
// in /dev if it exists in /proc.
|
||||||
if _, err := os.Stat("/proc/kcore"); err == nil {
|
if _, err := os.Stat("/proc/kcore"); err == nil {
|
||||||
links = append(links, [2]string{"/proc/kcore", "/dev/kcore"})
|
links = append(links, [2]string{"/proc/kcore", "/dev/core"})
|
||||||
}
|
}
|
||||||
for _, link := range links {
|
for _, link := range links {
|
||||||
var (
|
var (
|
||||||
|
@ -365,11 +386,12 @@ func reOpenDevNull() error {
|
||||||
|
|
||||||
// Create the device nodes in the container.
|
// Create the device nodes in the container.
|
||||||
func createDevices(config *configs.Config) error {
|
func createDevices(config *configs.Config) error {
|
||||||
|
useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
|
||||||
oldMask := syscall.Umask(0000)
|
oldMask := syscall.Umask(0000)
|
||||||
for _, node := range config.Devices {
|
for _, node := range config.Devices {
|
||||||
// containers running in a user namespace are not allowed to mknod
|
// containers running in a user namespace are not allowed to mknod
|
||||||
// devices so we can just bind mount it from the host.
|
// devices so we can just bind mount it from the host.
|
||||||
if err := createDeviceNode(config.Rootfs, node, config.Namespaces.Contains(configs.NEWUSER)); err != nil {
|
if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
|
||||||
syscall.Umask(oldMask)
|
syscall.Umask(oldMask)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -231,10 +231,14 @@ func ReserveLabel(scon string) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func selinuxEnforcePath() string {
|
||||||
|
return fmt.Sprintf("%s/enforce", selinuxPath)
|
||||||
|
}
|
||||||
|
|
||||||
func SelinuxGetEnforce() int {
|
func SelinuxGetEnforce() int {
|
||||||
var enforce int
|
var enforce int
|
||||||
|
|
||||||
enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath))
|
enforceS, err := readCon(selinuxEnforcePath())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
|
@ -246,6 +250,10 @@ func SelinuxGetEnforce() int {
|
||||||
return enforce
|
return enforce
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SelinuxSetEnforce(mode int) error {
|
||||||
|
return writeCon(selinuxEnforcePath(), fmt.Sprintf("%d", mode))
|
||||||
|
}
|
||||||
|
|
||||||
func SelinuxGetEnforceMode() int {
|
func SelinuxGetEnforceMode() int {
|
||||||
switch readConfig(selinuxTag) {
|
switch readConfig(selinuxTag) {
|
||||||
case "enforcing":
|
case "enforcing":
|
||||||
|
|
|
@ -6,6 +6,7 @@ import (
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
"github.com/opencontainers/runc/libcontainer/apparmor"
|
"github.com/opencontainers/runc/libcontainer/apparmor"
|
||||||
|
"github.com/opencontainers/runc/libcontainer/keys"
|
||||||
"github.com/opencontainers/runc/libcontainer/label"
|
"github.com/opencontainers/runc/libcontainer/label"
|
||||||
"github.com/opencontainers/runc/libcontainer/seccomp"
|
"github.com/opencontainers/runc/libcontainer/seccomp"
|
||||||
"github.com/opencontainers/runc/libcontainer/system"
|
"github.com/opencontainers/runc/libcontainer/system"
|
||||||
|
@ -18,12 +19,21 @@ type linuxSetnsInit struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l *linuxSetnsInit) Init() error {
|
func (l *linuxSetnsInit) Init() error {
|
||||||
|
// do not inherit the parent's session keyring
|
||||||
|
if _, err := keyctl.JoinSessionKeyring("_ses"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
if err := setupRlimits(l.config.Config); err != nil {
|
if err := setupRlimits(l.config.Config); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
if l.config.Config.NoNewPrivileges {
|
||||||
|
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
if l.config.Config.Seccomp != nil {
|
if l.config.Config.Seccomp != nil {
|
||||||
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
||||||
return err
|
return err
|
||||||
|
|
|
@ -3,22 +3,41 @@
|
||||||
package libcontainer
|
package libcontainer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
|
||||||
"github.com/opencontainers/runc/libcontainer/apparmor"
|
"github.com/opencontainers/runc/libcontainer/apparmor"
|
||||||
"github.com/opencontainers/runc/libcontainer/configs"
|
"github.com/opencontainers/runc/libcontainer/configs"
|
||||||
|
"github.com/opencontainers/runc/libcontainer/keys"
|
||||||
"github.com/opencontainers/runc/libcontainer/label"
|
"github.com/opencontainers/runc/libcontainer/label"
|
||||||
"github.com/opencontainers/runc/libcontainer/seccomp"
|
"github.com/opencontainers/runc/libcontainer/seccomp"
|
||||||
"github.com/opencontainers/runc/libcontainer/system"
|
"github.com/opencontainers/runc/libcontainer/system"
|
||||||
)
|
)
|
||||||
|
|
||||||
type linuxStandardInit struct {
|
type linuxStandardInit struct {
|
||||||
|
pipe io.ReadWriter
|
||||||
parentPid int
|
parentPid int
|
||||||
config *initConfig
|
config *initConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves, copying the value
|
||||||
|
// from the kernel.
|
||||||
|
const PR_SET_NO_NEW_PRIVS = 0x26
|
||||||
|
|
||||||
func (l *linuxStandardInit) Init() error {
|
func (l *linuxStandardInit) Init() error {
|
||||||
|
// do not inherit the parent's session keyring
|
||||||
|
sessKeyId, err := keyctl.JoinSessionKeyring("")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// make session keyring searchable
|
||||||
|
// without user ns we need 'UID' search permissions
|
||||||
|
// with user ns we need 'other' search permissions
|
||||||
|
if err := keyctl.ModKeyringPerm(sessKeyId, 0xffffffff, 0x080008); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
// join any namespaces via a path to the namespace fd if provided
|
// join any namespaces via a path to the namespace fd if provided
|
||||||
if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
|
if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -50,7 +69,6 @@ func (l *linuxStandardInit) Init() error {
|
||||||
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
label.Init()
|
label.Init()
|
||||||
// InitializeMountNamespace() can be executed only for a new mount namespace
|
// InitializeMountNamespace() can be executed only for a new mount namespace
|
||||||
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
||||||
|
@ -75,7 +93,6 @@ func (l *linuxStandardInit) Init() error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, path := range l.config.Config.ReadonlyPaths {
|
for _, path := range l.config.Config.ReadonlyPaths {
|
||||||
if err := remountReadonly(path); err != nil {
|
if err := remountReadonly(path); err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -90,6 +107,17 @@ func (l *linuxStandardInit) Init() error {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
if l.config.Config.NoNewPrivileges {
|
||||||
|
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Tell our parent that we're ready to Execv. This must be done before the
|
||||||
|
// Seccomp rules have been applied, because we need to be able to read and
|
||||||
|
// write to a socket.
|
||||||
|
if err := syncParentReady(l.pipe); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
if l.config.Config.Seccomp != nil {
|
if l.config.Config.Seccomp != nil {
|
||||||
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -109,5 +137,6 @@ func (l *linuxStandardInit) Init() error {
|
||||||
if syscall.Getppid() != l.parentPid {
|
if syscall.Getppid() != l.parentPid {
|
||||||
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
|
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
|
||||||
}
|
}
|
||||||
|
|
||||||
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
|
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,226 @@
|
||||||
|
// +build linux
|
||||||
|
|
||||||
|
package libcontainer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
|
||||||
|
"github.com/Sirupsen/logrus"
|
||||||
|
"github.com/opencontainers/runc/libcontainer/configs"
|
||||||
|
)
|
||||||
|
|
||||||
|
func newStateTransitionError(from, to containerState) error {
|
||||||
|
return &stateTransitionError{
|
||||||
|
From: from.status().String(),
|
||||||
|
To: to.status().String(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// stateTransitionError is the error returned when a container is asked to move
// between two states and that move is not permitted. From and To hold the names
// of the source and destination states.
type stateTransitionError struct {
	From, To string
}

// Error implements the error interface.
func (e *stateTransitionError) Error() string {
	msg := fmt.Sprintf("invalid state transition from %s to %s", e.From, e.To)
	return msg
}
|
||||||
|
|
||||||
|
type containerState interface {
|
||||||
|
transition(containerState) error
|
||||||
|
destroy() error
|
||||||
|
status() Status
|
||||||
|
}
|
||||||
|
|
||||||
|
func destroy(c *linuxContainer) error {
|
||||||
|
if !c.config.Namespaces.Contains(configs.NEWPID) {
|
||||||
|
if err := killCgroupProcesses(c.cgroupManager); err != nil {
|
||||||
|
logrus.Warn(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
err := c.cgroupManager.Destroy()
|
||||||
|
if rerr := os.RemoveAll(c.root); err == nil {
|
||||||
|
err = rerr
|
||||||
|
}
|
||||||
|
c.initProcess = nil
|
||||||
|
if herr := runPoststopHooks(c); err == nil {
|
||||||
|
err = herr
|
||||||
|
}
|
||||||
|
c.state = &stoppedState{c: c}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func runPoststopHooks(c *linuxContainer) error {
|
||||||
|
if c.config.Hooks != nil {
|
||||||
|
s := configs.HookState{
|
||||||
|
Version: c.config.Version,
|
||||||
|
ID: c.id,
|
||||||
|
Root: c.config.Rootfs,
|
||||||
|
}
|
||||||
|
for _, hook := range c.config.Hooks.Poststop {
|
||||||
|
if err := hook.Run(s); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// stoppedState represents a container is a stopped/destroyed state.
|
||||||
|
type stoppedState struct {
|
||||||
|
c *linuxContainer
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *stoppedState) status() Status {
|
||||||
|
return Destroyed
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *stoppedState) transition(s containerState) error {
|
||||||
|
switch s.(type) {
|
||||||
|
case *runningState:
|
||||||
|
b.c.state = s
|
||||||
|
return nil
|
||||||
|
case *restoredState:
|
||||||
|
b.c.state = s
|
||||||
|
return nil
|
||||||
|
case *stoppedState:
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return newStateTransitionError(b, s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *stoppedState) destroy() error {
|
||||||
|
return destroy(b.c)
|
||||||
|
}
|
||||||
|
|
||||||
|
// runningState represents a container that is currently running.
|
||||||
|
type runningState struct {
|
||||||
|
c *linuxContainer
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *runningState) status() Status {
|
||||||
|
return Running
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *runningState) transition(s containerState) error {
|
||||||
|
switch s.(type) {
|
||||||
|
case *stoppedState:
|
||||||
|
running, err := r.c.isRunning()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if running {
|
||||||
|
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
|
||||||
|
}
|
||||||
|
r.c.state = s
|
||||||
|
return nil
|
||||||
|
case *pausedState:
|
||||||
|
r.c.state = s
|
||||||
|
return nil
|
||||||
|
case *runningState:
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return newStateTransitionError(r, s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *runningState) destroy() error {
|
||||||
|
running, err := r.c.isRunning()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if running {
|
||||||
|
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
|
||||||
|
}
|
||||||
|
return destroy(r.c)
|
||||||
|
}
|
||||||
|
|
||||||
|
// pausedState represents a container that is currently pause. It cannot be destroyed in a
|
||||||
|
// paused state and must transition back to running first.
|
||||||
|
type pausedState struct {
|
||||||
|
c *linuxContainer
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pausedState) status() Status {
|
||||||
|
return Paused
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pausedState) transition(s containerState) error {
|
||||||
|
switch s.(type) {
|
||||||
|
case *runningState, *stoppedState:
|
||||||
|
p.c.state = s
|
||||||
|
return nil
|
||||||
|
case *pausedState:
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return newStateTransitionError(p, s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pausedState) destroy() error {
|
||||||
|
isRunning, err := p.c.isRunning()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !isRunning {
|
||||||
|
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return destroy(p.c)
|
||||||
|
}
|
||||||
|
return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
|
||||||
|
}
|
||||||
|
|
||||||
|
// restoredState is the same as the running state but also has accociated checkpoint
|
||||||
|
// information that maybe need destroyed when the container is stopped and destory is called.
|
||||||
|
type restoredState struct {
|
||||||
|
imageDir string
|
||||||
|
c *linuxContainer
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *restoredState) status() Status {
|
||||||
|
return Running
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *restoredState) transition(s containerState) error {
|
||||||
|
switch s.(type) {
|
||||||
|
case *stoppedState:
|
||||||
|
return nil
|
||||||
|
case *runningState:
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return newStateTransitionError(r, s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *restoredState) destroy() error {
|
||||||
|
if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
|
||||||
|
if !os.IsNotExist(err) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return destroy(r.c)
|
||||||
|
}
|
||||||
|
|
||||||
|
// createdState is used whenever a container is restored, loaded, or has additional
|
||||||
|
// processes set up inside it, and it should not be destroyed when it is exiting.
|
||||||
|
type createdState struct {
|
||||||
|
c *linuxContainer // container this state belongs to; transition() stores the next state in c.state
|
||||||
|
s Status // status value returned verbatim by status()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (n *createdState) status() Status {
|
||||||
|
return n.s
|
||||||
|
}
|
||||||
|
|
||||||
|
func (n *createdState) transition(s containerState) error {
|
||||||
|
n.c.state = s
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (n *createdState) destroy() error {
|
||||||
|
if err := n.c.refreshState(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return n.c.state.destroy()
|
||||||
|
}
|
|
@ -3,6 +3,9 @@
|
||||||
package system
|
package system
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"syscall"
|
"syscall"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
@ -75,3 +78,45 @@ func Setctty() error {
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RunningInUserNS reports whether the current process appears to be running
// inside a user namespace.
//
// The decision is based on the first line of /proc/self/uid_map. If the file
// cannot be opened or its first line cannot be read, false is returned. A
// mapping of the full range (4294967295 uids starting at uid 0 on both sides)
// is taken to mean the initial user namespace, so false is returned; anything
// else returns true.
//
// Copied from github.com/lxc/lxd/shared/util.go
func RunningInUserNS() bool {
	uidMap, err := os.Open("/proc/self/uid_map")
	if err != nil {
		// This kernel-provided file only exists if user namespaces are
		// supported.
		return false
	}
	defer uidMap.Close()

	firstLine, _, err := bufio.NewReader(uidMap).ReadLine()
	if err != nil {
		return false
	}

	var insideID, outsideID, length int64
	// The scan result is not checked: fields that fail to parse stay zero, and
	// such a line does not match the full-range mapping below, so it is
	// reported as being inside a user namespace.
	fmt.Sscanf(string(firstLine), "%d %d %d", &insideID, &outsideID, &length)

	// We assume we are in the initial user namespace if we have a full
	// range - 4294967295 uids starting at uid 0.
	fullRange := insideID == 0 && outsideID == 0 && length == 4294967295
	return !fullRange
}
|
||||||
|
|
||||||
|
// Prctl issues the prctl system call with the given option and arguments.
//
// The value returned by the kernel is discarded; only failure is reported. A
// non-zero errno from the call is returned as the error, otherwise nil.
func Prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
	if _, _, errno := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0); errno != 0 {
		return errno
	}
	return nil
}
|
||||||
|
|
|
@ -5,6 +5,7 @@ import (
|
||||||
"encoding/hex"
|
"encoding/hex"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"io"
|
"io"
|
||||||
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"syscall"
|
"syscall"
|
||||||
)
|
)
|
||||||
|
@ -54,3 +55,32 @@ func WriteJSON(w io.Writer, v interface{}) error {
|
||||||
_, err = w.Write(data)
|
_, err = w.Write(data)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CleanPath lexically sanitises path so that it is safe to hand to
// filepath.Join as a suffix.
//
// An empty path is returned unchanged. An absolute path is simply cleaned and
// stays absolute. A relative path is cleaned, re-rooted at '/', cleaned again
// so that any leading ".." components are dropped, and then made relative once
// more; the result can therefore never climb out of a directory it is joined
// onto. Everything here is purely lexical, so paths that include symlinks are
// not made safe by CleanPath.
func CleanPath(path string) string {
	// Deal with empty strings nicely.
	if path == "" {
		return ""
	}

	// Clean first, so that problematic inputs such as "/../../../../../" are
	// normalised before anything else looks at them.
	cleaned := filepath.Clean(path)

	// Absolute paths must stay absolute; filepath.Clean is idempotent, so the
	// cleaned value is already final.
	if filepath.IsAbs(cleaned) {
		return cleaned
	}

	// Relative paths such as "../../../../<etc>/some/path" are anchored at the
	// root so that the ".." components collapse away.
	root := string(os.PathSeparator)
	anchored := filepath.Clean(root + cleaned)

	// This can't fail, as (by definition) all paths are relative to root.
	relative, _ := filepath.Rel(root, anchored)

	// Clean the path again for good measure.
	return filepath.Clean(relative)
}
|
||||||
|
|
Loading…
Reference in New Issue