diff --git a/MAINTAINERS b/MAINTAINERS index 4a6c0ec22c..895fba563a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6,4 +6,4 @@ Michael Crosby (@crosbymichael) api.go: Victor Vieux (@vieux) Dockerfile: Tianon Gravi (@tianon) Makefile: Tianon Gravi (@tianon) -Vagrantfile: Daniel Mizyrycki (@mzdaniel) +Vagrantfile: Cristian Staretu (@unclejack) diff --git a/Makefile b/Makefile index 275f9dc84c..168707a80f 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all binary build cross default docs docs-build docs-shell shell test +.PHONY: all binary build cross default docs docs-build docs-shell shell test test-integration GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD) DOCKER_IMAGE := docker:$(GIT_BRANCH) @@ -25,6 +25,9 @@ docs-shell: docs-build test: build $(DOCKER_RUN_DOCKER) hack/make.sh test test-integration +test-integration: build + $(DOCKER_RUN_DOCKER) hack/make.sh test-integration + shell: build $(DOCKER_RUN_DOCKER) bash diff --git a/Vagrantfile b/Vagrantfile index c130587829..f709031fdf 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -8,10 +8,9 @@ AWS_BOX_URI = ENV['BOX_URI'] || "https://github.com/mitchellh/vagrant-aws/raw/ma AWS_REGION = ENV['AWS_REGION'] || "us-east-1" AWS_AMI = ENV['AWS_AMI'] || "ami-69f5a900" AWS_INSTANCE_TYPE = ENV['AWS_INSTANCE_TYPE'] || 't1.micro' - FORWARD_DOCKER_PORTS = ENV['FORWARD_DOCKER_PORTS'] - -SSH_PRIVKEY_PATH = ENV["SSH_PRIVKEY_PATH"] +SSH_PRIVKEY_PATH = ENV['SSH_PRIVKEY_PATH'] +PRIVATE_NETWORK = ENV['PRIVATE_NETWORK'] # A script to upgrade from the 12.04 kernel to the raring backport kernel (3.8) # and install docker. @@ -174,3 +173,14 @@ if !FORWARD_DOCKER_PORTS.nil? end end end + +if !PRIVATE_NETWORK.nil? 
+ Vagrant::VERSION < "1.1.0" and Vagrant::Config.run do |config| + config.vm.network :hostonly, PRIVATE_NETWORK + end + + Vagrant::VERSION >= "1.1.0" and Vagrant.configure("2") do |config| + config.vm.network "private_network", ip: PRIVATE_NETWORK + end +end + diff --git a/commands.go b/commands.go index 981ae3d0b8..3fda195e17 100644 --- a/commands.go +++ b/commands.go @@ -1678,7 +1678,7 @@ func (cli *DockerCli) CmdSearch(args ...string) error { v := url.Values{} v.Set("term", cmd.Arg(0)) - body, _, err := readBody(cli.call("GET", "/images/search?"+v.Encode(), nil, false)) + body, _, err := readBody(cli.call("GET", "/images/search?"+v.Encode(), nil, true)) if err != nil { return err diff --git a/config.go b/config.go index aad5d50fc0..cb7e985ca2 100644 --- a/config.go +++ b/config.go @@ -23,29 +23,31 @@ type DaemonConfig struct { // ConfigFromJob creates and returns a new DaemonConfig object // by parsing the contents of a job's environment. -func ConfigFromJob(job *engine.Job) *DaemonConfig { - var config DaemonConfig - config.Pidfile = job.Getenv("Pidfile") - config.Root = job.Getenv("Root") - config.AutoRestart = job.GetenvBool("AutoRestart") +func DaemonConfigFromJob(job *engine.Job) *DaemonConfig { + config := &DaemonConfig{ + Pidfile: job.Getenv("Pidfile"), + Root: job.Getenv("Root"), + AutoRestart: job.GetenvBool("AutoRestart"), + EnableIptables: job.GetenvBool("EnableIptables"), + EnableIpForward: job.GetenvBool("EnableIpForward"), + BridgeIp: job.Getenv("BridgeIp"), + DefaultIp: net.ParseIP(job.Getenv("DefaultIp")), + InterContainerCommunication: job.GetenvBool("InterContainerCommunication"), + GraphDriver: job.Getenv("GraphDriver"), + } if dns := job.GetenvList("Dns"); dns != nil { config.Dns = dns } - config.EnableIptables = job.GetenvBool("EnableIptables") - config.EnableIpForward = job.GetenvBool("EnableIpForward") if br := job.Getenv("BridgeIface"); br != "" { config.BridgeIface = br } else { config.BridgeIface = DefaultNetworkBridge } - 
config.BridgeIp = job.Getenv("BridgeIp") - config.DefaultIp = net.ParseIP(job.Getenv("DefaultIp")) - config.InterContainerCommunication = job.GetenvBool("InterContainerCommunication") - config.GraphDriver = job.Getenv("GraphDriver") - if mtu := job.GetenvInt("Mtu"); mtu != -1 { + if mtu := job.GetenvInt("Mtu"); mtu != 0 { config.Mtu = mtu } else { config.Mtu = DefaultNetworkMtu } - return &config + + return config } diff --git a/container.go b/container.go index c5df1f4b58..95e81e2063 100644 --- a/container.go +++ b/container.go @@ -104,6 +104,46 @@ type Config struct { NetworkDisabled bool } +func ContainerConfigFromJob(job *engine.Job) *Config { + config := &Config{ + Hostname: job.Getenv("Hostname"), + Domainname: job.Getenv("Domainname"), + User: job.Getenv("User"), + Memory: job.GetenvInt64("Memory"), + MemorySwap: job.GetenvInt64("MemorySwap"), + CpuShares: job.GetenvInt64("CpuShares"), + AttachStdin: job.GetenvBool("AttachStdin"), + AttachStdout: job.GetenvBool("AttachStdout"), + AttachStderr: job.GetenvBool("AttachStderr"), + Tty: job.GetenvBool("Tty"), + OpenStdin: job.GetenvBool("OpenStdin"), + StdinOnce: job.GetenvBool("StdinOnce"), + Image: job.Getenv("Image"), + VolumesFrom: job.Getenv("VolumesFrom"), + WorkingDir: job.Getenv("WorkingDir"), + NetworkDisabled: job.GetenvBool("NetworkDisabled"), + } + job.GetenvJson("ExposedPorts", &config.ExposedPorts) + job.GetenvJson("Volumes", &config.Volumes) + if PortSpecs := job.GetenvList("PortSpecs"); PortSpecs != nil { + config.PortSpecs = PortSpecs + } + if Env := job.GetenvList("Env"); Env != nil { + config.Env = Env + } + if Cmd := job.GetenvList("Cmd"); Cmd != nil { + config.Cmd = Cmd + } + if Dns := job.GetenvList("Dns"); Dns != nil { + config.Dns = Dns + } + if Entrypoint := job.GetenvList("Entrypoint"); Entrypoint != nil { + config.Entrypoint = Entrypoint + } + + return config +} + type HostConfig struct { Binds []string ContainerIDFile string @@ -114,6 +154,24 @@ type HostConfig struct { PublishAllPorts 
bool } +func ContainerHostConfigFromJob(job *engine.Job) *HostConfig { + hostConfig := &HostConfig{ + ContainerIDFile: job.Getenv("ContainerIDFile"), + Privileged: job.GetenvBool("Privileged"), + PublishAllPorts: job.GetenvBool("PublishAllPorts"), + } + job.GetenvJson("LxcConf", &hostConfig.LxcConf) + job.GetenvJson("PortBindings", &hostConfig.PortBindings) + if Binds := job.GetenvList("Binds"); Binds != nil { + hostConfig.Binds = Binds + } + if Links := job.GetenvList("Links"); Links != nil { + hostConfig.Links = Links + } + + return hostConfig +} + type BindMap struct { SrcPath string DstPath string diff --git a/contrib/init/systemd/docker.service b/contrib/init/systemd/docker.service index aae7b6daf9..387be2eb1c 100644 --- a/contrib/init/systemd/docker.service +++ b/contrib/init/systemd/docker.service @@ -1,11 +1,11 @@ [Unit] -Description=Docker Application Container Engine +Description=Docker Application Container Engine Documentation=http://docs.docker.io After=network.target [Service] -ExecStartPre=/bin/mount --make-rprivate / ExecStart=/usr/bin/docker -d +Restart=on-failure [Install] WantedBy=multi-user.target diff --git a/contrib/init/systemd/socket-activation/docker.service b/contrib/init/systemd/socket-activation/docker.service index 4ab92dfef8..c795f9c3b4 100644 --- a/contrib/init/systemd/socket-activation/docker.service +++ b/contrib/init/systemd/socket-activation/docker.service @@ -5,6 +5,7 @@ After=network.target [Service] ExecStart=/usr/bin/docker -d -H fd:// +Restart=on-failure [Install] WantedBy=multi-user.target diff --git a/contrib/mkimage-rinse.sh b/contrib/mkimage-rinse.sh index de9265d48c..dfe9999d92 100755 --- a/contrib/mkimage-rinse.sh +++ b/contrib/mkimage-rinse.sh @@ -1,4 +1,11 @@ #!/usr/bin/env bash +# +# Create a base CentOS Docker image. + +# This script is useful on systems with rinse available (e.g., +# building a CentOS image on Debian). See contrib/mkimage-yum.sh for +# a way to build CentOS images on systems with yum installed. 
+ set -e repo="$1" diff --git a/contrib/mkimage-yum.sh b/contrib/mkimage-yum.sh new file mode 100755 index 0000000000..54e99f1f04 --- /dev/null +++ b/contrib/mkimage-yum.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# +# Create a base CentOS Docker image. +# +# This script is useful on systems with yum installed (e.g., building +# a CentOS image on CentOS). See contrib/mkimage-rinse.sh for a way +# to build CentOS images on other systems. + +usage() { + cat < +OPTIONS: + -y The path to the yum config to install packages from. The + default is /etc/yum.conf. +EOOPTS + exit 1 +} + +# option defaults +yum_config=/etc/yum.conf +while getopts ":y:h" opt; do + case $opt in + y) + yum_config=$OPTARG + ;; + h) + usage + ;; + \?) + echo "Invalid option: -$OPTARG" + usage + ;; + esac +done +shift $((OPTIND - 1)) +name=$1 + +if [[ -z $name ]]; then + usage +fi + +#-------------------- + +target=$(mktemp -d --tmpdir $(basename $0).XXXXXX) + +set -x + +for dev in console null zero urandom; do + /sbin/MAKEDEV -d "$target"/dev -x $dev +done + +yum -c "$yum_config" --installroot="$target" --setopt=tsflags=nodocs \ + --setopt=group_package_types=mandatory -y groupinstall Core +yum -c "$yum_config" --installroot="$mount" -y clean all + +cat > "$target"/etc/sysconfig/network <&2 "warning: cannot autodetect OS version, using '$name' as tag" + version=$name +fi + +tar --numeric-owner -c -C "$target" . | docker import - $name:$version +docker run -i -t $name:$version echo success + +rm -rf "$target" diff --git a/docs/sources/articles/baseimages.rst b/docs/sources/articles/baseimages.rst index 68251bedd3..6fd1823f8d 100644 --- a/docs/sources/articles/baseimages.rst +++ b/docs/sources/articles/baseimages.rst @@ -37,7 +37,10 @@ There are more example scripts for creating base images in the Docker GitHub Repo: * `BusyBox `_ -* `CentOS / Scientific Linux CERN (SLC) +* CentOS / Scientific Linux CERN (SLC) `on Debian/Ubuntu `_ + or + `on CentOS/RHEL/SLC/etc. 
+ `_ * `Debian / Ubuntu `_ diff --git a/docs/sources/articles/index.rst b/docs/sources/articles/index.rst index 2cfc427420..75c0cd3fa9 100644 --- a/docs/sources/articles/index.rst +++ b/docs/sources/articles/index.rst @@ -12,3 +12,4 @@ Articles security baseimages + runmetrics diff --git a/docs/sources/articles/runmetrics.rst b/docs/sources/articles/runmetrics.rst new file mode 100644 index 0000000000..afb7f82e39 --- /dev/null +++ b/docs/sources/articles/runmetrics.rst @@ -0,0 +1,463 @@ +:title: Runtime Metrics +:description: Measure the behavior of running containers +:keywords: docker, metrics, CPU, memory, disk, IO, run, runtime + +.. _run_metrics: + + +Runtime Metrics +=============== + +Linux Containers rely on `control groups +`_ which +not only track groups of processes, but also expose metrics about CPU, +memory, and block I/O usage. You can access those metrics and obtain +network usage metrics as well. This is relevant for "pure" LXC +containers, as well as for Docker containers. + +Control Groups +-------------- + +Control groups are exposed through a pseudo-filesystem. In recent +distros, you should find this filesystem under +``/sys/fs/cgroup``. Under that directory, you will see multiple +sub-directories, called devices, freezer, blkio, etc.; each +sub-directory actually corresponds to a different cgroup hierarchy. + +On older systems, the control groups might be mounted on ``/cgroup``, +without distinct hierarchies. In that case, instead of seeing the +sub-directories, you will see a bunch of files in that directory, and +possibly some directories corresponding to existing containers. + +To figure out where your control groups are mounted, you can run: + +:: + + grep cgroup /proc/mounts + +.. _run_findpid: + +Enumerating Cgroups +------------------- + +You can look into ``/proc/cgroups`` to see the different control group +subsystems known to the system, the hierarchy they belong to, and how +many groups they contain. 
+ +You can also look at ``/proc//cgroup`` to see which control +groups a process belongs to. The control group will be shown as a path +relative to the root of the hierarchy mountpoint; e.g. ``/`` means +“this process has not been assigned into a particular group”, while +``/lxc/pumpkin`` means that the process is likely to be a member of a +container named ``pumpkin``. + +Finding the Cgroup for a Given Container +---------------------------------------- + +For each container, one cgroup will be created in each hierarchy. On +older systems with older versions of the LXC userland tools, the name +of the cgroup will be the name of the container. With more recent +versions of the LXC tools, the cgroup will be ``lxc/.`` + +For Docker containers using cgroups, the container name will be the +full ID or long ID of the container. If a container shows up as +ae836c95b4c3 in ``docker ps``, its long ID might be something like +``ae836c95b4c3c9e9179e0e91015512da89fdec91612f63cebae57df9a5444c79``. You +can look it up with ``docker inspect`` or ``docker ps -notrunc``. + +Putting everything together to look at the memory metrics for a Docker +container, take a look at ``/sys/fs/cgroup/memory/lxc//``. + +Metrics from Cgroups: Memory, CPU, Block IO +------------------------------------------- + +For each subsystem (memory, CPU, and block I/O), you will find one or +more pseudo-files containing statistics. + +Memory Metrics: ``memory.stat`` +............................... + +Memory metrics are found in the "memory" cgroup. Note that the memory +control group adds a little overhead, because it does very +fine-grained accounting of the memory usage on your host. Therefore, +many distros chose to not enable it by default. Generally, to enable +it, all you have to do is to add some kernel command-line parameters: +``cgroup_enable=memory swapaccount=1``. + +The metrics are in the pseudo-file ``memory.stat``. 
Here is what it +will look like: + +:: + + cache 11492564992 + rss 1930993664 + mapped_file 306728960 + pgpgin 406632648 + pgpgout 403355412 + swap 0 + pgfault 728281223 + pgmajfault 1724 + inactive_anon 46608384 + active_anon 1884520448 + inactive_file 7003344896 + active_file 4489052160 + unevictable 32768 + hierarchical_memory_limit 9223372036854775807 + hierarchical_memsw_limit 9223372036854775807 + total_cache 11492564992 + total_rss 1930993664 + total_mapped_file 306728960 + total_pgpgin 406632648 + total_pgpgout 403355412 + total_swap 0 + total_pgfault 728281223 + total_pgmajfault 1724 + total_inactive_anon 46608384 + total_active_anon 1884520448 + total_inactive_file 7003344896 + total_active_file 4489052160 + total_unevictable 32768 + +The first half (without the ``total_`` prefix) contains statistics +relevant to the processes within the cgroup, excluding +sub-cgroups. The second half (with the ``total_`` prefix) includes +sub-cgroups as well. + +Some metrics are "gauges", i.e. values that can increase or decrease +(e.g. swap, the amount of swap space used by the members of the +cgroup). Some others are "counters", i.e. values that can only go up, +because they represent occurrences of a specific event (e.g. pgfault, +which indicates the number of page faults which happened since the +creation of the cgroup; this number can never decrease). + +cache + the amount of memory used by the processes of this control group + that can be associated precisely with a block on a block + device. When you read from and write to files on disk, this amount + will increase. This will be the case if you use "conventional" I/O + (``open``, ``read``, ``write`` syscalls) as well as mapped files + (with ``mmap``). It also accounts for the memory used by ``tmpfs`` + mounts, though the reasons are unclear. + +rss + the amount of memory that *doesn't* correspond to anything on + disk: stacks, heaps, and anonymous memory maps. 
+ +mapped_file + indicates the amount of memory mapped by the processes in the + control group. It doesn't give you information about *how much* + memory is used; it rather tells you *how* it is used. + +pgfault and pgmajfault + indicate the number of times that a process of the cgroup triggered + a "page fault" and a "major fault", respectively. A page fault + happens when a process accesses a part of its virtual memory space + which is nonexistent or protected. The former can happen if the + process is buggy and tries to access an invalid address (it will + then be sent a ``SIGSEGV`` signal, typically killing it with the + famous ``Segmentation fault`` message). The latter can happen when + the process reads from a memory zone which has been swapped out, or + which corresponds to a mapped file: in that case, the kernel will + load the page from disk, and let the CPU complete the memory + access. It can also happen when the process writes to a + copy-on-write memory zone: likewise, the kernel will preempt the + process, duplicate the memory page, and resume the write operation + on the process' own copy of the page. "Major" faults happen when the + kernel actually has to read the data from disk. When it just has to + duplicate an existing page, or allocate an empty page, it's a + regular (or "minor") fault. + +swap + the amount of swap currently used by the processes in this cgroup. + +active_anon and inactive_anon + the amount of *anonymous* memory that has been identified as + respectively *active* and *inactive* by the kernel. "Anonymous" + memory is the memory that is *not* linked to disk pages. In other + words, that's the equivalent of the rss counter described above. In + fact, the very definition of the rss counter is **active_anon** + + **inactive_anon** - **tmpfs** (where tmpfs is the amount of memory + used up by ``tmpfs`` filesystems mounted by this control + group). Now, what's the difference between "active" and "inactive"? 
+ Pages are initially "active"; and at regular intervals, the kernel + sweeps over the memory, and tags some pages as "inactive". Whenever + they are accessed again, they are immediately retagged + "active". When the kernel is almost out of memory, and time comes to + swap out to disk, the kernel will swap "inactive" pages. + +active_file and inactive_file + cache memory, with *active* and *inactive* similar to the *anon* + memory above. The exact formula is cache = **active_file** + + **inactive_file** + **tmpfs**. The exact rules used by the kernel to + move memory pages between active and inactive sets are different + from the ones used for anonymous memory, but the general principle + is the same. Note that when the kernel needs to reclaim memory, it + is cheaper to reclaim a clean (=non modified) page from this pool, + since it can be reclaimed immediately (while anonymous pages and + dirty/modified pages have to be written to disk first). + +unevictable + the amount of memory that cannot be reclaimed; generally, it will + account for memory that has been "locked" with ``mlock``. It is + often used by crypto frameworks to make sure that secret keys and + other sensitive material never gets swapped out to disk. + +memory and memsw limits + These are not really metrics, but a reminder of the limits applied + to this cgroup. The first one indicates the maximum amount of + physical memory that can be used by the processes of this control + group; the second one indicates the maximum amount of RAM+swap. + +Accounting for memory in the page cache is very complex. If two +processes in different control groups both read the same file +(ultimately relying on the same blocks on disk), the corresponding +memory charge will be split between the control groups. It's nice, but +it also means that when a cgroup is terminated, it could increase the +memory usage of another cgroup, because they are not splitting the +cost anymore for those memory pages. 
+ +CPU metrics: ``cpuacct.stat`` +............................. + +Now that we've covered memory metrics, everything else will look very +simple in comparison. CPU metrics will be found in the ``cpuacct`` +controller. + +For each container, you will find a pseudo-file ``cpuacct.stat``, +containing the CPU usage accumulated by the processes of the +container, broken down between ``user`` and ``system`` time. If you're +not familiar with the distinction, ``user`` is the time during which +the processes were in direct control of the CPU (i.e. executing +process code), and ``system`` is the time during which the CPU was +executing system calls on behalf of those processes. + +Those times are expressed in ticks of 1/100th of a second. Actually, +they are expressed in "user jiffies". There are ``USER_HZ`` +*"jiffies"* per second, and on x86 systems, ``USER_HZ`` is 100. This +used to map exactly to the number of scheduler "ticks" per second; but +with the advent of higher frequency scheduling, as well as `tickless +kernels `_, the number of kernel +ticks wasn't relevant anymore. It stuck around anyway, mainly for +legacy and compatibility reasons. + +Block I/O metrics +................. + +Block I/O is accounted in the ``blkio`` controller. Different metrics +are scattered across different files. While you can find in-depth +details in the `blkio-controller +`_ +file in the kernel documentation, here is a short list of the most +relevant ones: + +blkio.sectors + contain the number of 512-bytes sectors read and written by the + processes member of the cgroup, device by device. Reads and writes + are merged in a single counter. + +blkio.io_service_bytes + indicates the number of bytes read and written by the cgroup. It has + 4 counters per device, because for each device, it differentiates + between synchronous vs. asynchronous I/O, and reads vs. writes. + +blkio.io_serviced + the number of I/O operations performed, regardless of their size. 
It + also has 4 counters per device. + +blkio.io_queued + indicates the number of I/O operations currently queued for this + cgroup. In other words, if the cgroup isn't doing any I/O, this will + be zero. Note that the opposite is not true. In other words, if + there is no I/O queued, it does not mean that the cgroup is idle + (I/O-wise). It could be doing purely synchronous reads on an + otherwise quiescent device, which is therefore able to handle them + immediately, without queuing. Also, while it is helpful to figure + out which cgroup is putting stress on the I/O subsystem, keep in + mind that it is a relative quantity. Even if a process group does + not perform more I/O, its queue size can increase just because the + device load increases because of other devices. + +Network Metrics +--------------- + +Network metrics are not exposed directly by control groups. There is a +good explanation for that: network interfaces exist within the context +of *network namespaces*. The kernel could probably accumulate metrics +about packets and bytes sent and received by a group of processes, but +those metrics wouldn't be very useful. You want per-interface metrics +(because traffic happening on the local ``lo`` interface doesn't +really count). But since processes in a single cgroup can belong to +multiple network namespaces, those metrics would be harder to +interpret: multiple network namespaces means multiple ``lo`` +interfaces, potentially multiple ``eth0`` interfaces, etc.; so this is +why there is no easy way to gather network metrics with control +groups. + +Instead we can gather network metrics from other sources: + +IPtables +........ + +IPtables (or rather, the netfilter framework for which iptables is +just an interface) can do some serious accounting. 
+ +For instance, you can setup a rule to account for the outbound HTTP +traffic on a web server: + +:: + + iptables -I OUTPUT -p tcp --sport 80 + + +There is no ``-j`` or ``-g`` flag, so the rule will just count matched +packets and go to the following rule. + +Later, you can check the values of the counters, with: + +:: + + iptables -nxvL OUTPUT + +Technically, ``-n`` is not required, but it will prevent iptables from +doing DNS reverse lookups, which are probably useless in this +scenario. + +Counters include packets and bytes. If you want to setup metrics for +container traffic like this, you could execute a ``for`` loop to add +two ``iptables`` rules per container IP address (one in each +direction), in the ``FORWARD`` chain. This will only meter traffic +going through the NAT layer; you will also have to add traffic going +through the userland proxy. + +Then, you will need to check those counters on a regular basis. If you +happen to use ``collectd``, there is a nice plugin to automate +iptables counters collection. + +Interface-level counters +........................ + +Since each container has a virtual Ethernet interface, you might want +to check directly the TX and RX counters of this interface. You will +notice that each container is associated to a virtual Ethernet +interface in your host, with a name like ``vethKk8Zqi``. Figuring out +which interface corresponds to which container is, unfortunately, +difficult. + +But for now, the best way is to check the metrics *from within the +containers*. To accomplish this, you can run an executable from the +host environment within the network namespace of a container using +**ip-netns magic**. + +The ``ip-netns exec`` command will let you execute any program +(present in the host system) within any network namespace visible to +the current process. 
This means that your host will be able to enter +the network namespace of your containers, but your containers won't be +able to access the host, nor their sibling containers. Containers will +be able to “see” and affect their sub-containers, though. + +The exact format of the command is:: + + ip netns exec + +For example:: + + ip netns exec mycontainer netstat -i + +``ip netns`` finds the "mycontainer" container by using namespaces +pseudo-files. Each process belongs to one network namespace, one PID +namespace, one ``mnt`` namespace, etc., and those namespaces are +materialized under ``/proc//ns/``. For example, the network +namespace of PID 42 is materialized by the pseudo-file +``/proc/42/ns/net``. + +When you run ``ip netns exec mycontainer ...``, it expects +``/var/run/netns/mycontainer`` to be one of those +pseudo-files. (Symlinks are accepted.) + +In other words, to execute a command within the network namespace of a +container, we need to: + +* Find out the PID of any process within the container that we want to + investigate; +* Create a symlink from ``/var/run/netns/`` to + ``/proc//ns/net`` +* Execute ``ip netns exec ....`` + +Please review :ref:`run_findpid` to learn how to find the cgroup of a +process running in the container of which you want to measure network +usage. From there, you can examine the pseudo-file named ``tasks``, +which contains the PIDs that are in the control group (i.e. in the +container). Pick any one of them. + +Putting everything together, if the "short ID" of a container is held +in the environment variable ``$CID``, then you can do this:: + + TASKS=/sys/fs/cgroup/devices/$CID*/tasks + PID=$(head -n 1 $TASKS) + mkdir -p /var/run/netns + ln -sf /proc/$PID/ns/net /var/run/netns/$CID + ip netns exec $CID netstat -i + + +Tips for high-performance metric collection +------------------------------------------- + +Note that running a new process each time you want to update metrics +is (relatively) expensive. 
If you want to collect metrics at high +resolutions, and/or over a large number of containers (think 1000 +containers on a single host), you do not want to fork a new process +each time. + +Here is how to collect metrics from a single process. You will have to +write your metric collector in C (or any language that lets you do +low-level system calls). You need to use a special system call, +``setns()``, which lets the current process enter any arbitrary +namespace. It requires, however, an open file descriptor to the +namespace pseudo-file (remember: that’s the pseudo-file in +``/proc//ns/net``). + +However, there is a catch: you must not keep this file descriptor +open. If you do, when the last process of the control group exits, the +namespace will not be destroyed, and its network resources (like the +virtual interface of the container) will stay around for ever (or +until you close that file descriptor). + +The right approach would be to keep track of the first PID of each +container, and re-open the namespace pseudo-file each time. + +Collecting metrics when a container exits +----------------------------------------- + +Sometimes, you do not care about real time metric collection, but when +a container exits, you want to know how much CPU, memory, etc. it has +used. + +Docker makes this difficult because it relies on ``lxc-start``, which +carefully cleans up after itself, but it is still possible. It is +usually easier to collect metrics at regular intervals (e.g. every +minute, with the collectd LXC plugin) and rely on that instead. + +But, if you'd still like to gather the stats when a container stops, +here is how: + +For each container, start a collection process, and move it to the +control groups that you want to monitor by writing its PID to the +tasks file of the cgroup. The collection process should periodically +re-read the tasks file to check if it's the last process of the +control group. 
(If you also want to collect network statistics as +explained in the previous section, you should also move the process to +the appropriate network namespace.) + +When the container exits, ``lxc-start`` will try to delete the control +groups. It will fail, since the control group is still in use; but +that’s fine. Your process should now detect that it is the only one +remaining in the group. Now is the right time to collect all the +metrics you need! + +Finally, your process should move itself back to the root control +group, and remove the container control group. To remove a control +group, just ``rmdir`` its directory. It's counter-intuitive to +``rmdir`` a directory as it still contains files; but remember that +this is a pseudo-filesystem, so usual rules don't apply. After the +cleanup is done, the collection process can exit safely. + diff --git a/docs/sources/installation/ubuntulinux.rst b/docs/sources/installation/ubuntulinux.rst index d5e4a248ba..3d6ee6415d 100644 --- a/docs/sources/installation/ubuntulinux.rst +++ b/docs/sources/installation/ubuntulinux.rst @@ -217,6 +217,15 @@ To install the latest version of docker, use the standard ``apt-get`` method: # install the latest sudo apt-get install lxc-docker +Troubleshooting +^^^^^^^^^^^^^^^ + +On Linux Mint, the ``cgroups-lite`` package is not installed by default. +Before Docker will work correctly, you will need to install this via: + +.. code-block:: bash + + sudo apt-get update && sudo apt-get install cgroups-lite .. _ufw: @@ -224,7 +233,7 @@ Docker and UFW ^^^^^^^^^^^^^^ Docker uses a bridge to manage container networking. By default, UFW drops all -`forwarding` traffic. As a result will you need to enable UFW forwarding: +`forwarding` traffic. As a result you will need to enable UFW forwarding: .. 
code-block:: bash diff --git a/docs/sources/reference/builder.rst b/docs/sources/reference/builder.rst index 45cb2ab86e..9889660913 100644 --- a/docs/sources/reference/builder.rst +++ b/docs/sources/reference/builder.rst @@ -1,12 +1,12 @@ -:title: Build Images (Dockerfile Reference) +:title: Dockerfile Reference :description: Dockerfiles use a simple DSL which allows you to automate the steps you would normally manually take to create an image. :keywords: builder, docker, Dockerfile, automation, image creation .. _dockerbuilder: -=================================== -Build Images (Dockerfile Reference) -=================================== +==================== +Dockerfile Reference +==================== **Docker can act as a builder** and read instructions from a text ``Dockerfile`` to automate the steps you would otherwise take manually diff --git a/docs/sources/reference/commandline/cli.rst b/docs/sources/reference/commandline/cli.rst index 3d215cc0b4..491688027f 100644 --- a/docs/sources/reference/commandline/cli.rst +++ b/docs/sources/reference/commandline/cli.rst @@ -18,6 +18,45 @@ To list available commands, either run ``docker`` with no parameters or execute ... +.. _cli_options: + +Types of Options +---------------- + +Boolean +~~~~~~~ + +Boolean options look like ``-d=false``. The value you see is the +default value which gets set if you do **not** use the boolean +flag. If you do call ``run -d``, that sets the opposite boolean value, +so in this case, ``true``, and so ``docker run -d`` **will** run in +"detached" mode, in the background. Other boolean options are similar +-- specifying them will set the value to the opposite of the default +value. 
+ +Multi +~~~~~ + +Options like ``-a=[]`` indicate they can be specified multiple times:: + + docker run -a stdin -a stdout -a stderr -i -t ubuntu /bin/bash + +Sometimes this can use a more complex value string, as for ``-v``:: + + docker run -v /host:/container example/mysql + +Strings and Integers +~~~~~~~~~~~~~~~~~~~~ + +Options like ``-name=""`` expect a string, and they can only be +specified once. Options like ``-c=0`` expect an integer, and they can +only be specified once. + +---- + +Commands +-------- + .. _cli_daemon: ``daemon`` diff --git a/docs/sources/reference/index.rst b/docs/sources/reference/index.rst index 49099d5621..d35a19b93d 100644 --- a/docs/sources/reference/index.rst +++ b/docs/sources/reference/index.rst @@ -14,4 +14,5 @@ Contents: commandline/index builder + run api/index diff --git a/docs/sources/reference/run.rst b/docs/sources/reference/run.rst new file mode 100644 index 0000000000..307edace00 --- /dev/null +++ b/docs/sources/reference/run.rst @@ -0,0 +1,419 @@ +:title: Docker Run Reference +:description: Configure containers at runtime +:keywords: docker, run, configure, runtime + +.. _run_docker: + +==================== +Docker Run Reference +==================== + +**Docker runs processes in isolated containers**. When an operator +executes ``docker run``, she starts a process with its own file +system, its own networking, and its own isolated process tree. The +:ref:`image_def` which starts the process may define defaults related +to the binary to run, the networking to expose, and more, but ``docker +run`` gives final control to the operator who starts the container +from the image. That's the main reason :ref:`cli_run` has more options +than any other ``docker`` command. + +Every one of the :ref:`example_list` shows running containers, and so +here we try to give more in-depth guidance. + +.. contents:: Table of Contents + :depth: 2 + +.. 
_run_running: + +General Form +============ + +As you've seen in the :ref:`example_list`, the basic `run` command +takes this form:: + + docker run [OPTIONS] IMAGE[:TAG] [COMMAND] [ARG...] + +To learn how to interpret the types of ``[OPTIONS]``, see +:ref:`cli_options`. + +The list of ``[OPTIONS]`` breaks down into two groups: + +1. Settings exclusive to operators, including: + + * Detached or Foreground running, + * Container Identification, + * Network settings, and + * Runtime Constraints on CPU and Memory + * Privileges and LXC Configuration + +2. Settings shared between operators and developers, where operators + can override defaults developers set in images at build time. + +Together, the ``docker run [OPTIONS]`` give complete control over +runtime behavior to the operator, allowing them to override all +defaults set by the developer during ``docker build`` and nearly all +the defaults set by the Docker runtime itself. + +Operator Exclusive Options +========================== + +Only the operator (the person executing ``docker run``) can set the +following options. + +.. contents:: + :local: + +Detached vs Foreground +---------------------- + +When starting a Docker container, you must first decide if you want to +run the container in the background in a "detached" mode or in the +default foreground mode:: + + -d=false: Detached mode: Run container in the background, print new container id + +Detached (-d) +............. + +In detached mode (``-d=true`` or just ``-d``), all I/O should be done +through network connections or shared volumes because the container is +no longer listening to the commandline where you executed ``docker +run``. You can reattach to a detached container with ``docker`` +:ref:`cli_attach`. If you choose to run a container in the detached +mode, then you cannot use the ``-rm`` option. + +Foreground +.......... 
+ +In foreground mode (the default when ``-d`` is not specified), +``docker run`` can start the process in the container and attach the +console to the process's standard input, output, and standard +error. It can even pretend to be a TTY (this is what most commandline +executables expect) and pass along signals. All of that is +configurable:: + + -a=[] : Attach to ``stdin``, ``stdout`` and/or ``stderr`` + -t=false : Allocate a pseudo-tty + -sig-proxy=true: Proxify all received signal to the process (even in non-tty mode) + -i=false : Keep STDIN open even if not attached + +If you do not specify ``-a`` then Docker will `attach everything +(stdin,stdout,stderr) +`_. You +can specify to which of the three standard streams (``stdin``, ``stdout``, +``stderr``) you'd like to connect instead, as in:: + + docker run -a stdin -a stdout -i -t ubuntu /bin/bash + +For interactive processes (like a shell) you will typically want a tty +as well as persistent standard input (``stdin``), so you'll use ``-i +-t`` together in most interactive cases. + +Container Identification +------------------------ + +Name (-name) +............ + +The operator can identify a container in three ways: + +* UUID long identifier ("f78375b1c487e03c9438c729345e54db9d20cfa2ac1fc3494b6eb60872e74778") +* UUID short identifier ("f78375b1c487") +* Name ("evil_ptolemy") + +The UUID identifiers come from the Docker daemon, and if you do not +assign a name to the container with ``-name`` then the daemon will +also generate a random string name too. The name can become a handy +way to add meaning to a container since you can use this name when +defining :ref:`links ` (or any other place +you need to identify a container). This works for both background and +foreground Docker containers. + +PID Equivalent +.............. + +And finally, to help with automation, you can have Docker write the +container ID out to a file of your choosing. 
This is similar to how +some programs might write out their process ID to a file (you've seen +them as PID files):: + + -cidfile="": Write the container ID to the file + +Network Settings +---------------- + +:: + -n=true : Enable networking for this container + -dns=[] : Set custom dns servers for the container + +By default, all containers have networking enabled and they can make +any outgoing connections. The operator can completely disable +networking with ``docker run -n`` which disables all incoming and outgoing +networking. In cases like this, you would perform I/O through files or +STDIN/STDOUT only. + +Your container will use the same DNS servers as the host by default, +but you can override this with ``-dns``. + +Clean Up (-rm) +-------------- + +By default a container's file system persists even after the container +exits. This makes debugging a lot easier (since you can inspect the +final state) and you retain all your data by default. But if you are +running short-term **foreground** processes, these container file +systems can really pile up. If instead you'd like Docker to +**automatically clean up the container and remove the file system when +the container exits**, you can add the ``-rm`` flag:: + + -rm=false: Automatically remove the container when it exits (incompatible with -d) + + +Runtime Constraints on CPU and Memory +------------------------------------- + +The operator can also adjust the performance parameters of the container:: + + -m="": Memory limit (format: , where unit = b, k, m or g) + -c=0 : CPU shares (relative weight) + +The operator can constrain the memory available to a container easily +with ``docker run -m``. If the host supports swap memory, then the +``-m`` memory setting can be larger than physical RAM. + +Similarly the operator can increase the priority of this container +with the ``-c`` option. 
By default, all containers run at the same +priority and get the same proportion of CPU cycles, but you can tell +the kernel to give more shares of CPU time to one or more containers +when you start them via Docker. + +Runtime Privilege and LXC Configuration +--------------------------------------- + +:: + + -privileged=false: Give extended privileges to this container + -lxc-conf=[]: Add custom lxc options -lxc-conf="lxc.cgroup.cpuset.cpus = 0,1" + +By default, Docker containers are "unprivileged" and cannot, for +example, run a Docker daemon inside a Docker container. This is +because by default a container is not allowed to access any devices, +but a "privileged" container is given access to all devices (see +lxc-template.go_ and documentation on `cgroups devices +`_). + +When the operator executes ``docker run -privileged``, Docker will +enable access to all devices on the host as well as set some +configuration in AppArmor to allow the container nearly all the same +access to the host as processes running outside containers on the +host. Additional information about running with ``-privileged`` is +available on the `Docker Blog +`_. + +An operator can also specify LXC options using one or more +``-lxc-conf`` parameters. These can be new parameters or override +existing parameters from the lxc-template.go_. Note that in the +future, a given host's Docker daemon may not use LXC, so this is an +implementation-specific configuration meant for operators already +familiar with using LXC directly. + +.. _lxc-template.go: https://github.com/dotcloud/docker/blob/master/execdriver/lxc/lxc_template.go + + +Overriding ``Dockerfile`` Image Defaults +======================================== + +When a developer builds an image from a :ref:`Dockerfile +` or when she commits it, the developer can set a +number of default parameters that take effect when the image starts up +as a container. 
+ +Four of the ``Dockerfile`` commands cannot be overridden at runtime: +``FROM, MAINTAINER, RUN``, and ``ADD``. Everything else has a +corresponding override in ``docker run``. We'll go through what the +developer might have set in each ``Dockerfile`` instruction and how the +operator can override that setting. + +.. contents:: + :local: + +CMD (Default Command or Options) +-------------------------------- + +Recall the optional ``COMMAND`` in the Docker commandline:: + + docker run [OPTIONS] IMAGE[:TAG] [COMMAND] [ARG...] + +This command is optional because the person who created the ``IMAGE`` +may have already provided a default ``COMMAND`` using the ``Dockerfile`` +``CMD``. As the operator (the person running a container from the +image), you can override that ``CMD`` just by specifying a new +``COMMAND``. + +If the image also specifies an ``ENTRYPOINT`` then the ``CMD`` or +``COMMAND`` get appended as arguments to the ``ENTRYPOINT``. + + +ENTRYPOINT (Default Command to Execute at Runtime) +-------------------------------------------------- + +:: + + -entrypoint="": Overwrite the default entrypoint set by the image + +The ENTRYPOINT of an image is similar to a ``COMMAND`` because it +specifies what executable to run when the container starts, but it is +(purposely) more difficult to override. The ``ENTRYPOINT`` gives a +container its default nature or behavior, so that when you set an +``ENTRYPOINT`` you can run the container *as if it were that binary*, +complete with default options, and you can pass in more options via +the ``COMMAND``. But, sometimes an operator may want to run something else +inside the container, so you can override the default ``ENTRYPOINT`` at +runtime by using a string to specify the new ``ENTRYPOINT``. 
Here is an +example of how to run a shell in a container that has been set up to +automatically run something else (like ``/usr/bin/redis-server``):: + + docker run -i -t -entrypoint /bin/bash example/redis + +or two examples of how to pass more parameters to that ENTRYPOINT:: + + docker run -i -t -entrypoint /bin/bash example/redis -c ls -l + docker run -i -t -entrypoint /usr/bin/redis-cli example/redis --help + + +EXPOSE (Incoming Ports) +----------------------- + +The ``Dockerfile`` doesn't give much control over networking, only +providing the ``EXPOSE`` instruction to give a hint to the operator +about what incoming ports might provide services. The following +options work with or override the ``Dockerfile``'s exposed defaults:: + + -expose=[]: Expose a port from the container + without publishing it to your host + -P=false : Publish all exposed ports to the host interfaces + -p=[] : Publish a container's port to the host (format: + ip:hostPort:containerPort | ip::containerPort | + hostPort:containerPort) + (use 'docker port' to see the actual mapping) + -link="" : Add link to another container (name:alias) + +As mentioned previously, ``EXPOSE`` (and ``-expose``) make a port +available **in** a container for incoming connections. The port number +on the inside of the container (where the service listens) does not +need to be the same number as the port exposed on the outside of the +container (where clients connect), so inside the container you might +have an HTTP service listening on port 80 (and so you ``EXPOSE 80`` in +the ``Dockerfile``), but outside the container the port might be 42800. + +To help a new client container reach the server container's internal +port ``-expose``'d by the operator or ``EXPOSE``'d by the +developer, the operator has three choices: start the server container +with ``-P`` or ``-p``, or start the client container with ``-link``. 
+ +If the operator uses ``-P`` or ``-p`` then Docker will make the +exposed port accessible on the host and the ports will be available to +any client that can reach the host. To find the map between the host +ports and the exposed ports, use ``docker port``. + +If the operator uses ``-link`` when starting the new client container, +then the client container can access the exposed port via a private +networking interface. Docker will set some environment variables in +the client container to help indicate which interface and port to use. + +ENV (Environment Variables) +--------------------------- + +The operator can **set any environment variable** in the container by +using one or more ``-e`` flags, even overriding those already defined by the +developer with a Dockerfile ``ENV``:: + + $ docker run -e "deep=purple" -rm ubuntu /bin/bash -c export + declare -x HOME="/" + declare -x HOSTNAME="85bc26a0e200" + declare -x OLDPWD + declare -x PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + declare -x PWD="/" + declare -x SHLVL="1" + declare -x container="lxc" + declare -x deep="purple" + +Similarly the operator can set the **hostname** with ``-h``. + +``-link name:alias`` also sets environment variables, using the +*alias* string to define environment variables within the container +that give the IP and PORT information for connecting to the service +container. 
Let's imagine we have a container running Redis:: + + # Start the service container, named redis-name + $ docker run -d -name redis-name dockerfiles/redis + 4241164edf6f5aca5b0e9e4c9eccd899b0b8080c64c0cd26efe02166c73208f3 + + # The redis-name container exposed port 6379 + $ docker ps + CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES + 4241164edf6f dockerfiles/redis:latest /redis-stable/src/re 5 seconds ago Up 4 seconds 6379/tcp redis-name + + # Note that there are no public ports exposed since we didn't use -p or -P + $ docker port 4241164edf6f 6379 + 2014/01/25 00:55:38 Error: No public port '6379' published for 4241164edf6f + + +Yet we can get information about the Redis container's exposed ports +with ``-link``. Choose an alias that will form a valid environment +variable! + +:: + + $ docker run -rm -link redis-name:redis_alias -entrypoint /bin/bash dockerfiles/redis -c export + declare -x HOME="/" + declare -x HOSTNAME="acda7f7b1cdc" + declare -x OLDPWD + declare -x PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + declare -x PWD="/" + declare -x REDIS_ALIAS_NAME="/distracted_wright/redis" + declare -x REDIS_ALIAS_PORT="tcp://172.17.0.32:6379" + declare -x REDIS_ALIAS_PORT_6379_TCP="tcp://172.17.0.32:6379" + declare -x REDIS_ALIAS_PORT_6379_TCP_ADDR="172.17.0.32" + declare -x REDIS_ALIAS_PORT_6379_TCP_PORT="6379" + declare -x REDIS_ALIAS_PORT_6379_TCP_PROTO="tcp" + declare -x SHLVL="1" + declare -x container="lxc" + +And we can use that information to connect from another container as a client:: + + $ docker run -i -t -rm -link redis-name:redis_alias -entrypoint /bin/bash dockerfiles/redis -c '/redis-stable/src/redis-cli -h $REDIS_ALIAS_PORT_6379_TCP_ADDR -p $REDIS_ALIAS_PORT_6379_TCP_PORT' + 172.17.0.32:6379> + +VOLUME (Shared Filesystems) +--------------------------- + +:: + + -v=[]: Create a bind mount with: [host-dir]:[container-dir]:[rw|ro]. + If "container-dir" is missing, then docker creates a new volume. 
+ -volumes-from="": Mount all volumes from the given container(s) + +The volumes commands are complex enough to have their own +documentation in section :ref:`volume_def`. A developer can define one +or more ``VOLUME``\s associated with an image, but only the operator can +give access from one container to another (or from a container to a +volume mounted on the host). + +USER +---- + +The default user within a container is ``root`` (id = 0), but if the +developer created additional users, those are accessible too. The +developer can set a default user to run the first process with the +``Dockerfile USER`` command, but the operator can override it :: + + -u="": Username or UID + +WORKDIR +------- + +The default working directory for running binaries within a container is the root directory (``/``), but the developer can set a different default with the ``Dockerfile WORKDIR`` command. The operator can override this with:: + + -w="": Working directory inside the container + diff --git a/engine/env.go b/engine/env.go index f30e135555..ce8c34bb24 100644 --- a/engine/env.go +++ b/engine/env.go @@ -60,7 +60,7 @@ func (env *Env) GetInt64(key string) int64 { s := strings.Trim(env.Get(key), " \t") val, err := strconv.ParseInt(s, 10, 64) if err != nil { - return -1 + return 0 } return val } @@ -213,24 +213,6 @@ func (env *Env) WriteTo(dst io.Writer) (n int64, err error) { return 0, env.Encode(dst) } -func (env *Env) Export(dst interface{}) (err error) { - defer func() { - if err != nil { - err = fmt.Errorf("ExportEnv %s", err) - } - }() - var buf bytes.Buffer - // step 1: encode/marshal the env to an intermediary json representation - if err := env.Encode(&buf); err != nil { - return err - } - // step 2: decode/unmarshal the intermediary json into the destination object - if err := json.NewDecoder(&buf).Decode(dst); err != nil { - return err - } - return nil -} - func (env *Env) Import(src interface{}) (err error) { defer func() { if err != nil { diff --git 
a/engine/env_test.go b/engine/env_test.go index 24c5992dd0..c7079ff942 100644 --- a/engine/env_test.go +++ b/engine/env_test.go @@ -62,7 +62,7 @@ func TestSetenvInt(t *testing.T) { if val := job.GetenvInt("bar"); val != 42 { t.Fatalf("GetenvInt returns incorrect value: %d", val) } - if val := job.GetenvInt("nonexistent"); val != -1 { + if val := job.GetenvInt("nonexistent"); val != 0 { t.Fatalf("GetenvInt returns incorrect value: %d", val) } } @@ -84,32 +84,6 @@ func TestSetenvList(t *testing.T) { } } -func TestImportEnv(t *testing.T) { - type dummy struct { - DummyInt int - DummyStringArray []string - } - - job := mkJob(t, "dummy") - if err := job.ImportEnv(&dummy{42, []string{"foo", "bar"}}); err != nil { - t.Fatal(err) - } - - dmy := dummy{} - if err := job.ExportEnv(&dmy); err != nil { - t.Fatal(err) - } - - if dmy.DummyInt != 42 { - t.Fatalf("Expected 42, got %d", dmy.DummyInt) - } - - if len(dmy.DummyStringArray) != 2 || dmy.DummyStringArray[0] != "foo" || dmy.DummyStringArray[1] != "bar" { - t.Fatalf("Expected {foo, bar}, got %v", dmy.DummyStringArray) - } - -} - func TestEnviron(t *testing.T) { job := mkJob(t, "dummy") job.Setenv("foo", "bar") diff --git a/engine/job.go b/engine/job.go index 5447441beb..1f35ac85ff 100644 --- a/engine/job.go +++ b/engine/job.go @@ -102,6 +102,10 @@ func (job *Job) String() string { return fmt.Sprintf("%s.%s%s", job.Eng, job.CallString(), job.StatusString()) } +func (job *Job) EnvExists(key string) (value bool) { + return job.env.Exists(key) +} + func (job *Job) Getenv(key string) (value string) { return job.env.Get(key) } @@ -172,10 +176,6 @@ func (job *Job) EncodeEnv(dst io.Writer) error { return job.env.Encode(dst) } -func (job *Job) ExportEnv(dst interface{}) (err error) { - return job.env.Export(dst) -} - func (job *Job) ImportEnv(src interface{}) (err error) { return job.env.Import(src) } diff --git a/execdriver/lxc/driver.go b/execdriver/lxc/driver.go index 4e8f586f82..11ee3b283f 100644 --- a/execdriver/lxc/driver.go 
+++ b/execdriver/lxc/driver.go @@ -155,7 +155,9 @@ func (d *driver) Run(c *execdriver.Command, startCallback execdriver.StartCallba ) go func() { if err := c.Wait(); err != nil { - waitErr = err + if _, ok := err.(*exec.ExitError); !ok { // Do not propagate the error if it's simply a status code != 0 + waitErr = err + } } close(waitLock) }() diff --git a/graphdriver/btrfs/btrfs.go b/graphdriver/btrfs/btrfs.go new file mode 100644 index 0000000000..e8dc6bd0e9 --- /dev/null +++ b/graphdriver/btrfs/btrfs.go @@ -0,0 +1,217 @@ +// +build linux + +package btrfs + +/* +#include +#include +#include +#include +#include +#include +#include + +*/ +import "C" +import ( + "fmt" + "github.com/dotcloud/docker/graphdriver" + "os" + "path" + "syscall" + "unsafe" +) + +func init() { + graphdriver.Register("btrfs", Init) +} + +func Init(home string) (graphdriver.Driver, error) { + rootdir := path.Dir(home) + + var buf syscall.Statfs_t + if err := syscall.Statfs(rootdir, &buf); err != nil { + return nil, err + } + + if buf.Type != 0x9123683E { + return nil, fmt.Errorf("%s is not a btrfs filesystem", rootdir) + } + + return &Driver{ + home: home, + }, nil +} + +type Driver struct { + home string +} + +func (d *Driver) String() string { + return "btrfs" +} + +func (d *Driver) Status() [][2]string { + return nil +} + +func (d *Driver) Cleanup() error { + return nil +} + +func free(p *C.char) { + C.free(unsafe.Pointer(p)) +} + +func openDir(path string) (*C.DIR, error) { + Cpath := C.CString(path) + defer free(Cpath) + + dir := C.opendir(Cpath) + if dir == nil { + return nil, fmt.Errorf("Can't open dir") + } + return dir, nil +} + +func closeDir(dir *C.DIR) { + if dir != nil { + C.closedir(dir) + } +} + +func getDirFd(dir *C.DIR) uintptr { + return uintptr(C.dirfd(dir)) +} + +func subvolCreate(path, name string) error { + dir, err := openDir(path) + if err != nil { + return err + } + defer closeDir(dir) + + var args C.struct_btrfs_ioctl_vol_args + for i, c := range []byte(name) { + 
args.name[i] = C.char(c) + } + + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(dir), C.BTRFS_IOC_SUBVOL_CREATE, + uintptr(unsafe.Pointer(&args))) + if errno != 0 { + return fmt.Errorf("Can't create subvolume") + } + return nil +} + +func subvolSnapshot(src, dest, name string) error { + srcDir, err := openDir(src) + if err != nil { + return err + } + defer closeDir(srcDir) + + destDir, err := openDir(dest) + if err != nil { + return err + } + defer closeDir(destDir) + + var args C.struct_btrfs_ioctl_vol_args_v2 + args.fd = C.__s64(getDirFd(srcDir)) + for i, c := range []byte(name) { + args.name[i] = C.char(c) + } + + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(destDir), C.BTRFS_IOC_SNAP_CREATE_V2, + uintptr(unsafe.Pointer(&args))) + if errno != 0 { + return fmt.Errorf("Can't create subvolume") + } + return nil +} + +func subvolDelete(path, name string) error { + dir, err := openDir(path) + if err != nil { + return err + } + defer closeDir(dir) + + var args C.struct_btrfs_ioctl_vol_args + for i, c := range []byte(name) { + args.name[i] = C.char(c) + } + + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(dir), C.BTRFS_IOC_SNAP_DESTROY, + uintptr(unsafe.Pointer(&args))) + if errno != 0 { + return fmt.Errorf("Can't create subvolume") + } + return nil +} + +func (d *Driver) subvolumesDir() string { + return path.Join(d.home, "subvolumes") +} + +func (d *Driver) subvolumesDirId(id string) string { + return path.Join(d.subvolumesDir(), id) +} + +func (d *Driver) Create(id string, parent string) error { + subvolumes := path.Join(d.home, "subvolumes") + if err := os.MkdirAll(subvolumes, 0700); err != nil { + return err + } + if parent == "" { + if err := subvolCreate(subvolumes, id); err != nil { + return err + } + } else { + parentDir, err := d.Get(parent) + if err != nil { + return err + } + if err := subvolSnapshot(parentDir, subvolumes, id); err != nil { + return err + } + } + return nil +} + +func (d *Driver) Remove(id string) error 
{ + dir := d.subvolumesDirId(id) + if _, err := os.Stat(dir); err != nil { + return err + } + if err := subvolDelete(d.subvolumesDir(), id); err != nil { + return err + } + return os.RemoveAll(dir) +} + +func (d *Driver) Get(id string) (string, error) { + dir := d.subvolumesDirId(id) + st, err := os.Stat(dir) + if err != nil { + return "", err + } + + if !st.IsDir() { + return "", fmt.Errorf("%s: not a directory", dir) + } + + return dir, nil +} + +func (d *Driver) Put(id string) { + // Get() creates no runtime resources (like e.g. mounts) + // so this doesn't need to do anything. +} + +func (d *Driver) Exists(id string) bool { + dir := d.subvolumesDirId(id) + _, err := os.Stat(dir) + return err == nil +} diff --git a/graphdriver/btrfs/dummy_unsupported.go b/graphdriver/btrfs/dummy_unsupported.go new file mode 100644 index 0000000000..5efd18081f --- /dev/null +++ b/graphdriver/btrfs/dummy_unsupported.go @@ -0,0 +1,3 @@ +// +build !linux + +package btrfs diff --git a/graphdriver/driver.go b/graphdriver/driver.go index 2be3f05f3a..c0ed00b0ad 100644 --- a/graphdriver/driver.go +++ b/graphdriver/driver.go @@ -41,6 +41,8 @@ var ( "aufs", "devicemapper", "vfs", + // experimental, has to be enabled manually for now + "btrfs", } ) diff --git a/hack/install.sh b/hack/install.sh index 02d812f388..65e34f9659 100755 --- a/hack/install.sh +++ b/hack/install.sh @@ -37,8 +37,10 @@ if command_exists docker || command_exists lxc-docker; then ( set -x; sleep 20 ) fi +user="$(id -un 2>/dev/null || true)" + sh_c='sh -c' -if [ "$(whoami 2>/dev/null || true)" != 'root' ]; then +if [ "$user" != 'root' ]; then if command_exists sudo; then sh_c='sudo sh -c' elif command_exists su; then @@ -124,6 +126,16 @@ case "$lsb_dist" in $sh_c 'docker run busybox echo "Docker has been successfully installed!"' ) || true fi + your_user=your-user + [ "$user" != 'root' ] && your_user="$user" + echo + echo 'If you would like to use Docker as a non-root user, you should now consider' + echo 'adding your 
user to the "docker" group with something like:' + echo + echo ' sudo usermod -aG docker' $your_user + echo + echo 'Remember that you will have to log out and back in for this to take effect!' + echo exit 0 ;; diff --git a/hack/make.sh b/hack/make.sh index 6029c9ec10..ef13c1a283 100755 --- a/hack/make.sh +++ b/hack/make.sh @@ -25,12 +25,18 @@ set -o pipefail # We're a nice, sexy, little shell script, and people might try to run us; # but really, they shouldn't. We want to be in a container! -RESOLVCONF=$(readlink --canonicalize /etc/resolv.conf) -grep -q "$RESOLVCONF" /proc/mounts || { - echo >&2 "# WARNING! I don't seem to be running in a docker container." - echo >&2 "# The result of this command might be an incorrect build, and will not be officially supported." - echo >&2 "# Try this: 'make all'" -} +if [ "$(pwd)" != '/go/src/github.com/dotcloud/docker' ] || [ -z "$DOCKER_CROSSPLATFORMS" ]; then + { + echo "# WARNING! I don't seem to be running in the Docker container." + echo "# The result of this command might be an incorrect build, and will not be" + echo "# officially supported." + echo "#" + echo "# Try this instead: make all" + echo "#" + } >&2 +fi + +echo # List of bundles to create when no argument is passed DEFAULT_BUNDLES=( diff --git a/hack/travis/dco.py b/hack/travis/dco.py index d80d528f9a..f873940815 100755 --- a/hack/travis/dco.py +++ b/hack/travis/dco.py @@ -5,7 +5,7 @@ import yaml from env import commit_range -commit_format = '-%n hash: "%h"%n author: %aN <%aE>%n message: |%n%w(0,2,2)%B' +commit_format = '-%n hash: "%h"%n author: %aN <%aE>%n message: |%n%w(0,2,2).%B' gitlog = subprocess.check_output([ 'git', 'log', '--reverse', @@ -24,6 +24,11 @@ p = re.compile(r'^{0} ([^<]+) <([^<>@]+@[^<>]+)> \(github: (\S+)\)$'.format(re.e failed_commits = 0 for commit in commits: + commit['message'] = commit['message'][1:] + # trim off our '.' 
that exists just to prevent fun YAML parsing issues + # see https://github.com/dotcloud/docker/pull/3836#issuecomment-33723094 + # and https://travis-ci.org/dotcloud/docker/builds/17926783 + commit['stat'] = subprocess.check_output([ 'git', 'log', '--format=format:', '--max-count=1', '--name-status', commit['hash'], '--', diff --git a/integration/server_test.go b/integration/server_test.go index 2666d1d4fe..b0ad3d903b 100644 --- a/integration/server_test.go +++ b/integration/server_test.go @@ -114,6 +114,30 @@ func TestCreateRm(t *testing.T) { } +func TestCreateNumberHostname(t *testing.T) { + eng := NewTestEngine(t) + defer mkRuntimeFromEngine(eng, t).Nuke() + + config, _, _, err := docker.ParseRun([]string{"-h", "web.0", unitTestImageID, "echo test"}, nil) + if err != nil { + t.Fatal(err) + } + + createTestContainer(eng, config, t) +} + +func TestCreateNumberUsername(t *testing.T) { + eng := NewTestEngine(t) + defer mkRuntimeFromEngine(eng, t).Nuke() + + config, _, _, err := docker.ParseRun([]string{"-u", "1002", unitTestImageID, "echo test"}, nil) + if err != nil { + t.Fatal(err) + } + + createTestContainer(eng, config, t) +} + func TestCreateRmVolumes(t *testing.T) { eng := NewTestEngine(t) defer mkRuntimeFromEngine(eng, t).Nuke() diff --git a/network.go b/network.go index 250f7b594f..d9771ac008 100644 --- a/network.go +++ b/network.go @@ -5,9 +5,9 @@ import ( "github.com/dotcloud/docker/networkdriver" "github.com/dotcloud/docker/networkdriver/ipallocator" "github.com/dotcloud/docker/networkdriver/portallocator" + "github.com/dotcloud/docker/networkdriver/portmapper" "github.com/dotcloud/docker/pkg/iptables" "github.com/dotcloud/docker/pkg/netlink" - "github.com/dotcloud/docker/proxy" "github.com/dotcloud/docker/utils" "io/ioutil" "log" @@ -159,129 +159,6 @@ func getIfaceAddr(name string) (net.Addr, error) { return addrs4[0], nil } -// Port mapper takes care of mapping external ports to containers by setting -// up iptables rules. 
-// It keeps track of all mappings and is able to unmap at will -type PortMapper struct { - tcpMapping map[string]*net.TCPAddr - tcpProxies map[string]proxy.Proxy - udpMapping map[string]*net.UDPAddr - udpProxies map[string]proxy.Proxy - - iptables *iptables.Chain - defaultIp net.IP - proxyFactoryFunc func(net.Addr, net.Addr) (proxy.Proxy, error) -} - -func (mapper *PortMapper) Map(ip net.IP, port int, backendAddr net.Addr) error { - - if _, isTCP := backendAddr.(*net.TCPAddr); isTCP { - mapKey := (&net.TCPAddr{Port: port, IP: ip}).String() - if _, exists := mapper.tcpProxies[mapKey]; exists { - return fmt.Errorf("TCP Port %s is already in use", mapKey) - } - backendPort := backendAddr.(*net.TCPAddr).Port - backendIP := backendAddr.(*net.TCPAddr).IP - if mapper.iptables != nil { - if err := mapper.iptables.Forward(iptables.Add, ip, port, "tcp", backendIP.String(), backendPort); err != nil { - return err - } - } - mapper.tcpMapping[mapKey] = backendAddr.(*net.TCPAddr) - proxy, err := mapper.proxyFactoryFunc(&net.TCPAddr{IP: ip, Port: port}, backendAddr) - if err != nil { - mapper.Unmap(ip, port, "tcp") - return err - } - mapper.tcpProxies[mapKey] = proxy - go proxy.Run() - } else { - mapKey := (&net.UDPAddr{Port: port, IP: ip}).String() - if _, exists := mapper.udpProxies[mapKey]; exists { - return fmt.Errorf("UDP: Port %s is already in use", mapKey) - } - backendPort := backendAddr.(*net.UDPAddr).Port - backendIP := backendAddr.(*net.UDPAddr).IP - if mapper.iptables != nil { - if err := mapper.iptables.Forward(iptables.Add, ip, port, "udp", backendIP.String(), backendPort); err != nil { - return err - } - } - mapper.udpMapping[mapKey] = backendAddr.(*net.UDPAddr) - proxy, err := mapper.proxyFactoryFunc(&net.UDPAddr{IP: ip, Port: port}, backendAddr) - if err != nil { - mapper.Unmap(ip, port, "udp") - return err - } - mapper.udpProxies[mapKey] = proxy - go proxy.Run() - } - return nil -} - -func (mapper *PortMapper) Unmap(ip net.IP, port int, proto string) error { - 
if proto == "tcp" { - mapKey := (&net.TCPAddr{Port: port, IP: ip}).String() - backendAddr, ok := mapper.tcpMapping[mapKey] - if !ok { - return fmt.Errorf("Port tcp/%s is not mapped", mapKey) - } - if proxy, exists := mapper.tcpProxies[mapKey]; exists { - proxy.Close() - delete(mapper.tcpProxies, mapKey) - } - if mapper.iptables != nil { - if err := mapper.iptables.Forward(iptables.Delete, ip, port, proto, backendAddr.IP.String(), backendAddr.Port); err != nil { - return err - } - } - delete(mapper.tcpMapping, mapKey) - } else { - mapKey := (&net.UDPAddr{Port: port, IP: ip}).String() - backendAddr, ok := mapper.udpMapping[mapKey] - if !ok { - return fmt.Errorf("Port udp/%s is not mapped", mapKey) - } - if proxy, exists := mapper.udpProxies[mapKey]; exists { - proxy.Close() - delete(mapper.udpProxies, mapKey) - } - if mapper.iptables != nil { - if err := mapper.iptables.Forward(iptables.Delete, ip, port, proto, backendAddr.IP.String(), backendAddr.Port); err != nil { - return err - } - } - delete(mapper.udpMapping, mapKey) - } - return nil -} - -func newPortMapper(config *DaemonConfig) (*PortMapper, error) { - // We can always try removing the iptables - if err := iptables.RemoveExistingChain("DOCKER"); err != nil { - return nil, err - } - var chain *iptables.Chain - if config.EnableIptables { - var err error - chain, err = iptables.NewChain("DOCKER", config.BridgeIface) - if err != nil { - return nil, fmt.Errorf("Failed to create DOCKER chain: %s", err) - } - } - - mapper := &PortMapper{ - tcpMapping: make(map[string]*net.TCPAddr), - tcpProxies: make(map[string]proxy.Proxy), - udpMapping: make(map[string]*net.UDPAddr), - udpProxies: make(map[string]proxy.Proxy), - iptables: chain, - defaultIp: config.DefaultIp, - proxyFactoryFunc: proxy.NewProxy, - } - return mapper, nil -} - // Network interface represents the networking stack of a container type NetworkInterface struct { IPNet net.IPNet @@ -299,7 +176,7 @@ func (iface *NetworkInterface) AllocatePort(port Port, 
binding PortBinding) (*Na return nil, fmt.Errorf("Trying to allocate port for interface %v, which is disabled", iface) // FIXME } - ip := iface.manager.portMapper.defaultIp + ip := iface.manager.defaultBindingIP if binding.HostIp != "" { ip = net.ParseIP(binding.HostIp) @@ -331,7 +208,7 @@ func (iface *NetworkInterface) AllocatePort(port Port, binding PortBinding) (*Na backend = &net.UDPAddr{IP: iface.IPNet.IP, Port: containerPort} } - if err := iface.manager.portMapper.Map(ip, extPort, backend); err != nil { + if err := portmapper.Map(backend, ip, extPort); err != nil { portallocator.ReleasePort(ip, nat.Port.Proto(), extPort) return nil, err } @@ -365,7 +242,15 @@ func (iface *NetworkInterface) Release() { } ip := net.ParseIP(nat.Binding.HostIp) utils.Debugf("Unmaping %s/%s:%s", nat.Port.Proto, ip.String(), nat.Binding.HostPort) - if err := iface.manager.portMapper.Unmap(ip, hostPort, nat.Port.Proto()); err != nil { + + var host net.Addr + if nat.Port.Proto() == "tcp" { + host = &net.TCPAddr{IP: ip, Port: hostPort} + } else { + host = &net.UDPAddr{IP: ip, Port: hostPort} + } + + if err := portmapper.Unmap(host); err != nil { log.Printf("Unable to unmap port %s: %s", nat, err) } @@ -382,12 +267,10 @@ func (iface *NetworkInterface) Release() { // Network Manager manages a set of network interfaces // Only *one* manager per host machine should be used type NetworkManager struct { - bridgeIface string - bridgeNetwork *net.IPNet - - portMapper *PortMapper - - disabled bool + bridgeIface string + bridgeNetwork *net.IPNet + defaultBindingIP net.IP + disabled bool } // Allocate a network interface @@ -444,7 +327,7 @@ func newNetworkManager(config *DaemonConfig) (*NetworkManager, error) { natArgs := []string{"POSTROUTING", "-t", "nat", "-s", addr.String(), "!", "-d", addr.String(), "-j", "MASQUERADE"} if !iptables.Exists(natArgs...) 
{ - if output, err := iptables.Raw(append([]string{"-A"}, natArgs...)...); err != nil { + if output, err := iptables.Raw(append([]string{"-I"}, natArgs...)...); err != nil { return nil, fmt.Errorf("Unable to enable network bridge NAT: %s", err) } else if len(output) != 0 { return nil, fmt.Errorf("Error iptables postrouting: %s", output) @@ -508,16 +391,23 @@ func newNetworkManager(config *DaemonConfig) (*NetworkManager, error) { } } - portMapper, err := newPortMapper(config) - if err != nil { + // We can always try removing the iptables + if err := iptables.RemoveExistingChain("DOCKER"); err != nil { return nil, err } - manager := &NetworkManager{ - bridgeIface: config.BridgeIface, - bridgeNetwork: network, - portMapper: portMapper, + if config.EnableIptables { + chain, err := iptables.NewChain("DOCKER", config.BridgeIface) + if err != nil { + return nil, err + } + portmapper.SetIptablesChain(chain) } + manager := &NetworkManager{ + bridgeIface: config.BridgeIface, + bridgeNetwork: network, + defaultBindingIP: config.DefaultIp, + } return manager, nil } diff --git a/network_test.go b/network_test.go deleted file mode 100644 index 6cdf50ab6e..0000000000 --- a/network_test.go +++ /dev/null @@ -1,72 +0,0 @@ -package docker - -import ( - "github.com/dotcloud/docker/pkg/iptables" - "github.com/dotcloud/docker/proxy" - "net" - "testing" -) - -type StubProxy struct { - frontendAddr *net.Addr - backendAddr *net.Addr -} - -func (proxy *StubProxy) Run() {} -func (proxy *StubProxy) Close() {} -func (proxy *StubProxy) FrontendAddr() net.Addr { return *proxy.frontendAddr } -func (proxy *StubProxy) BackendAddr() net.Addr { return *proxy.backendAddr } - -func NewStubProxy(frontendAddr, backendAddr net.Addr) (proxy.Proxy, error) { - return &StubProxy{ - frontendAddr: &frontendAddr, - backendAddr: &backendAddr, - }, nil -} - -func TestPortMapper(t *testing.T) { - // FIXME: is this iptables chain still used anywhere? 
- var chain *iptables.Chain - mapper := &PortMapper{ - tcpMapping: make(map[string]*net.TCPAddr), - tcpProxies: make(map[string]proxy.Proxy), - udpMapping: make(map[string]*net.UDPAddr), - udpProxies: make(map[string]proxy.Proxy), - iptables: chain, - defaultIp: net.IP("0.0.0.0"), - proxyFactoryFunc: NewStubProxy, - } - - dstIp1 := net.ParseIP("192.168.0.1") - dstIp2 := net.ParseIP("192.168.0.2") - srcAddr1 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.1")} - srcAddr2 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.2")} - - if err := mapper.Map(dstIp1, 80, srcAddr1); err != nil { - t.Fatalf("Failed to allocate port: %s", err) - } - - if mapper.Map(dstIp1, 80, srcAddr1) == nil { - t.Fatalf("Port is in use - mapping should have failed") - } - - if mapper.Map(dstIp1, 80, srcAddr2) == nil { - t.Fatalf("Port is in use - mapping should have failed") - } - - if err := mapper.Map(dstIp2, 80, srcAddr2); err != nil { - t.Fatalf("Failed to allocate port: %s", err) - } - - if mapper.Unmap(dstIp1, 80, "tcp") != nil { - t.Fatalf("Failed to release port") - } - - if mapper.Unmap(dstIp2, 80, "tcp") != nil { - t.Fatalf("Failed to release port") - } - - if mapper.Unmap(dstIp2, 80, "tcp") == nil { - t.Fatalf("Port already released, but no error reported") - } -} diff --git a/networkdriver/ipallocator/allocator.go b/networkdriver/ipallocator/allocator.go index 33401d5caf..1c5a7b4cc2 100644 --- a/networkdriver/ipallocator/allocator.go +++ b/networkdriver/ipallocator/allocator.go @@ -99,12 +99,17 @@ func getNextIp(address *net.IPNet) (*net.IP, error) { return ip, nil } + var ( + firstNetIP = address.IP.To4().Mask(address.Mask) + firstAsInt = ipToInt(&firstNetIP) + 1 + ) + pos = int32(allocated.PullBack()) for i := int32(0); i < max; i++ { pos = pos%max + 1 next := int32(base + pos) - if next == ownIP { + if next == ownIP || next == firstAsInt { continue } diff --git a/networkdriver/ipallocator/allocator_test.go b/networkdriver/ipallocator/allocator_test.go index 
871f143521..5e9fcfc983 100644 --- a/networkdriver/ipallocator/allocator_test.go +++ b/networkdriver/ipallocator/allocator_test.go @@ -213,6 +213,27 @@ func TestIPAllocator(t *testing.T) { } } +func TestAllocateFirstIP(t *testing.T) { + defer reset() + network := &net.IPNet{ + IP: []byte{192, 168, 0, 0}, + Mask: []byte{255, 255, 255, 0}, + } + + firstIP := network.IP.To4().Mask(network.Mask) + first := ipToInt(&firstIP) + 1 + + ip, err := RequestIP(network, nil) + if err != nil { + t.Fatal(err) + } + allocated := ipToInt(ip) + + if allocated == first { + t.Fatalf("allocated ip should not equal first ip: %d == %d", first, allocated) + } +} + func assertIPEquals(t *testing.T, ip1, ip2 *net.IP) { if !ip1.Equal(*ip2) { t.Fatalf("Expected IP %s, got %s", ip1, ip2) diff --git a/networkdriver/portmapper/mapper.go b/networkdriver/portmapper/mapper.go new file mode 100644 index 0000000000..f052c48143 --- /dev/null +++ b/networkdriver/portmapper/mapper.go @@ -0,0 +1,131 @@ +package portmapper + +import ( + "errors" + "fmt" + "github.com/dotcloud/docker/pkg/iptables" + "github.com/dotcloud/docker/proxy" + "net" + "sync" +) + +type mapping struct { + proto string + userlandProxy proxy.Proxy + host net.Addr + container net.Addr +} + +var ( + chain *iptables.Chain + lock sync.Mutex + + // udp:ip:port + currentMappings = make(map[string]*mapping) + newProxy = proxy.NewProxy +) + +var ( + ErrUnknownBackendAddressType = errors.New("unknown container address type not supported") + ErrPortMappedForIP = errors.New("port is already mapped to ip") + ErrPortNotMapped = errors.New("port is not mapped") +) + +func SetIptablesChain(c *iptables.Chain) { + chain = c +} + +func Map(container net.Addr, hostIP net.IP, hostPort int) error { + lock.Lock() + defer lock.Unlock() + + var m *mapping + switch container.(type) { + case *net.TCPAddr: + m = &mapping{ + proto: "tcp", + host: &net.TCPAddr{IP: hostIP, Port: hostPort}, + container: container, + } + case *net.UDPAddr: + m = &mapping{ + proto: 
"udp", + host: &net.UDPAddr{IP: hostIP, Port: hostPort}, + container: container, + } + default: + return ErrUnknownBackendAddressType + } + + key := getKey(m.host) + if _, exists := currentMappings[key]; exists { + return ErrPortMappedForIP + } + + containerIP, containerPort := getIPAndPort(m.container) + if err := forward(iptables.Add, m.proto, hostIP, hostPort, containerIP.String(), containerPort); err != nil { + return err + } + + p, err := newProxy(m.host, m.container) + if err != nil { + // need to undo the iptables rules before we reutrn + forward(iptables.Delete, m.proto, hostIP, hostPort, containerIP.String(), containerPort) + return err + } + + m.userlandProxy = p + currentMappings[key] = m + + go p.Run() + + return nil +} + +func Unmap(host net.Addr) error { + lock.Lock() + defer lock.Unlock() + + key := getKey(host) + data, exists := currentMappings[key] + if !exists { + return ErrPortNotMapped + } + + data.userlandProxy.Close() + delete(currentMappings, key) + + containerIP, containerPort := getIPAndPort(data.container) + hostIP, hostPort := getIPAndPort(data.host) + if err := forward(iptables.Delete, data.proto, hostIP, hostPort, containerIP.String(), containerPort); err != nil { + return err + } + return nil +} + +func getKey(a net.Addr) string { + switch t := a.(type) { + case *net.TCPAddr: + return fmt.Sprintf("%s:%d/%s", t.IP.String(), t.Port, "tcp") + case *net.UDPAddr: + return fmt.Sprintf("%s:%d/%s", t.IP.String(), t.Port, "udp") + } + return "" +} + +func getIPAndPort(a net.Addr) (net.IP, int) { + switch t := a.(type) { + case *net.TCPAddr: + return t.IP, t.Port + case *net.UDPAddr: + return t.IP, t.Port + } + return nil, 0 +} + +func forward(action iptables.Action, proto string, sourceIP net.IP, sourcePort int, containerIP string, containerPort int) error { + if chain == nil { + return nil + } + return chain.Forward(action, sourceIP, sourcePort, proto, containerIP, containerPort) +} diff --git a/networkdriver/portmapper/mapper_test.go 
b/networkdriver/portmapper/mapper_test.go new file mode 100644 index 0000000000..05718063e3 --- /dev/null +++ b/networkdriver/portmapper/mapper_test.go @@ -0,0 +1,107 @@ +package portmapper + +import ( + "github.com/dotcloud/docker/pkg/iptables" + "github.com/dotcloud/docker/proxy" + "net" + "testing" +) + +func init() { + // override this func to mock out the proxy server + newProxy = proxy.NewStubProxy +} + +func reset() { + chain = nil + currentMappings = make(map[string]*mapping) +} + +func TestSetIptablesChain(t *testing.T) { + defer reset() + + c := &iptables.Chain{ + Name: "TEST", + Bridge: "192.168.1.1", + } + + if chain != nil { + t.Fatal("chain should be nil at init") + } + + SetIptablesChain(c) + if chain == nil { + t.Fatal("chain should not be nil after set") + } +} + +func TestMapPorts(t *testing.T) { + dstIp1 := net.ParseIP("192.168.0.1") + dstIp2 := net.ParseIP("192.168.0.2") + dstAddr1 := &net.TCPAddr{IP: dstIp1, Port: 80} + dstAddr2 := &net.TCPAddr{IP: dstIp2, Port: 80} + + srcAddr1 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.1")} + srcAddr2 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.2")} + + if err := Map(srcAddr1, dstIp1, 80); err != nil { + t.Fatalf("Failed to allocate port: %s", err) + } + + if Map(srcAddr1, dstIp1, 80) == nil { + t.Fatalf("Port is in use - mapping should have failed") + } + + if Map(srcAddr2, dstIp1, 80) == nil { + t.Fatalf("Port is in use - mapping should have failed") + } + + if err := Map(srcAddr2, dstIp2, 80); err != nil { + t.Fatalf("Failed to allocate port: %s", err) + } + + if Unmap(dstAddr1) != nil { + t.Fatalf("Failed to release port") + } + + if Unmap(dstAddr2) != nil { + t.Fatalf("Failed to release port") + } + + if Unmap(dstAddr2) == nil { + t.Fatalf("Port already released, but no error reported") + } +} + +func TestGetUDPKey(t *testing.T) { + addr := &net.UDPAddr{IP: net.ParseIP("192.168.1.5"), Port: 53} + + key := getKey(addr) + + if expected := "192.168.1.5:53/udp"; key != expected { + 
t.Fatalf("expected key %s got %s", expected, key) + } +} + +func TestGetTCPKey(t *testing.T) { + addr := &net.TCPAddr{IP: net.ParseIP("192.168.1.5"), Port: 80} + + key := getKey(addr) + + if expected := "192.168.1.5:80/tcp"; key != expected { + t.Fatalf("expected key %s got %s", expected, key) + } +} + +func TestGetUDPIPAndPort(t *testing.T) { + addr := &net.UDPAddr{IP: net.ParseIP("192.168.1.5"), Port: 53} + + ip, port := getIPAndPort(addr) + if expected := "192.168.1.5"; ip.String() != expected { + t.Fatalf("expected ip %s got %s", expected, ip) + } + + if ep := 53; port != ep { + t.Fatalf("expected port %d got %d", ep, port) + } +} diff --git a/pkg/iptables/iptables.go b/pkg/iptables/iptables.go index 0438bcbd88..2df93657ac 100644 --- a/pkg/iptables/iptables.go +++ b/pkg/iptables/iptables.go @@ -73,6 +73,23 @@ func (c *Chain) Forward(action Action, ip net.IP, port int, proto, dest_addr str } else if len(output) != 0 { return fmt.Errorf("Error iptables forward: %s", output) } + + fAction := action + if fAction == Add { + fAction = "-I" + } + if output, err := Raw(string(fAction), "FORWARD", + "!", "-i", c.Bridge, + "-o", c.Bridge, + "-p", proto, + "-d", daddr, + "--dport", strconv.Itoa(port), + "-j", "ACCEPT"); err != nil { + return err + } else if len(output) != 0 { + return fmt.Errorf("Error iptables forward: %s", output) + } + return nil } diff --git a/proxy/stub_proxy.go b/proxy/stub_proxy.go new file mode 100644 index 0000000000..7684427058 --- /dev/null +++ b/proxy/stub_proxy.go @@ -0,0 +1,22 @@ +package proxy + +import ( + "net" +) + +type StubProxy struct { + frontendAddr net.Addr + backendAddr net.Addr +} + +func (p *StubProxy) Run() {} +func (p *StubProxy) Close() {} +func (p *StubProxy) FrontendAddr() net.Addr { return p.frontendAddr } +func (p *StubProxy) BackendAddr() net.Addr { return p.backendAddr } + +func NewStubProxy(frontendAddr, backendAddr net.Addr) (Proxy, error) { + return &StubProxy{ + frontendAddr: frontendAddr, + backendAddr: 
backendAddr, + }, nil +} diff --git a/runtime.go b/runtime.go index 3d47a50398..9d2c5e808d 100644 --- a/runtime.go +++ b/runtime.go @@ -9,6 +9,7 @@ import ( "github.com/dotcloud/docker/execdriver/lxc" "github.com/dotcloud/docker/graphdriver" "github.com/dotcloud/docker/graphdriver/aufs" + _ "github.com/dotcloud/docker/graphdriver/btrfs" _ "github.com/dotcloud/docker/graphdriver/devmapper" _ "github.com/dotcloud/docker/graphdriver/vfs" "github.com/dotcloud/docker/networkdriver/portallocator" diff --git a/server.go b/server.go index ce6024d919..36bd670776 100644 --- a/server.go +++ b/server.go @@ -43,8 +43,7 @@ func init() { // The signals SIGINT, SIGQUIT and SIGTERM are intercepted for cleanup. func jobInitApi(job *engine.Job) engine.Status { job.Logf("Creating server") - // FIXME: ImportEnv deprecates ConfigFromJob - srv, err := NewServer(job.Eng, ConfigFromJob(job)) + srv, err := NewServer(job.Eng, DaemonConfigFromJob(job)) if err != nil { return job.Error(err) } @@ -1012,7 +1011,7 @@ func (srv *Server) Containers(job *engine.Job) engine.Status { }, -1) for _, container := range srv.runtime.List() { - if !container.State.IsRunning() && !all && n == -1 && since == "" && before == "" { + if !container.State.IsRunning() && !all && n <= 0 && since == "" && before == "" { continue } if before != "" && !foundBefore { @@ -1021,7 +1020,7 @@ func (srv *Server) Containers(job *engine.Job) engine.Status { } continue } - if displayed == n { + if n > 0 && displayed == n { break } if container.ID == since || utils.TruncateID(container.ID) == since { @@ -1644,10 +1643,7 @@ func (srv *Server) ContainerCreate(job *engine.Job) engine.Status { } else if len(job.Args) > 1 { return job.Errorf("Usage: %s", job.Name) } - var config Config - if err := job.ExportEnv(&config); err != nil { - return job.Error(err) - } + config := ContainerConfigFromJob(job) if config.Memory != 0 && config.Memory < 524288 { return job.Errorf("Minimum memory limit allowed is 512k") } @@ -1668,7 +1664,7 @@ 
func (srv *Server) ContainerCreate(job *engine.Job) engine.Status { config.Dns = defaultDns } - container, buildWarnings, err := srv.runtime.Create(&config, name) + container, buildWarnings, err := srv.runtime.Create(config, name) if err != nil { if srv.runtime.graph.IsNotExist(err) { _, tag := utils.ParseRepositoryTag(config.Image) @@ -1699,10 +1695,12 @@ func (srv *Server) ContainerRestart(job *engine.Job) engine.Status { if len(job.Args) != 1 { return job.Errorf("Usage: %s CONTAINER\n", job.Name) } - name := job.Args[0] - t := job.GetenvInt("t") - if t == -1 { - t = 10 + var ( + name = job.Args[0] + t = 10 + ) + if job.EnvExists("t") { + t = job.GetenvInt("t") } if container := srv.runtime.Get(name); container != nil { if err := container.Restart(int(t)); err != nil { @@ -2073,10 +2071,7 @@ func (srv *Server) ContainerStart(job *engine.Job) engine.Status { } // If no environment was set, then no hostconfig was passed. if len(job.Environ()) > 0 { - var hostConfig HostConfig - if err := job.ExportEnv(&hostConfig); err != nil { - return job.Error(err) - } + hostConfig := ContainerHostConfigFromJob(job) // Validate the HostConfig binds. Make sure that: // 1) the source of a bind mount isn't / // The bind mount "/:/foo" isn't allowed. 
@@ -2101,10 +2096,10 @@ func (srv *Server) ContainerStart(job *engine.Job) engine.Status { } } // Register any links from the host config before starting the container - if err := srv.RegisterLinks(container, &hostConfig); err != nil { + if err := srv.RegisterLinks(container, hostConfig); err != nil { return job.Error(err) } - container.hostConfig = &hostConfig + container.hostConfig = hostConfig container.ToDisk() } if err := container.Start(); err != nil { @@ -2119,10 +2114,12 @@ func (srv *Server) ContainerStop(job *engine.Job) engine.Status { if len(job.Args) != 1 { return job.Errorf("Usage: %s CONTAINER\n", job.Name) } - name := job.Args[0] - t := job.GetenvInt("t") - if t == -1 { - t = 10 + var ( + name = job.Args[0] + t = 10 + ) + if job.EnvExists("t") { + t = job.GetenvInt("t") } if container := srv.runtime.Get(name); container != nil { if err := container.Stop(int(t)); err != nil {