Move Docker "health-check" to DockerBuilder

This commit is contained in:
Ciprian Hacman 2019-12-29 18:42:59 +02:00
parent 88600407f4
commit 507230fe75
11 changed files with 392 additions and 28 deletions

View File

@ -1017,6 +1017,12 @@ func (b *DockerBuilder) Build(c *fi.ModelBuilderContext) error {
return err
}
if b.Distribution.IsDebianFamily() {
c.AddTask(b.buildSystemdHealthCheckScript())
c.AddTask(b.buildSystemdHealthCheckService())
c.AddTask(b.buildSystemdHealthCheckTimer())
}
return nil
}
@ -1147,6 +1153,60 @@ func (b *DockerBuilder) buildSystemdService(dockerVersionMajor int, dockerVersio
return service
}
// buildSystemdHealthCheckScript returns the file task that writes the docker
// health-check script (the contents of resources.DockerHealthCheck) to
// /opt/kops/bin/docker-healthcheck with mode 0755.
func (b *DockerBuilder) buildSystemdHealthCheckScript() *nodetasks.File {
	return &nodetasks.File{
		Path:     "/opt/kops/bin/docker-healthcheck",
		Contents: fi.NewStringResource(resources.DockerHealthCheck),
		Type:     nodetasks.FileType_File,
		Mode:     s("0755"),
	}
}
// buildSystemdHealthCheckService returns the service task for
// docker-healthcheck.service: a oneshot unit whose ExecStart is
// /opt/kops/bin/docker-healthcheck and which is wanted by multi-user.target.
// The rendered manifest is logged at verbosity 8 before the task is built.
func (b *DockerBuilder) buildSystemdHealthCheckService() *nodetasks.Service {
	const unitName = "docker-healthcheck.service"

	// Each entry is {section, key, value}; entries are applied in order.
	entries := [][3]string{
		{"Unit", "Description", "Run docker-healthcheck once"},
		{"Unit", "Documentation", "https://kops.sigs.k8s.io"},
		{"Service", "Type", "oneshot"},
		{"Service", "ExecStart", "/opt/kops/bin/docker-healthcheck"},
		{"Install", "WantedBy", "multi-user.target"},
	}

	unit := &systemd.Manifest{}
	for _, e := range entries {
		unit.Set(e[0], e[1], e[2])
	}

	rendered := unit.Render()
	klog.V(8).Infof("Built service manifest %q\n%s", unitName, rendered)

	task := &nodetasks.Service{
		Name:       unitName,
		Definition: s(rendered),
	}
	task.InitDefaults()
	return task
}
// buildSystemdHealthCheckTimer returns the service task for
// docker-healthcheck.timer: a timer unit that targets
// docker-healthcheck.service with OnUnitInactiveSec=10s and is wanted by
// multi-user.target. The rendered manifest is logged at verbosity 8 before
// the task is built.
func (b *DockerBuilder) buildSystemdHealthCheckTimer() *nodetasks.Service {
	const timerName = "docker-healthcheck.timer"

	// Each entry is {section, key, value}; entries are applied in order.
	entries := [][3]string{
		{"Unit", "Description", "Trigger docker-healthcheck periodically"},
		{"Unit", "Documentation", "https://kops.sigs.k8s.io"},
		{"Timer", "OnUnitInactiveSec", "10s"},
		{"Timer", "Unit", "docker-healthcheck.service"},
		{"Install", "WantedBy", "multi-user.target"},
	}

	unit := &systemd.Manifest{}
	for _, e := range entries {
		unit.Set(e[0], e[1], e[2])
	}

	rendered := unit.Render()
	klog.V(8).Infof("Built timer manifest %q\n%s", timerName, rendered)

	task := &nodetasks.Service{
		Name:       timerName,
		Definition: s(rendered),
	}
	task.InitDefaults()
	return task
}
// buildContainerOSConfigurationDropIn is responsible for configuring the docker daemon options
func (b *DockerBuilder) buildContainerOSConfigurationDropIn(c *fi.ModelBuilderContext) error {
lines := []string{

View File

@ -3,8 +3,9 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = [
"containerd.go",
"docker.go",
"containerd_license.go",
"docker_healthcheck.go",
"docker_license.go",
],
importpath = "k8s.io/kops/nodeup/pkg/model/resources",
visibility = ["//visibility:public"],

View File

@ -1,6 +1,24 @@
#!/bin/bash
/*
Copyright 2019 The Kubernetes Authors.
# Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package resources
var DockerHealthCheck = `#!/bin/bash
# Copyright 2019 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -18,7 +36,7 @@
# of docker. If it detects a failure, it will restart docker using systemctl.
healthcheck() {
if output=`timeout 60 docker network ls`; then
if output=` + "`timeout 60 docker network ls`" + `; then
echo "$output" | fgrep -qw host || {
echo "docker 'host' network missing"
return 1
@ -47,7 +65,7 @@ echo "docker still unresponsive; triggering docker restart"
systemctl stop docker
echo "wait all tcp sockets to close"
sleep `cat /proc/sys/net/ipv4/tcp_fin_timeout`
sleep ` + "`cat /proc/sys/net/ipv4/tcp_fin_timeout`" + `
sleep 10
systemctl start docker
@ -61,3 +79,4 @@ if healthcheck; then
fi
echo "docker still failing"
`

View File

@ -4,6 +4,74 @@ contents: |-
path: /etc/sysconfig/docker
type: file
---
contents: |
#!/bin/bash
# Copyright 2019 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script is intended to be run periodically, to check the health
# of docker. If it detects a failure, it will restart docker using systemctl.
healthcheck() {
if output=`timeout 60 docker network ls`; then
echo "$output" | fgrep -qw host || {
echo "docker 'host' network missing"
return 1
}
else
echo "docker returned $?"
return 1
fi
}
if healthcheck; then
echo "docker healthy"
exit 0
fi
echo "docker failed"
echo "Giving docker 30 seconds grace before restarting"
sleep 30
if healthcheck; then
echo "docker recovered"
exit 0
fi
echo "docker still unresponsive; triggering docker restart"
systemctl stop docker
echo "wait all tcp sockets to close"
sleep `cat /proc/sys/net/ipv4/tcp_fin_timeout`
sleep 10
systemctl start docker
echo "Waiting 120 seconds to give docker time to start"
sleep 60
if healthcheck; then
echo "docker recovered"
exit 0
fi
echo "docker still failing"
mode: "0755"
path: /opt/kops/bin/docker-healthcheck
type: file
---
contents: |2
@ -217,6 +285,40 @@ preventStart: true
source: http://apt.dockerproject.org/repo/pool/main/d/docker-engine/docker-engine_1.12.1-0~xenial_amd64.deb
version: 1.12.1-0~xenial
---
Name: docker-healthcheck.service
definition: |
[Unit]
Description=Run docker-healthcheck once
Documentation=https://kops.sigs.k8s.io
[Service]
Type=oneshot
ExecStart=/opt/kops/bin/docker-healthcheck
[Install]
WantedBy=multi-user.target
enabled: true
manageState: true
running: true
smartRestart: true
---
Name: docker-healthcheck.timer
definition: |
[Unit]
Description=Trigger docker-healthcheck periodically
Documentation=https://kops.sigs.k8s.io
[Timer]
OnUnitInactiveSec=10s
Unit=docker-healthcheck.service
[Install]
WantedBy=multi-user.target
enabled: true
manageState: true
running: true
smartRestart: true
---
Name: docker.service
definition: |
[Unit]

View File

@ -4,6 +4,74 @@ contents: |-
path: /etc/sysconfig/docker
type: file
---
contents: |
#!/bin/bash
# Copyright 2019 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script is intended to be run periodically, to check the health
# of docker. If it detects a failure, it will restart docker using systemctl.
healthcheck() {
if output=`timeout 60 docker network ls`; then
echo "$output" | fgrep -qw host || {
echo "docker 'host' network missing"
return 1
}
else
echo "docker returned $?"
return 1
fi
}
if healthcheck; then
echo "docker healthy"
exit 0
fi
echo "docker failed"
echo "Giving docker 30 seconds grace before restarting"
sleep 30
if healthcheck; then
echo "docker recovered"
exit 0
fi
echo "docker still unresponsive; triggering docker restart"
systemctl stop docker
echo "wait all tcp sockets to close"
sleep `cat /proc/sys/net/ipv4/tcp_fin_timeout`
sleep 10
systemctl start docker
echo "Waiting 120 seconds to give docker time to start"
sleep 60
if healthcheck; then
echo "docker recovered"
exit 0
fi
echo "docker still failing"
mode: "0755"
path: /opt/kops/bin/docker-healthcheck
type: file
---
contents: |2
@ -217,6 +285,40 @@ preventStart: true
source: http://apt.dockerproject.org/repo/pool/main/d/docker-engine/docker-engine_1.12.3-0~xenial_amd64.deb
version: 1.12.3-0~xenial
---
Name: docker-healthcheck.service
definition: |
[Unit]
Description=Run docker-healthcheck once
Documentation=https://kops.sigs.k8s.io
[Service]
Type=oneshot
ExecStart=/opt/kops/bin/docker-healthcheck
[Install]
WantedBy=multi-user.target
enabled: true
manageState: true
running: true
smartRestart: true
---
Name: docker-healthcheck.timer
definition: |
[Unit]
Description=Trigger docker-healthcheck periodically
Documentation=https://kops.sigs.k8s.io
[Timer]
OnUnitInactiveSec=10s
Unit=docker-healthcheck.service
[Install]
WantedBy=multi-user.target
enabled: true
manageState: true
running: true
smartRestart: true
---
Name: docker.service
definition: |
[Unit]
@ -236,7 +338,6 @@ definition: |
LimitNOFILE=1048576
LimitNPROC=1048576
LimitCORE=infinity
TasksMax=infinity
Restart=always
RestartSec=2s
StartLimitInterval=0

View File

@ -4,6 +4,74 @@ contents: |-
path: /etc/sysconfig/docker
type: file
---
contents: |
#!/bin/bash
# Copyright 2019 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script is intended to be run periodically, to check the health
# of docker. If it detects a failure, it will restart docker using systemctl.
healthcheck() {
if output=`timeout 60 docker network ls`; then
echo "$output" | fgrep -qw host || {
echo "docker 'host' network missing"
return 1
}
else
echo "docker returned $?"
return 1
fi
}
if healthcheck; then
echo "docker healthy"
exit 0
fi
echo "docker failed"
echo "Giving docker 30 seconds grace before restarting"
sleep 30
if healthcheck; then
echo "docker recovered"
exit 0
fi
echo "docker still unresponsive; triggering docker restart"
systemctl stop docker
echo "wait all tcp sockets to close"
sleep `cat /proc/sys/net/ipv4/tcp_fin_timeout`
sleep 10
systemctl start docker
echo "Waiting 120 seconds to give docker time to start"
sleep 60
if healthcheck; then
echo "docker recovered"
exit 0
fi
echo "docker still failing"
mode: "0755"
path: /opt/kops/bin/docker-healthcheck
type: file
---
contents: |2
@ -217,6 +285,40 @@ preventStart: true
source: http://apt.dockerproject.org/repo/pool/main/d/docker-engine/docker-engine_1.12.3-0~xenial_amd64.deb
version: 1.12.3-0~xenial
---
Name: docker-healthcheck.service
definition: |
[Unit]
Description=Run docker-healthcheck once
Documentation=https://kops.sigs.k8s.io
[Service]
Type=oneshot
ExecStart=/opt/kops/bin/docker-healthcheck
[Install]
WantedBy=multi-user.target
enabled: true
manageState: true
running: true
smartRestart: true
---
Name: docker-healthcheck.timer
definition: |
[Unit]
Description=Trigger docker-healthcheck periodically
Documentation=https://kops.sigs.k8s.io
[Timer]
OnUnitInactiveSec=10s
Unit=docker-healthcheck.service
[Install]
WantedBy=multi-user.target
enabled: true
manageState: true
running: true
smartRestart: true
---
Name: docker.service
definition: |
[Unit]

View File

@ -1,9 +0,0 @@
[Unit]
Description=Run docker-healthcheck once
[Service]
Type=oneshot
ExecStart=/opt/kubernetes/helpers/docker-healthcheck
[Install]
WantedBy=multi-user.target

View File

@ -1,9 +0,0 @@
[Unit]
Description=Trigger docker-healthcheck periodically
[Timer]
OnUnitInactiveSec=10s
Unit=docker-healthcheck.service
[Install]
WantedBy=multi-user.target