k8systemd: run k8s workloads in systemd

Support running `podman play kube` in systemd by exploiting the
previously added "service containers".  During `play kube`, a service
container is started before all the pods and containers, and is stopped
last.  The service container communicates its conmon PID via sdnotify.

Add a new systemd template to dispatch such k8s workloads.  The argument
of the template is the path to the k8s file.  Note that the path must be
escaped for systemd not to bark:

Let's assume we have a `top.yaml` file in the home directory:
```
$ escaped=$(systemd-escape ~/top.yaml)
$ systemctl --user start podman-play-kube@$escaped.service
```
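
Once dispatched, the instance behaves like any other systemd service; stopping it triggers the unit's `ExecStop`, i.e. `podman play kube --down` on the same file. A minimal sketch for checking on and tearing down the example above (the `name=-service` filter is just a convenient way to spot the generated service container):
```
$ systemctl --user status podman-play-kube@$escaped.service
$ podman ps --filter "name=-service"
$ systemctl --user stop podman-play-kube@$escaped.service
```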

Closes: https://issues.redhat.com/browse/RUN-1287
Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
Valentin Rothberg, 2022-05-11 15:02:06 +02:00
commit 8684d41e38 (parent eb26fa45f1)
14 changed files with 248 additions and 30 deletions

.gitignore

@@ -4,6 +4,7 @@
/build/
/conmon/
contrib/spec/podman.spec
contrib/systemd/*/*.service
*.coverprofile
coverprofile
/.coverage


@@ -832,7 +832,8 @@ install.docker-full: install.docker install.docker-docs
ifneq (,$(findstring systemd,$(BUILDTAGS)))
PODMAN_UNIT_FILES = contrib/systemd/auto-update/podman-auto-update.service \
contrib/systemd/system/podman.service \
contrib/systemd/system/podman-restart.service
contrib/systemd/system/podman-restart.service \
contrib/systemd/system/podman-play-kube@.service
%.service: %.service.in
sed -e 's;@@PODMAN@@;$(BINDIR)/podman;g' $< >$@.tmp.$$ \
@@ -846,12 +847,14 @@ install.systemd: $(PODMAN_UNIT_FILES)
install ${SELINUXOPT} -m 644 contrib/systemd/system/podman.socket ${DESTDIR}${USERSYSTEMDDIR}/podman.socket
install ${SELINUXOPT} -m 644 contrib/systemd/system/podman.service ${DESTDIR}${USERSYSTEMDDIR}/podman.service
install ${SELINUXOPT} -m 644 contrib/systemd/system/podman-restart.service ${DESTDIR}${USERSYSTEMDDIR}/podman-restart.service
install ${SELINUXOPT} -m 644 contrib/systemd/system/podman-play-kube@.service ${DESTDIR}${USERSYSTEMDDIR}/podman-play-kube@.service
# System services
install ${SELINUXOPT} -m 644 contrib/systemd/auto-update/podman-auto-update.service ${DESTDIR}${SYSTEMDDIR}/podman-auto-update.service
install ${SELINUXOPT} -m 644 contrib/systemd/auto-update/podman-auto-update.timer ${DESTDIR}${SYSTEMDDIR}/podman-auto-update.timer
install ${SELINUXOPT} -m 644 contrib/systemd/system/podman.socket ${DESTDIR}${SYSTEMDDIR}/podman.socket
install ${SELINUXOPT} -m 644 contrib/systemd/system/podman.service ${DESTDIR}${SYSTEMDDIR}/podman.service
install ${SELINUXOPT} -m 644 contrib/systemd/system/podman-restart.service ${DESTDIR}${SYSTEMDDIR}/podman-restart.service
install ${SELINUXOPT} -m 644 contrib/systemd/system/podman-play-kube@.service ${DESTDIR}${SYSTEMDDIR}/podman-play-kube@.service
rm -f $(PODMAN_UNIT_FILES)
else
install.systemd:


@@ -0,0 +1,18 @@
[Unit]
Description=A template for running K8s workloads via podman-play-kube
Documentation=man:podman-play-kube(1)
Wants=network-online.target
After=network-online.target
RequiresMountsFor=%t/containers
[Service]
Environment=PODMAN_SYSTEMD_UNIT=%n
Restart=no
TimeoutStopSec=70
ExecStart=@@PODMAN@@ play kube --replace --service-container=true %I
ExecStop=@@PODMAN@@ play kube --down %I
Type=notify
NotifyAccess=all
[Install]
WantedBy=default.target
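
The template leans on systemd's instance escaping: the escaped path after the `@` becomes the instance name, and `%I` expands to its unescaped form, i.e. the path handed to `podman play kube`. A small sketch of the round trip (the exact escaped string depends on the path):
```
$ escaped=$(systemd-escape ~/top.yaml)
$ systemd-escape --unescape "$escaped"   # prints the original path, which is what %I carries
```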


@@ -171,7 +171,7 @@ func (c *Container) getContainerInspectData(size bool, driverData *define.Driver
Mounts: inspectMounts,
Dependencies: c.Dependencies(),
IsInfra: c.IsInfra(),
IsService: c.isService(),
IsService: c.IsService(),
}
if c.state.ConfigPath != "" {


@@ -31,7 +31,7 @@ func (c *Container) validate() error {
// A container cannot be marked as an infra and service container at
// the same time.
if c.IsInfra() && c.isService() {
if c.IsInfra() && c.IsService() {
return fmt.Errorf("cannot be infra and service container at the same time: %w", define.ErrInvalidArg)
}


@@ -644,6 +644,16 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo
return err
}
if c.IsService() {
canStop, err := c.canStopServiceContainer()
if err != nil {
return err
}
if !canStop {
return fmt.Errorf("container %s is the service container of pod(s) %s and cannot be removed without removing the pod(s)", c.ID(), strings.Join(c.state.Service.Pods, ","))
}
}
// If we're not force-removing, we need to check if we're in a good
// state to remove.
if !force {
@@ -903,6 +913,16 @@ func (r *Runtime) evictContainer(ctx context.Context, idOrName string, removeVol
}
}
if c.IsService() {
canStop, err := c.canStopServiceContainer()
if err != nil {
return id, err
}
if !canStop {
return id, fmt.Errorf("container %s is the service container of pod(s) %s and cannot be removed without removing the pod(s)", c.ID(), strings.Join(c.state.Service.Pods, ","))
}
}
var cleanupErr error
// Remove the container from the state
if c.config.Pod != "" {
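
The net effect of the two checks above, mirrored by the system tests further down: removing a service container directly is refused with the error constructed here, while removing the pod(s) it serves takes the service container down with them. Roughly, with names borrowed from the tests for illustration:
```
$ podman container rm 352a88685060-service
Error: container ... is the service container of pod(s) ... and cannot be removed without removing the pod(s)
$ podman pod rm test_pod   # removes the pod and, with it, the service container
```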


@@ -54,11 +54,12 @@ func (c *Container) addServicePodLocked(id string) error {
return c.save()
}
func (c *Container) isService() bool {
// IsService returns true when the container is a "service container".
func (c *Container) IsService() bool {
return c.config.IsService
}
// canStopServiceContainer returns true if all pods of the service are stopped.
// canStopServiceContainerLocked returns true if all pods of the service are stopped.
// Note that the method acquires the container lock.
func (c *Container) canStopServiceContainerLocked() (bool, error) {
c.lock.Lock()
@@ -67,10 +68,16 @@ func (c *Container) canStopServiceContainerLocked() (bool, error) {
return false, err
}
if !c.isService() {
if !c.IsService() {
return false, fmt.Errorf("internal error: checking service: container %s is not a service container", c.ID())
}
return c.canStopServiceContainer()
}
// canStopServiceContainer returns true if all pods of the service are stopped.
// Note that the method expects the container to be locked.
func (c *Container) canStopServiceContainer() (bool, error) {
for _, id := range c.state.Service.Pods {
pod, err := c.runtime.LookupPod(id)
if err != nil {
@@ -163,7 +170,7 @@ func (c *Container) canRemoveServiceContainerLocked() (bool, error) {
return false, err
}
if !c.isService() {
if !c.IsService() {
return false, fmt.Errorf("internal error: checking service: container %s is not a service container", c.ID())
}
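
`canStopServiceContainer` carries the actual check, while `canStopServiceContainerLocked` wraps it for callers that do not yet hold the container lock. In practice the check means the service container only stops once every pod it serves has stopped, which the tests below exercise; roughly, with the same illustrative names:
```
$ podman pod stop test_pod                      # the last pod of the service exits ...
$ podman container inspect 352a88685060-service --format "{{.State.Running}}"
false                                           # ... so the service container stops as well
```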


@@ -292,7 +292,13 @@ func (ic *ContainerEngine) removeContainer(ctx context.Context, ctr *libpod.Cont
logrus.Debugf("Failed to remove container %s: %s", ctr.ID(), err.Error())
switch errors.Cause(err) {
case define.ErrNoSuchCtr:
if options.Ignore {
// Ignore if the container does not exist (anymore) when either
// it has been requested by the user or if the container is a
// service one. Service containers are removed along with their
// pods, which in turn are removed along with their infra
// container. Hence, there is an inherent race when removing
// infra and service containers in parallel.
if options.Ignore || ctr.IsService() {
logrus.Debugf("Ignoring error (--allow-missing): %v", err)
return nil
}


@@ -37,7 +37,15 @@ import (
// createServiceContainer creates a container that can later on
// be associated with the pods of a K8s yaml. It will be started along with
// the first pod.
func (ic *ContainerEngine) createServiceContainer(ctx context.Context, name string) (*libpod.Container, error) {
func (ic *ContainerEngine) createServiceContainer(ctx context.Context, name string, options entities.PlayKubeOptions) (*libpod.Container, error) {
// Make sure to replace the service container as well if requested by
// the user.
if options.Replace {
if _, err := ic.ContainerRm(ctx, []string{name}, entities.RmOptions{Force: true, Ignore: true}); err != nil {
return nil, fmt.Errorf("replacing service container: %w", err)
}
}
// Similar to infra containers, a service container is using the pause image.
image, err := generate.PullOrBuildInfraImage(ic.Libpod, "")
if err != nil {
@@ -65,6 +73,7 @@ func (ic *ContainerEngine) createServiceContainer(ctx context.Context, name stri
return nil, fmt.Errorf("creating runtime spec for service container: %w", err)
}
opts = append(opts, libpod.WithIsService())
opts = append(opts, libpod.WithSdNotifyMode(define.SdNotifyModeConmon))
// Create a new libpod container based on the spec.
ctr, err := ic.Libpod.NewContainer(ctx, runtimeSpec, spec, false, opts...)
@@ -75,6 +84,17 @@ func (ic *ContainerEngine) createServiceContainer(ctx context.Context, name stri
return ctr, nil
}
// Creates the name for a service container based on the provided content of a
// K8s yaml file.
func serviceContainerName(content []byte) string {
// The name of the service container is the first 12
// characters of the yaml file's hash followed by the
// '-service' suffix to guarantee a predictable and
// discoverable name.
hash := digest.FromBytes(content).Encoded()
return hash[0:12] + "-service"
}
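
Since the name is a pure function of the YAML content, it can be recomputed outside of podman, which is essentially what the system tests below do to locate the container. A sketch, assuming the `top.yaml` from the commit message:
```
$ yaml_sha=$(sha256sum ~/top.yaml | cut -d' ' -f1)
$ podman container inspect "${yaml_sha:0:12}-service" --format "{{.State.Running}}"
```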
func (ic *ContainerEngine) PlayKube(ctx context.Context, body io.Reader, options entities.PlayKubeOptions) (_ *entities.PlayKubeReport, finalErr error) {
report := &entities.PlayKubeReport{}
validKinds := 0
@@ -112,12 +132,7 @@ func (ic *ContainerEngine) PlayKube(ctx context.Context, body io.Reader, options
// TODO: create constants for the various "kinds" of yaml files.
var serviceContainer *libpod.Container
if options.ServiceContainer && (kind == "Pod" || kind == "Deployment") {
// The name of the service container is the first 12
// characters of the yaml file's hash followed by the
// '-service' suffix to guarantee a predictable and
// discoverable name.
hash := digest.FromBytes(content).Encoded()
ctr, err := ic.createServiceContainer(ctx, hash[0:12]+"-service")
ctr, err := ic.createServiceContainer(ctx, serviceContainerName(content), options)
if err != nil {
return nil, err
}
@@ -433,6 +448,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
podSpec.PodSpecGen.NoInfra = false
podSpec.PodSpecGen.InfraContainerSpec = specgen.NewSpecGenerator(infraImage, false)
podSpec.PodSpecGen.InfraContainerSpec.NetworkOptions = p.NetworkOptions
podSpec.PodSpecGen.InfraContainerSpec.SdNotifyMode = define.SdNotifyModeIgnore
err = specgenutil.FillOutSpecGen(podSpec.PodSpecGen.InfraContainerSpec, &infraOptions, []string{})
if err != nil {
@@ -516,10 +532,12 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
if err != nil {
return nil, err
}
specGen.SdNotifyMode = define.SdNotifyModeIgnore
rtSpec, spec, opts, err := generate.MakeContainer(ctx, ic.Libpod, specGen, false, nil)
if err != nil {
return nil, err
}
opts = append(opts, libpod.WithSdNotifyMode(define.SdNotifyModeIgnore))
ctr, err := generate.ExecuteCreate(ctx, ic.Libpod, rtSpec, spec, false, opts...)
if err != nil {
return nil, err
@@ -570,6 +588,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
if err != nil {
return nil, err
}
opts = append(opts, libpod.WithSdNotifyMode(define.SdNotifyModeIgnore))
ctr, err := generate.ExecuteCreate(ctx, ic.Libpod, rtSpec, spec, false, opts...)
if err != nil {
return nil, err
@@ -942,5 +961,6 @@ func (ic *ContainerEngine) PlayKubeDown(ctx context.Context, body io.Reader, _ e
if err != nil {
return nil, err
}
return reports, nil
}
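
With the `--replace` handling above, re-dispatching the same YAML is idempotent: an existing service container, along with the workload previously created from the file, is replaced instead of tripping over a name clash, which is what the template's `ExecStart`/`ExecStop` lines rely on. A sketch mirroring the unit:
```
$ podman play kube --replace --service-container=true ~/top.yaml
$ podman play kube --replace --service-container=true ~/top.yaml   # safe to repeat
$ podman play kube --down ~/top.yaml                               # tear everything down
```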


@@ -242,11 +242,13 @@ done
%{_unitdir}/%{name}.service
%{_unitdir}/%{name}.socket
%{_unitdir}/%{name}-restart.service
%{_unitdir}/%{name}-play-kube@.service
%{_userunitdir}/%{name}-auto-update.service
%{_userunitdir}/%{name}-auto-update.timer
%{_userunitdir}/%{name}.service
%{_userunitdir}/%{name}.socket
%{_userunitdir}/%{name}-restart.service
%{_userunitdir}/%{name}-play-kube@.service
%{_tmpfilesdir}/%{name}.conf
%if 0%{?fedora} >= 36
%{_modulesloaddir}/%{name}-iptables.conf


@@ -292,4 +292,80 @@ LISTEN_FDNAMES=listen_fdnames" | sort)
run_podman network rm -f $netname
}
@test "podman-play-kube@.service template" {
skip_if_remote "systemd units do not work with remote clients"
# If running from a podman source directory, build and use the source
# version of the podman-play-kube@ unit file
unit_name="podman-play-kube@.service"
unit_file="contrib/systemd/system/${unit_name}"
if [[ -e ${unit_file}.in ]]; then
echo "# [Building & using $unit_name from source]" >&3
BINDIR=$(dirname $PODMAN) make $unit_file
cp $unit_file $UNIT_DIR/$unit_name
fi
# Create the YAML file
yaml_source="$PODMAN_TMPDIR/test.yaml"
cat >$yaml_source <<EOF
apiVersion: v1
kind: Pod
metadata:
labels:
app: test
name: test_pod
spec:
containers:
- command:
- top
image: $IMAGE
name: test
resources: {}
EOF
# Dispatch the YAML file
service_name="podman-play-kube@$(systemd-escape $yaml_source).service"
systemctl start $service_name
systemctl is-active $service_name
# The name of the service container is predictable: the first 12 characters
# of the hash of the YAML file followed by the "-service" suffix
yaml_sha=$(sha256sum $yaml_source)
service_container="${yaml_sha:0:12}-service"
# Make sure that the service container exists and runs.
run_podman container inspect $service_container --format "{{.State.Running}}"
is "$output" "true"
# Check for an error when trying to remove the service container
run_podman 125 container rm $service_container
is "$output" "Error: container .* is the service container of pod(s) .* and cannot be removed without removing the pod(s)"
# Kill the pod and make sure the service is not running.
# The restart policy is set to "no" since there is no
# design yet for propagating exit codes up to the service
# container.
run_podman pod kill test_pod
for i in {0..5}; do
run systemctl is-failed $service_name
if [[ $output == "failed" ]]; then
break
fi
sleep 0.5
done
is "$output" "failed" "systemd service transitioned to 'failed' state"
# Now stop and start the service again.
systemctl stop $service_name
systemctl start $service_name
systemctl is-active $service_name
run_podman container inspect $service_container --format "{{.State.Running}}"
is "$output" "true"
# Clean up
systemctl stop $service_name
run_podman 1 container exists $service_container
run_podman 1 pod exists test_pod
}
# vim: filetype=sh


@@ -172,4 +172,52 @@ READY=1" "sdnotify sent MAINPID and READY"
_stop_socat
}
@test "sdnotify : play kube" {
# Create the YAML file
yaml_source="$PODMAN_TMPDIR/test.yaml"
cat >$yaml_source <<EOF
apiVersion: v1
kind: Pod
metadata:
labels:
app: test
name: test_pod
spec:
containers:
- command:
- top
image: $IMAGE
name: test
resources: {}
EOF
# The name of the service container is predictable: the first 12 characters
# of the hash of the YAML file followed by the "-service" suffix
yaml_sha=$(sha256sum $yaml_source)
service_container="${yaml_sha:0:12}-service"
export NOTIFY_SOCKET=$PODMAN_TMPDIR/conmon.sock
_start_socat
run_podman play kube --service-container=true $yaml_source
run_podman container inspect $service_container --format "{{.State.ConmonPid}}"
mainPID="$output"
# The 'echo's help us debug failed runs
run cat $_SOCAT_LOG
echo "socat log:"
echo "$output"
is "$output" "MAINPID=$mainPID
READY=1" "sdnotify sent MAINPID and READY"
_stop_socat
# Clean up pod and pause image
run_podman play kube --down $PODMAN_TMPDIR/test.yaml
run_podman version --format "{{.Server.Version}}-{{.Server.Built}}"
podman rmi -f localhost/podman-pause:$output
}
# vim: filetype=sh


@@ -103,10 +103,9 @@ RELABEL="system_u:object_r:container_file_t:s0"
@test "podman play --service-container" {
skip_if_remote "service containers only work locally"
TESTDIR=$PODMAN_TMPDIR/testdir
mkdir -p $TESTDIR
yaml="
# Create the YAML file
yaml_source="$PODMAN_TMPDIR/test.yaml"
cat >$yaml_source <<EOF
apiVersion: v1
kind: Pod
metadata:
@@ -120,13 +119,16 @@ spec:
image: $IMAGE
name: test
resources: {}
"
EOF
run_podman play kube --service-container=true $yaml_source
echo "$yaml" > $PODMAN_TMPDIR/test.yaml
run_podman play kube --service-container=true $PODMAN_TMPDIR/test.yaml
# The name of the service container is predictable: the first 12 characters
# of the hash of the YAML file followed by the "-service" suffix
yaml_sha=$(sha256sum $yaml_source)
service_container="${yaml_sha:0:12}-service"
# Make sure that the service container exists and runs.
run_podman container inspect "352a88685060-service" --format "{{.State.Running}}"
run_podman container inspect $service_container --format "{{.State.Running}}"
is "$output" "true"
# Stop the *main* container and make sure that
@@ -135,24 +137,26 @@ spec:
# #) The service container is marked as a service container
run_podman stop test_pod-test
_ensure_pod_state test_pod Exited
run_podman container inspect "352a88685060-service" --format "{{.State.Running}}"
is "$output" "false"
run_podman container inspect "352a88685060-service" --format "{{.IsService}}"
_ensure_container_running $service_container false
run_podman container inspect $service_container --format "{{.IsService}}"
is "$output" "true"
# Restart the pod, make sure the service is running again
run_podman pod restart test_pod
run_podman container inspect "352a88685060-service" --format "{{.State.Running}}"
run_podman container inspect $service_container --format "{{.State.Running}}"
is "$output" "true"
# Check for an error when trying to remove the service container
run_podman 125 container rm $service_container
is "$output" "Error: container .* is the service container of pod(s) .* and cannot be removed without removing the pod(s)"
# Kill the pod and make sure the service is not running
run_podman pod kill test_pod
run_podman container inspect "352a88685060-service" --format "{{.State.Running}}"
is "$output" "false"
_ensure_container_running $service_container false
# Remove the pod and make sure the service is removed along with it
run_podman pod rm test_pod
run_podman 1 container exists "352a88685060-service"
run_podman 1 container exists $service_container
}
@test "podman play --network" {


@@ -405,6 +405,19 @@ function _ensure_pod_state() {
is "$output" "$2" "unexpected pod state"
}
# Wait for the container's (1st arg) running state (2nd arg)
function _ensure_container_running() {
for i in {0..5}; do
run_podman container inspect $1 --format "{{.State.Running}}"
if [[ $output == "$2" ]]; then
break
fi
sleep 0.5
done
is "$output" "$2" "unexpected pod state"
}
###########################
# _add_label_if_missing # make sure skip messages include rootless/remote
###########################