kube sdnotify: run proxies for the lifespan of the service

As outlined in #16076, a subsequent BARRIER *may* follow the READY
message sent by a container.  To correctly imitate the behavior of
systemd's NOTIFY_SOCKET, the notify proxies spun up by `kube play` must
hence process messages for the entirety of the workload.
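
(Illustrative sketch, not part of this change: the client side of such
a barrier, along the lines of systemd's sd_notify_barrier(3), sends
"BARRIER=1" plus the write end of a pipe over NOTIFY_SOCKET and then
blocks until the receiver closes the passed fd.  The snippet assumes
golang.org/x/sys/unix.)

    import (
        "net"
        "os"

        "golang.org/x/sys/unix"
    )

    // barrier mimics sd_notify_barrier(3): the notification is only
    // "done" once the receiver closes the fd we passed along.
    func barrier(notifySocket string) error {
        conn, err := net.DialUnix("unixgram", nil,
            &net.UnixAddr{Name: notifySocket, Net: "unixgram"})
        if err != nil {
            return err
        }
        defer conn.Close()

        r, w, err := os.Pipe()
        if err != nil {
            return err
        }
        defer r.Close()

        // Pass the pipe's write end as SCM_RIGHTS ancillary data.
        oob := unix.UnixRights(int(w.Fd()))
        if _, _, err := conn.WriteMsgUnix([]byte("BARRIER=1"), oob, nil); err != nil {
            return err
        }
        w.Close() // the receiver now holds the only write end

        // POLLHUP fires once the receiver closes its copy of the fd.
        fds := []unix.PollFd{{Fd: int32(r.Fd()), Events: unix.POLLHUP}}
        _, err = unix.Poll(fds, -1)
        return err
    }

A proxy that stops reading after READY would never close that fd, so
the sending container would hang until its timeout -- the failure mode
described in #16076.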

We know that the workload is done and that all containers and pods have
exited when the service container exits.  Hence, all proxies are closed
at that time.
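
(Sketch of the resulting lifecycle, not the literal code:)

    // Proxies are created per container and closed only after the
    // service container -- and hence the entire workload -- has exited.
    if _, err := serviceContainer.Wait(ctx); err != nil {
        return fmt.Errorf("waiting for service container: %w", err)
    }
    for _, proxy := range notifyProxies {
        if err := proxy.Close(); err != nil {
            logrus.Errorf("Closing notify proxy %q: %v", proxy.SocketPath(), err)
        }
    }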

The above changes imply that Podman runs for the entirety of the
workload and will henceforth act as the MAINPID when running inside of
systemd.  Prior to this change, the service container acted as the
MAINPID, which is no longer possible: Podman would be killed
immediately on exit of the service container and could not clean up.
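
(Sketch of the new notification; the commit routes it through
notifyproxy.SendMessage, shown here with the go-systemd daemon package
instead:)

    import (
        "fmt"
        "os"

        "github.com/coreos/go-systemd/v22/daemon"
    )

    // Podman reports its own PID as MAINPID and declares readiness in a
    // single datagram; previously MAINPID was the conmon PID obtained by
    // inspecting the service container.
    func notifyReady() error {
        message := fmt.Sprintf("MAINPID=%d\n%s", os.Getpid(), daemon.SdNotifyReady)
        sent, err := daemon.SdNotify(false, message)
        if err != nil {
            return err
        }
        if !sent {
            return fmt.Errorf("NOTIFY_SOCKET is not set")
        }
        return nil
    }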

The kube template now correctly transitions to inactive instead of
failed in systemd.
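
(For reference, the relevant directives of the podman-kube@.service
template look roughly as follows; the exact content varies by version.
With Type=notify and no RemainAfterExit, the unit turns inactive once
the MAINPID exits cleanly:)

    [Service]
    Type=notify
    NotifyAccess=all
    ExecStart=/usr/bin/podman play kube --replace --service-container=true %I
    ExecStop=/usr/bin/podman play kube --down %I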

Fixes: #16076
Fixes: #16515
Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
Valentin Rothberg 2022-12-02 12:51:30 +01:00
parent 8c3af71862
commit 4fa307f149
6 changed files with 211 additions and 114 deletions


@@ -155,7 +155,25 @@ func (ic *ContainerEngine) PlayKube(ctx context.Context, body io.Reader, options
 	var configMaps []v1.ConfigMap
 	ranContainers := false

+	// FIXME: both the service container and the proxies should ideally
+	// be _state_ of an object. The kube code below is quite spaghetti
+	// code which we should refactor at some point to make it easier to
+	// extend (via shared state instead of passing data around) and to
+	// make it more maintainable long term.
 	var serviceContainer *libpod.Container
+	var notifyProxies []*notifyproxy.NotifyProxy
+	defer func() {
+		// Close the notify proxies on return.  At that point we know
+		// that a) all containers have sent their READY message and
+		// that b) the service container has exited (and hence all
+		// containers).
+		for _, proxy := range notifyProxies {
+			if err := proxy.Close(); err != nil {
+				logrus.Errorf("Closing notify proxy %q: %v", proxy.SocketPath(), err)
+			}
+		}
+	}()
+
 	// create pod on each document if it is a pod or deployment
 	// any other kube kind will be skipped
 	for _, document := range documentList {
@@ -205,10 +223,11 @@ func (ic *ContainerEngine) PlayKube(ctx context.Context, body io.Reader, options
 				podYAML.Annotations[name] = val
 			}

-			r, err := ic.playKubePod(ctx, podTemplateSpec.ObjectMeta.Name, &podTemplateSpec, options, &ipIndex, podYAML.Annotations, configMaps, serviceContainer)
+			r, proxies, err := ic.playKubePod(ctx, podTemplateSpec.ObjectMeta.Name, &podTemplateSpec, options, &ipIndex, podYAML.Annotations, configMaps, serviceContainer)
 			if err != nil {
 				return nil, err
 			}
+			notifyProxies = append(notifyProxies, proxies...)

 			report.Pods = append(report.Pods, r.Pods...)
 			validKinds++
@@ -220,10 +239,11 @@ func (ic *ContainerEngine) PlayKube(ctx context.Context, body io.Reader, options
 				return nil, fmt.Errorf("unable to read YAML as Kube Deployment: %w", err)
 			}

-			r, err := ic.playKubeDeployment(ctx, &deploymentYAML, options, &ipIndex, configMaps, serviceContainer)
+			r, proxies, err := ic.playKubeDeployment(ctx, &deploymentYAML, options, &ipIndex, configMaps, serviceContainer)
 			if err != nil {
 				return nil, err
 			}
+			notifyProxies = append(notifyProxies, proxies...)

 			report.Pods = append(report.Pods, r.Pods...)
 			validKinds++
@@ -289,23 +309,20 @@ func (ic *ContainerEngine) PlayKube(ctx context.Context, body io.Reader, options
 	}

 	if options.ServiceContainer && ranContainers {
-		// We can consider the service to be up and running now.
-		// Send the sd-notify messages pointing systemd to the
-		// service container.
-		data, err := serviceContainer.Inspect(false)
-		if err != nil {
-			return nil, err
-		}
-		message := fmt.Sprintf("MAINPID=%d\n%s", data.State.ConmonPid, daemon.SdNotifyReady)
+		message := fmt.Sprintf("MAINPID=%d\n%s", os.Getpid(), daemon.SdNotifyReady)
 		if err := notifyproxy.SendMessage("", message); err != nil {
 			return nil, err
 		}
+
+		if _, err := serviceContainer.Wait(ctx); err != nil {
+			return nil, fmt.Errorf("waiting for service container: %w", err)
+		}
 	}

 	return report, nil
 }

-func (ic *ContainerEngine) playKubeDeployment(ctx context.Context, deploymentYAML *v1apps.Deployment, options entities.PlayKubeOptions, ipIndex *int, configMaps []v1.ConfigMap, serviceContainer *libpod.Container) (*entities.PlayKubeReport, error) {
+func (ic *ContainerEngine) playKubeDeployment(ctx context.Context, deploymentYAML *v1apps.Deployment, options entities.PlayKubeOptions, ipIndex *int, configMaps []v1.ConfigMap, serviceContainer *libpod.Container) (*entities.PlayKubeReport, []*notifyproxy.NotifyProxy, error) {
 	var (
 		deploymentName string
 		podSpec        v1.PodTemplateSpec
@@ -316,7 +333,7 @@ func (ic *ContainerEngine) playKubeDeployment(ctx context.Context, deploymentYAM
 	deploymentName = deploymentYAML.ObjectMeta.Name
 	if deploymentName == "" {
-		return nil, errors.New("deployment does not have a name")
+		return nil, nil, errors.New("deployment does not have a name")
 	}

 	numReplicas = 1
 	if deploymentYAML.Spec.Replicas != nil {
@@ -325,18 +342,20 @@ func (ic *ContainerEngine) playKubeDeploymentYAM
 	podSpec = deploymentYAML.Spec.Template

 	// create "replicas" number of pods
+	var notifyProxies []*notifyproxy.NotifyProxy
 	for i = 0; i < numReplicas; i++ {
 		podName := fmt.Sprintf("%s-pod-%d", deploymentName, i)
-		podReport, err := ic.playKubePod(ctx, podName, &podSpec, options, ipIndex, deploymentYAML.Annotations, configMaps, serviceContainer)
+		podReport, proxies, err := ic.playKubePod(ctx, podName, &podSpec, options, ipIndex, deploymentYAML.Annotations, configMaps, serviceContainer)
 		if err != nil {
-			return nil, fmt.Errorf("encountered while bringing up pod %s: %w", podName, err)
+			return nil, notifyProxies, fmt.Errorf("encountered while bringing up pod %s: %w", podName, err)
 		}
 		report.Pods = append(report.Pods, podReport.Pods...)
+		notifyProxies = append(notifyProxies, proxies...)
 	}

-	return &report, nil
+	return &report, notifyProxies, nil
 }

-func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podYAML *v1.PodTemplateSpec, options entities.PlayKubeOptions, ipIndex *int, annotations map[string]string, configMaps []v1.ConfigMap, serviceContainer *libpod.Container) (*entities.PlayKubeReport, error) {
+func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podYAML *v1.PodTemplateSpec, options entities.PlayKubeOptions, ipIndex *int, annotations map[string]string, configMaps []v1.ConfigMap, serviceContainer *libpod.Container) (*entities.PlayKubeReport, []*notifyproxy.NotifyProxy, error) {
 	var (
 		writer      io.Writer
 		playKubePod entities.PlayKubePod
@@ -345,18 +364,18 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	mainSdNotifyMode, err := getSdNotifyMode(annotations, "")
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}

 	// Create the secret manager before hand
 	secretsManager, err := ic.Libpod.SecretsManager()
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}

 	// Assert the pod has a name
 	if podName == "" {
-		return nil, fmt.Errorf("pod does not have a name")
+		return nil, nil, fmt.Errorf("pod does not have a name")
 	}

 	podOpt := entities.PodCreateOptions{
@@ -366,7 +385,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	}
 	podOpt, err = kube.ToPodOpt(ctx, podName, podOpt, podYAML)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}

 	// add kube default network if no network is explicitly added
@@ -384,7 +403,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		ns, networks, netOpts, err := specgen.ParseNetworkFlag(options.Networks, pastaNetworkNameExists)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}

 		podOpt.Net.Network = ns
@@ -404,16 +423,16 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	// Validate the userns modes supported.
 	podOpt.Userns, err = specgen.ParseUserNamespace(options.Userns)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}

 	// FIXME This is very hard to support properly with a good ux
 	if len(options.StaticIPs) > *ipIndex {
 		if !podOpt.Net.Network.IsBridge() {
-			return nil, fmt.Errorf("static ip addresses can only be set when the network mode is bridge: %w", define.ErrInvalidArg)
+			return nil, nil, fmt.Errorf("static ip addresses can only be set when the network mode is bridge: %w", define.ErrInvalidArg)
 		}
 		if len(podOpt.Net.Networks) != 1 {
-			return nil, fmt.Errorf("cannot set static ip addresses for more than network, use netname:ip=<ip> syntax to specify ips for more than network: %w", define.ErrInvalidArg)
+			return nil, nil, fmt.Errorf("cannot set static ip addresses for more than network, use netname:ip=<ip> syntax to specify ips for more than network: %w", define.ErrInvalidArg)
 		}
 		for name, netOpts := range podOpt.Net.Networks {
 			netOpts.StaticIPs = append(netOpts.StaticIPs, options.StaticIPs[*ipIndex])
@@ -425,10 +444,10 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	}

 	if len(options.StaticMACs) > *ipIndex {
 		if !podOpt.Net.Network.IsBridge() {
-			return nil, fmt.Errorf("static mac address can only be set when the network mode is bridge: %w", define.ErrInvalidArg)
+			return nil, nil, fmt.Errorf("static mac address can only be set when the network mode is bridge: %w", define.ErrInvalidArg)
 		}
 		if len(podOpt.Net.Networks) != 1 {
-			return nil, fmt.Errorf("cannot set static mac address for more than network, use netname:mac=<mac> syntax to specify mac for more than network: %w", define.ErrInvalidArg)
+			return nil, nil, fmt.Errorf("cannot set static mac address for more than network, use netname:mac=<mac> syntax to specify mac for more than network: %w", define.ErrInvalidArg)
 		}
 		for name, netOpts := range podOpt.Net.Networks {
 			netOpts.StaticMAC = nettypes.HardwareAddr(options.StaticMACs[*ipIndex])
@@ -442,12 +461,12 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	p := specgen.NewPodSpecGenerator()
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}

 	p, err = entities.ToPodSpecGen(*p, &podOpt)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	podSpec := entities.PodSpec{PodSpecGen: *p}
@@ -458,17 +477,17 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	for _, p := range options.ConfigMaps {
 		f, err := os.Open(p)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		defer f.Close()

 		cm, err := readConfigMapFromFile(f)
 		if err != nil {
-			return nil, fmt.Errorf("%q: %w", p, err)
+			return nil, nil, fmt.Errorf("%q: %w", p, err)
 		}

 		if _, present := configMapIndex[cm.Name]; present {
-			return nil, fmt.Errorf("ambiguous configuration: the same config map %s is present in YAML and in --configmaps %s file", cm.Name, p)
+			return nil, nil, fmt.Errorf("ambiguous configuration: the same config map %s is present in YAML and in --configmaps %s file", cm.Name, p)
 		}

 		configMaps = append(configMaps, cm)
@@ -476,7 +495,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	volumes, err := kube.InitializeVolumes(podYAML.Spec.Volumes, configMaps, secretsManager)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}

 	// Go through the volumes and create a podman volume for all volumes that have been
@@ -490,27 +509,27 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 				// error out instead reuse the current volume.
 				vol, err = ic.Libpod.GetVolume(v.Source)
 				if err != nil {
-					return nil, fmt.Errorf("cannot re-use local volume for volume from configmap %q: %w", v.Source, err)
+					return nil, nil, fmt.Errorf("cannot re-use local volume for volume from configmap %q: %w", v.Source, err)
 				}
 			} else {
-				return nil, fmt.Errorf("cannot create a local volume for volume from configmap %q: %w", v.Source, err)
+				return nil, nil, fmt.Errorf("cannot create a local volume for volume from configmap %q: %w", v.Source, err)
 			}
 		}
 		mountPoint, err := vol.MountPoint()
 		if err != nil || mountPoint == "" {
-			return nil, fmt.Errorf("unable to get mountpoint of volume %q: %w", vol.Name(), err)
+			return nil, nil, fmt.Errorf("unable to get mountpoint of volume %q: %w", vol.Name(), err)
 		}
 		// Create files and add data to the volume mountpoint based on the Items in the volume
 		for k, v := range v.Items {
 			dataPath := filepath.Join(mountPoint, k)
 			f, err := os.Create(dataPath)
 			if err != nil {
-				return nil, fmt.Errorf("cannot create file %q at volume mountpoint %q: %w", k, mountPoint, err)
+				return nil, nil, fmt.Errorf("cannot create file %q at volume mountpoint %q: %w", k, mountPoint, err)
 			}
 			defer f.Close()
 			_, err = f.Write(v)
 			if err != nil {
-				return nil, err
+				return nil, nil, err
 			}
 		}
 	}
@@ -518,7 +537,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	seccompPaths, err := kube.InitializeSeccompPaths(podYAML.ObjectMeta.Annotations, options.SeccompProfileRoot)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}

 	var ctrRestartPolicy string
@@ -546,7 +565,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		err = specgenutil.FillOutSpecGen(podSpec.PodSpecGen.InfraContainerSpec, &infraOptions, []string{})
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 	}
@@ -557,12 +576,12 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	// Create the Pod
 	pod, err := generate.MakePod(&podSpec, ic.Libpod)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}

 	podInfraID, err := pod.InfraContainerID()
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}

 	if !options.Quiet {
@@ -578,7 +597,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	} else {
 		cwd, err = os.Getwd()
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 	}
@@ -586,16 +605,16 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	for _, initCtr := range podYAML.Spec.InitContainers {
 		// Error out if same name is used for more than one container
 		if _, ok := ctrNames[initCtr.Name]; ok {
-			return nil, fmt.Errorf("the pod %q is invalid; duplicate container name %q detected", podName, initCtr.Name)
+			return nil, nil, fmt.Errorf("the pod %q is invalid; duplicate container name %q detected", podName, initCtr.Name)
 		}
 		ctrNames[initCtr.Name] = ""
 		// Init containers cannot have either of lifecycle, livenessProbe, readinessProbe, or startupProbe set
 		if initCtr.Lifecycle != nil || initCtr.LivenessProbe != nil || initCtr.ReadinessProbe != nil || initCtr.StartupProbe != nil {
-			return nil, fmt.Errorf("cannot create an init container that has either of lifecycle, livenessProbe, readinessProbe, or startupProbe set")
+			return nil, nil, fmt.Errorf("cannot create an init container that has either of lifecycle, livenessProbe, readinessProbe, or startupProbe set")
 		}
 		pulledImage, labels, err := ic.getImageAndLabelInfo(ctx, cwd, annotations, writer, initCtr, options)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}

 		for k, v := range podSpec.PodSpecGen.Labels { // add podYAML labels
@@ -628,17 +647,17 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		}

 		specGen, err := kube.ToSpecGen(ctx, &specgenOpts)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		specGen.SdNotifyMode = define.SdNotifyModeIgnore
 		rtSpec, spec, opts, err := generate.MakeContainer(ctx, ic.Libpod, specGen, false, nil)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		opts = append(opts, libpod.WithSdNotifyMode(define.SdNotifyModeIgnore))
 		ctr, err := generate.ExecuteCreate(ctx, ic.Libpod, rtSpec, spec, false, opts...)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}

 		initContainers = append(initContainers, ctr)
@@ -649,12 +668,12 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	for _, container := range podYAML.Spec.Containers {
 		// Error out if the same name is used for more than one container
 		if _, ok := ctrNames[container.Name]; ok {
-			return nil, fmt.Errorf("the pod %q is invalid; duplicate container name %q detected", podName, container.Name)
+			return nil, nil, fmt.Errorf("the pod %q is invalid; duplicate container name %q detected", podName, container.Name)
 		}
 		ctrNames[container.Name] = ""
 		pulledImage, labels, err := ic.getImageAndLabelInfo(ctx, cwd, annotations, writer, container, options)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}

 		for k, v := range podSpec.PodSpecGen.Labels { // add podYAML labels
@@ -683,18 +702,18 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		specGen, err := kube.ToSpecGen(ctx, &specgenOpts)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		specGen.RawImageName = container.Image
 		rtSpec, spec, opts, err := generate.MakeContainer(ctx, ic.Libpod, specGen, false, nil)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}

 		sdNotifyMode := mainSdNotifyMode
 		ctrNotifyMode, err := getSdNotifyMode(annotations, container.Name)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		if ctrNotifyMode != "" {
 			sdNotifyMode = ctrNotifyMode
@@ -710,7 +729,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		if sdNotifyMode != "" && sdNotifyMode != define.SdNotifyModeIgnore {
 			proxy, err = notifyproxy.New("")
 			if err != nil {
-				return nil, err
+				return nil, nil, err
 			}
 			sdNotifyProxies = append(sdNotifyProxies, proxy)
 			opts = append(opts, libpod.WithSdNotifySocket(proxy.SocketPath()))
@@ -718,7 +737,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		ctr, err := generate.ExecuteCreate(ctx, ic.Libpod, rtSpec, spec, false, opts...)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		if proxy != nil {
 			proxy.AddContainer(ctr)
@@ -730,7 +749,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	// Start the containers
 	podStartErrors, err := pod.Start(ctx)
 	if err != nil && !errors.Is(err, define.ErrPodPartialFail) {
-		return nil, err
+		return nil, nil, err
 	}
 	for id, err := range podStartErrors {
 		playKubePod.ContainerErrors = append(playKubePod.ContainerErrors, fmt.Errorf("starting container %s: %w", id, err).Error())
@@ -743,8 +762,13 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		errors := make([]error, len(sdNotifyProxies))
 		for i := range sdNotifyProxies {
 			wg.Add(1)
+			defer func(i int) {
+				if err := sdNotifyProxies[i].Close(); err != nil {
+					logrus.Errorf("Closing sdnotify proxy %q: %v", sdNotifyProxies[i].SocketPath(), err)
+				}
+			}(i)
 			go func(i int) {
-				err := sdNotifyProxies[i].WaitAndClose()
+				err := sdNotifyProxies[i].Wait()
 				if err != nil {
 					err = fmt.Errorf("waiting for sd-notify proxy: %w", err)
 				}
@@ -755,7 +779,11 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 		wg.Wait()
 		for _, err := range errors {
 			if err != nil {
-				return nil, err
+				// Close all proxies on error.
+				for _, proxy := range sdNotifyProxies {
+					_ = proxy.Close()
+				}
+				return nil, nil, err
 			}
 		}
 	}
@@ -770,7 +798,7 @@ func (ic *ContainerEngine) playKubePod(ctx context.Context, podName string, podY
 	report.Pods = append(report.Pods, playKubePod)

-	return &report, nil
+	return &report, sdNotifyProxies, nil
 }

 // getImageAndLabelInfo returns the image information and how the image should be pulled plus as well as labels to be used for the container in the pod.


@@ -14,6 +14,16 @@ import (
 	"github.com/containers/podman/v4/libpod/define"
 	"github.com/coreos/go-systemd/v22/daemon"
 	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+const (
+	// All constants below are defined by systemd.
+	_notifyRcvbufSize = 8 * 1024 * 1024
+	_notifyBufferMax  = 4096
+	_notifyFdMax      = 768
+	_notifyBarrierMsg = "BARRIER=1"
+	_notifyRdyMsg     = daemon.SdNotifyReady
 )

 // SendMessage sends the specified message to the specified socket.
@@ -76,6 +86,10 @@ func New(tmpDir string) (*NotifyProxy, error) {
 		return nil, err
 	}

+	if err := conn.SetReadBuffer(_notifyRcvbufSize); err != nil {
+		return nil, fmt.Errorf("setting read buffer: %w", err)
+	}
+
 	errorChan := make(chan error, 1)
 	readyChan := make(chan bool, 1)
@@ -100,35 +114,70 @@ func (p *NotifyProxy) waitForReady() {
 	go func() {
 		// Read until the `READY` message is received or the connection
 		// is closed.
-		const bufferSize = 1024
+
+		// See https://github.com/containers/podman/issues/16515 for a description of the protocol.
+		fdSize := unix.CmsgSpace(4)
+		buffer := make([]byte, _notifyBufferMax)
+		oob := make([]byte, _notifyFdMax*fdSize)
 		sBuilder := strings.Builder{}
 		for {
-			for {
-				buffer := make([]byte, bufferSize)
-				num, err := p.connection.Read(buffer)
-				if err != nil {
-					if !errors.Is(err, io.EOF) {
-						p.errorChan <- err
-						return
-					}
-				}
-				sBuilder.Write(buffer[:num])
-				if num != bufferSize || buffer[num-1] == '\n' {
-					// Break as we read an entire line that
-					// we can inspect for the `READY`
-					// message.
-					break
+			n, oobn, flags, _, err := p.connection.ReadMsgUnix(buffer, oob)
+			if err != nil {
+				if !errors.Is(err, io.EOF) {
+					p.errorChan <- err
+					return
 				}
+				logrus.Errorf("Error reading unix message on socket %q: %v", p.socketPath, err)
+			}
+
+			if n > _notifyBufferMax || oobn > _notifyFdMax*fdSize {
+				logrus.Errorf("Ignoring unix message on socket %q: incorrect number of bytes read (n=%d, oobn=%d)", p.socketPath, n, oobn)
+				continue
+			}
+
+			if flags&unix.MSG_CTRUNC != 0 {
+				logrus.Errorf("Ignoring unix message on socket %q: message truncated", p.socketPath)
+				continue
 			}

-			for _, line := range strings.Split(sBuilder.String(), "\n") {
-				if line == daemon.SdNotifyReady {
-					p.readyChan <- true
-					return
+			sBuilder.Reset()
+			sBuilder.Write(buffer[:n])
+			var isBarrier, isReady bool
+
+			for _, line := range strings.Split(sBuilder.String(), "\n") {
+				switch line {
+				case _notifyRdyMsg:
+					isReady = true
+				case _notifyBarrierMsg:
+					isBarrier = true
 				}
 			}
-			sBuilder.Reset()
+
+			if isBarrier {
+				scms, err := unix.ParseSocketControlMessage(oob)
+				if err != nil {
+					logrus.Errorf("Parsing control message on socket %q: %v", p.socketPath, err)
+				}
+				for _, scm := range scms {
+					fds, err := unix.ParseUnixRights(&scm)
+					if err != nil {
+						logrus.Errorf("Parsing unix rights of control message on socket %q: %v", p.socketPath, err)
+						continue
+					}
+					for _, fd := range fds {
+						if err := unix.Close(fd); err != nil {
+							logrus.Errorf("Closing fd %d passed on socket %q: %v", fd, p.socketPath, err)
+							continue
+						}
+					}
+				}
+				continue
+			}
+
+			if isReady {
+				p.readyChan <- true
+				return
+			}
 		}
 	}()
 }
@@ -137,8 +186,8 @@ func (p *NotifyProxy) SocketPath() string {
 	return p.socketPath
 }

-// close closes the listener and removes the socket.
-func (p *NotifyProxy) close() error {
+// Close closes the listener and removes the socket.
+func (p *NotifyProxy) Close() error {
 	defer os.Remove(p.socketPath)
 	return p.connection.Close()
 }
@@ -158,20 +207,12 @@ type Container interface {
 	ID() string
 }

-// WaitAndClose waits until receiving the `READY` notify message and close the
-// listener. Note that the this function must only be executed inside a systemd
-// service which will kill the process after a given timeout.
-// If the (optional) container stopped running before the `READY` is received,
-// the waiting gets canceled and ErrNoReadyMessage is returned.
-func (p *NotifyProxy) WaitAndClose() error {
-	defer func() {
-		// Closing the socket/connection makes sure that the other
-		// goroutine reading/waiting for the READY message returns.
-		if err := p.close(); err != nil {
-			logrus.Errorf("Closing notify proxy: %v", err)
-		}
-	}()
+// Wait waits until receiving the `READY` notify message. Note that this
+// function must only be executed inside a systemd service which will kill
+// the process after a given timeout. If the (optional) container stopped
+// running before the `READY` is received, the waiting gets canceled and
+// ErrNoReadyMessage is returned.
+func (p *NotifyProxy) Wait() error {
 	// If the proxy has a container we need to watch it as it may exit
 	// without sending a READY message. The goroutine below returns when
 	// the container exits OR when the function returns (see deferred the


@@ -18,7 +18,7 @@ func TestNotifyProxy(t *testing.T) {
 	proxy, err := New("")
 	require.NoError(t, err)
 	require.FileExists(t, proxy.SocketPath())
-	require.NoError(t, proxy.close())
+	require.NoError(t, proxy.Close())
 	require.NoFileExists(t, proxy.SocketPath())
 }
@@ -28,9 +28,12 @@ func TestWaitAndClose(t *testing.T) {
 	require.FileExists(t, proxy.SocketPath())

 	ch := make(chan error)
+	defer func() {
+		err := proxy.Close()
+		require.NoError(t, err, "proxy should close successfully")
+	}()
 	go func() {
-		ch <- proxy.WaitAndClose()
+		ch <- proxy.Wait()
 	}()

 	sendMessage(t, proxy, "foo\n")


@@ -397,6 +397,10 @@ EOF
     systemctl start $service_name
     systemctl is-active $service_name

+    # Make sure that Podman is the service's MainPID
+    run systemctl show --property=MainPID --value $service_name
+    is "$(</proc/$output/comm)" "podman" "podman is the service mainPID"
+
     # The name of the service container is predictable: the first 12 characters
     # of the hash of the YAML file followed by the "-service" suffix
     yaml_sha=$(sha256sum $yaml_source)
@@ -422,13 +426,13 @@ EOF
     # container.
     run_podman pod kill test_pod
     for i in {0..5}; do
-        run systemctl is-failed $service_name
-        if [[ $output == "failed" ]]; then
+        run systemctl is-active $service_name
+        if [[ $output == "inactive" ]]; then
             break
         fi
        sleep 0.5
     done
-    is "$output" "failed" "systemd service transitioned to 'failed' state"
+    is "$output" "inactive" "systemd service transitioned to 'inactive' state: $service_name"

     # Now stop and start the service again.
     systemctl stop $service_name


@@ -206,9 +206,10 @@ metadata:
     app: test
   name: test_pod
 spec:
+  restartPolicy: "Never"
   containers:
   - command:
-    - top
+    - true
     image: $IMAGE
     name: test
     resources: {}
@@ -219,26 +220,26 @@ EOF
    yaml_sha=$(sha256sum $yaml_source)
    service_container="${yaml_sha:0:12}-service"

    export NOTIFY_SOCKET=$PODMAN_TMPDIR/conmon.sock
    _start_socat
-   wait_for_file $_SOCAT_LOG

+   # Will run until all containers have stopped.
    run_podman play kube --service-container=true $yaml_source
+   run_podman container wait $service_container test_pod-test

    # Make sure the containers have the correct policy.
    run_podman container inspect test_pod-test $service_container --format "{{.Config.SdNotifyMode}}"
    is "$output" "ignore
 ignore"

-   run_podman container inspect $service_container --format "{{.State.ConmonPid}}"
-   mainPID="$output"
+   wait_for_file $_SOCAT_LOG
    # The 'echo's help us debug failed runs
    run cat $_SOCAT_LOG
    echo "socat log:"
    echo "$output"

-   is "$output" "MAINPID=$mainPID
+   # The "with policies" test below checks the MAINPID.
+   is "$output" "MAINPID=.*
 READY=1" "sdnotify sent MAINPID and READY"

    _stop_socat
@@ -268,11 +269,11 @@ metadata:
   labels:
     app: test
   name: test_pod
-  restartPolicy: "Never"
   annotations:
     io.containers.sdnotify: "container"
     io.containers.sdnotify/b: "conmon"
 spec:
+  restartPolicy: "Never"
   containers:
   - command:
     - /bin/sh
@@ -333,14 +334,13 @@ ignore"
    run_podman logs $container_a
    is "${lines[0]}" "/run/notify/notify.sock" "NOTIFY_SOCKET is passed to container"

+   # Send the READY message. Doing it in an exec session helps debug
+   # potential issues.
    run_podman exec --env NOTIFY_SOCKET="/run/notify/notify.sock" $container_a /usr/bin/systemd-notify --ready

-   # Instruct the container to send the READY
+   # Instruct the container to stop
    run_podman exec $container_a /bin/touch /stop

-   run_podman container inspect $service_container --format "{{.State.ConmonPid}}"
-   main_pid="$output"
-
    run_podman container wait $container_a
    run_podman container inspect $container_a --format "{{.State.ExitCode}}"
    is "$output" "0" "container exited cleanly after sending READY message"
@@ -350,9 +350,12 @@ ignore"
    echo "socat log:"
    echo "$output"

-   is "$output" "MAINPID=$main_pid
+   is "$output" "MAINPID=.*
 READY=1" "sdnotify sent MAINPID and READY"

+   # Make sure that Podman is the service's MainPID
+   main_pid=$(awk -F= '{print $2}' <<< ${lines[0]})
+   is "$(</proc/$main_pid/comm)" "podman" "podman is the service mainPID"
+
    _stop_socat

    # Clean up pod and pause image


@@ -123,7 +123,25 @@ spec:
     name: test
     resources: {}
 EOF
-    run_podman play kube --service-container=true $yaml_source
+    # Run `play kube` in the background as it will wait for the service
+    # container to exit.
+    timeout --foreground -v --kill=10 60 \
+        $PODMAN play kube --service-container=true $yaml_source &>/dev/null &
+
+    # Wait for the container to be running
+    container_a=test_pod-test
+    for i in $(seq 1 20); do
+        run_podman "?" container wait $container_a --condition="running"
+        if [[ $status == 0 ]]; then
+            break
+        fi
+        sleep 0.5
+        # Just for debugging
+        run_podman ps -a
+    done
+    if [[ $status != 0 ]]; then
+        die "container $container_a did not start"
+    fi
+
     # The name of the service container is predictable: the first 12 characters
     # of the hash of the YAML file followed by the "-service" suffix