Add a net health recovery service to Qemu machines

There is a network stability issue in qemu + virtio, affecting
some users after long periods of usage, which can lead to
suspended queue delivery. Until the issue is resolved, add a
temporary recovery service which restarts networking when host
communication becomes inoperable.

[NO NEW TESTS NEEDED]

Signed-off-by: Jason T. Greene <jason.greene@redhat.com>
This commit is contained in:
Jason T. Greene 2024-01-15 17:53:30 -06:00
parent f1ea4fbb3d
commit 79fad91dbb
6 changed files with 100 additions and 19 deletions

View File

@ -62,6 +62,7 @@ type DynamicIgnition struct {
WritePath string WritePath string
Cfg Config Cfg Config
Rootful bool Rootful bool
NetRecover bool
} }
func (ign *DynamicIgnition) Write() error { func (ign *DynamicIgnition) Write() error {
@ -97,7 +98,7 @@ func (ign *DynamicIgnition) GenerateIgnitionConfig() error {
ignStorage := Storage{ ignStorage := Storage{
Directories: getDirs(ign.Name), Directories: getDirs(ign.Name),
Files: getFiles(ign.Name, ign.UID, ign.Rootful, ign.VMType), Files: getFiles(ign.Name, ign.UID, ign.Rootful, ign.VMType, ign.NetRecover),
Links: getLinks(ign.Name), Links: getLinks(ign.Name),
} }
@ -231,6 +232,21 @@ func (ign *DynamicIgnition) GenerateIgnitionConfig() error {
} }
ignSystemd.Units = append(ignSystemd.Units, qemuUnit) ignSystemd.Units = append(ignSystemd.Units, qemuUnit)
} }
if ign.NetRecover {
contents, err := GetNetRecoveryUnitFile().ToString()
if err != nil {
return err
}
recoveryUnit := Unit{
Enabled: BoolToPtr(true),
Name: "net-health-recovery.service",
Contents: &contents,
}
ignSystemd.Units = append(ignSystemd.Units, recoveryUnit)
}
// Only after all checks are done // Only after all checks are done
// it's ready create the ingConfig // it's ready create the ingConfig
ign.Cfg = Config{ ign.Cfg = Config{
@ -303,7 +319,7 @@ func getDirs(usrName string) []Directory {
return dirs return dirs
} }
func getFiles(usrName string, uid int, rootful bool, vmtype define.VMType) []File { func getFiles(usrName string, uid int, rootful bool, vmtype define.VMType, netRecover bool) []File {
files := make([]File, 0) files := make([]File, 0)
lingerExample := parser.NewUnitFile() lingerExample := parser.NewUnitFile()
@ -574,6 +590,23 @@ Delegate=memory pids cpu io
}, },
}) })
// Only necessary for qemu on mac
if netRecover {
files = append(files, File{
Node: Node{
User: GetNodeUsr("root"),
Group: GetNodeGrp("root"),
Path: "/usr/local/bin/net-health-recovery.sh",
},
FileEmbedded1: FileEmbedded1{
Mode: IntToPtr(0755),
Contents: Resource{
Source: EncodeDataURLPtr(GetNetRecoveryFile()),
},
},
})
}
return files return files
} }
@ -743,6 +776,37 @@ func (i *IgnitionBuilder) Build() error {
return i.dynamicIgnition.Write() return i.dynamicIgnition.Write()
} }
// GetNetRecoveryFile returns the text of the bash script that watches host
// connectivity from inside the guest. After an initial 120 second grace
// period the script probes http://192.168.127.1/health every 30 seconds and,
// when curl exits non-zero, takes the enp0s1 interface down and back up.
//
// The script is a temporary workaround for a rare qemu/virtio issue; its
// content is assembled one line per literal so each shell line is easy to
// spot in a diff.
func GetNetRecoveryFile() string {
	const script = "#!/bin/bash\n" +
		"# Verify network health, and bounce the network device if host connectivity\n" +
		"# is lost. This is a temporary workaround for a known rare qemu/virtio issue\n" +
		"# that affects some systems\n" +
		"sleep 120 # allow time for network setup on initial boot\n" +
		"while true; do\n" +
		"sleep 30\n" +
		"curl -s -o /dev/null --max-time 30 http://192.168.127.1/health\n" +
		"if [ \"$?\" != \"0\" ]; then\n" +
		"echo \"bouncing nic due to loss of connectivity with host\"\n" +
		"ifconfig enp0s1 down; ifconfig enp0s1 up\n" +
		"fi\n" +
		"done\n"
	return script
}
// GetNetRecoveryUnitFile builds the systemd unit definition that runs
// /usr/local/bin/net-health-recovery.sh. The unit is ordered after
// sshd.socket and sshd.service, sends stdout and stderr to the journal,
// reads no stdin, and is wanted by default.target.
func GetNetRecoveryUnitFile() *parser.UnitFile {
	// Entries are applied in the order listed, one Add call per row.
	entries := []struct {
		group, key, value string
	}{
		{"Unit", "Description", "Verifies health of network and recovers if necessary"},
		{"Unit", "After", "sshd.socket sshd.service"},
		{"Service", "ExecStart", "/usr/local/bin/net-health-recovery.sh"},
		{"Service", "StandardOutput", "journal"},
		{"Service", "StandardError", "journal"},
		{"Service", "StandardInput", "null"},
		{"Install", "WantedBy", "default.target"},
	}

	unit := parser.NewUnitFile()
	for _, e := range entries {
		unit.Add(e.group, e.key, e.value)
	}
	return unit
}
func DefaultReadyUnitFile() parser.UnitFile { func DefaultReadyUnitFile() parser.UnitFile {
u := parser.NewUnitFile() u := parser.NewUnitFile()
u.Add("Unit", "After", "remove-moby.service sshd.socket sshd.service") u.Add("Unit", "After", "remove-moby.service sshd.socket sshd.service")

View File

@ -202,6 +202,7 @@ func (v *MachineVM) Init(opts machine.InitOptions) (bool, error) {
WritePath: v.getIgnitionFile(), WritePath: v.getIgnitionFile(),
UID: v.UID, UID: v.UID,
Rootful: v.Rootful, Rootful: v.Rootful,
NetRecover: useNetworkRecover(),
}) })
// If the user provides an ignition file, we need to // If the user provides an ignition file, we need to

View File

@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) {
} }
return tmpDir, nil return tmpDir, nil
} }
// useNetworkRecover reports whether the net health recovery service should
// be enabled for machines created by this build. This variant always
// answers yes.
// NOTE(review): presumably the macOS build of this helper — confirm against
// the file's build constraints.
func useNetworkRecover() bool {
	const recoveryEnabled = true
	return recoveryEnabled
}

View File

@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) {
} }
return tmpDir, nil return tmpDir, nil
} }
// useNetworkRecover reports whether the net health recovery service should
// be enabled for machines created by this build. This variant always
// answers no.
func useNetworkRecover() bool {
	const recoveryEnabled = false
	return recoveryEnabled
}

View File

@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) {
} }
return util.GetRootlessRuntimeDir() return util.GetRootlessRuntimeDir()
} }
// useNetworkRecover reports whether the net health recovery service should
// be enabled for machines created by this build. This variant always
// answers no.
func useNetworkRecover() bool {
	const recoveryEnabled = false
	return recoveryEnabled
}

View File

@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) {
} }
return tmpDir, nil return tmpDir, nil
} }
// useNetworkRecover reports whether the net health recovery service should
// be enabled for machines created by this build. This variant always
// answers no.
func useNetworkRecover() bool {
	const recoveryEnabled = false
	return recoveryEnabled
}