Fix 2 bugs in pid1
1) We were calling Wait() on the child process while also calling Wait4(), which raced and caused an occasional error or panic. 2) While testing (1), I observed occasional hangs and traced them down to a SIGWINCH which masked a SIGCHLD, causing it to hang. Both seem fixed. Added a manual test script.
This commit is contained in:
parent
8015d4b24e
commit
95a1690e6f
|
|
@ -319,7 +319,7 @@ func main() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// From here on, output goes through logging.
|
// From here on, output goes through logging.
|
||||||
log.V(0).Info("starting up", "args", os.Args)
|
log.V(0).Info("starting up", "pid", os.Getpid(), "args", os.Args)
|
||||||
|
|
||||||
// Startup webhooks goroutine
|
// Startup webhooks goroutine
|
||||||
var webhook *Webhook
|
var webhook *Webhook
|
||||||
|
|
|
||||||
|
|
@ -25,27 +25,30 @@ func ReRun() error {
|
||||||
if err := cmd.Start(); err != nil {
|
if err := cmd.Start(); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
go runInit(cmd.Process.Pid)
|
runInit(cmd.Process.Pid)
|
||||||
return cmd.Wait()
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// runInit runs a bare-bones init process. This will never return. In case of
|
// runInit runs a bare-bones init process. This will return when firstborn
|
||||||
// truly unknown errors it will panic.
|
// exits. In case of truly unknown errors it will panic.
|
||||||
func runInit(pid int) {
|
func runInit(firstborn int) {
|
||||||
sigs := make(chan os.Signal, 8)
|
sigs := make(chan os.Signal, 8)
|
||||||
signal.Notify(sigs)
|
signal.Notify(sigs)
|
||||||
for sig := range sigs {
|
for sig := range sigs {
|
||||||
if sig == syscall.SIGCHLD {
|
if sig != syscall.SIGCHLD {
|
||||||
sigchld()
|
|
||||||
} else {
|
|
||||||
// Pass it on to the real process.
|
// Pass it on to the real process.
|
||||||
syscall.Kill(pid, sig.(syscall.Signal))
|
syscall.Kill(firstborn, sig.(syscall.Signal))
|
||||||
|
}
|
||||||
|
// Always try to reap a child - empirically, sometimes this gets missed.
|
||||||
|
if sigchld(firstborn) {
|
||||||
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// sigchld handles a SIGCHLD.
|
// sigchld handles a SIGCHLD. This will return true when firstborn exits. In
|
||||||
func sigchld() {
|
// case of truly unknown errors it will panic.
|
||||||
|
func sigchld(firstborn int) bool {
|
||||||
// Loop to handle multiple child processes.
|
// Loop to handle multiple child processes.
|
||||||
for {
|
for {
|
||||||
var status syscall.WaitStatus
|
var status syscall.WaitStatus
|
||||||
|
|
@ -53,10 +56,15 @@ func sigchld() {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(fmt.Sprintf("failed to wait4(): %v\n", err))
|
panic(fmt.Sprintf("failed to wait4(): %v\n", err))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if pid == firstborn {
|
||||||
|
return true
|
||||||
|
}
|
||||||
if pid <= 0 {
|
if pid <= 0 {
|
||||||
// No more children to reap.
|
// No more children to reap.
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
// Must have found one, see if there are more.
|
// Must have found one, see if there are more.
|
||||||
}
|
}
|
||||||
|
return false
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
fast-exit
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
FROM debian
|
||||||
|
RUN apt-get update && apt-get install -y bash procps psmisc psutils
|
||||||
|
COPY fast-exit /fast-exit
|
||||||
|
ENTRYPOINT ["/fast-exit"]
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
// A do-nothing app to test pid1.ReRun().
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
|
||||||
|
"k8s.io/git-sync/pkg/pid1"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
// In case we come up as pid 1, act as init.
|
||||||
|
if os.Getpid() == 1 {
|
||||||
|
fmt.Printf("detected pid 1, running as init\n")
|
||||||
|
err := pid1.ReRun()
|
||||||
|
if err == nil {
|
||||||
|
os.Exit(0)
|
||||||
|
}
|
||||||
|
if exerr, ok := err.(*exec.ExitError); ok {
|
||||||
|
os.Exit(exerr.ExitCode())
|
||||||
|
}
|
||||||
|
fmt.Printf("unhandled pid1 error: %v\n", err)
|
||||||
|
os.Exit(127)
|
||||||
|
}
|
||||||
|
fmt.Printf("main app\n")
|
||||||
|
os.Exit(42)
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
go build
|
||||||
|
docker build -t example.com/fast-exit .
|
||||||
|
|
||||||
|
# In the past we have observed hangs and missed signals. This *should* run
|
||||||
|
# forever.
|
||||||
|
while true; do
|
||||||
|
docker run -ti --rm example.com/fast-exit
|
||||||
|
done
|
||||||
Loading…
Reference in New Issue