Fix 2 bugs in pid1

1) Was calling Wait() on the child process, but also calling Wait4,
which would race, causing an occasional error or panic.

2) In testing (1), I observed occasional hangs.  I traced them down to a
SIGWINCH, which masked a SIGCHLD, causing the hang.  Both seem fixed.

Added a manual test script.
This commit is contained in:
Tim Hockin 2020-03-09 20:26:29 -07:00
parent 8015d4b24e
commit 95a1690e6f
6 changed files with 63 additions and 12 deletions

View File

@ -319,7 +319,7 @@ func main() {
}
// From here on, output goes through logging.
log.V(0).Info("starting up", "args", os.Args)
log.V(0).Info("starting up", "pid", os.Getpid(), "args", os.Args)
// Startup webhooks goroutine
var webhook *Webhook

View File

@ -25,27 +25,30 @@ func ReRun() error {
if err := cmd.Start(); err != nil {
return err
}
go runInit(cmd.Process.Pid)
return cmd.Wait()
runInit(cmd.Process.Pid)
return nil
}
// runInit runs a bare-bones init process. This will never return. In case of
// truly unknown errors it will panic.
func runInit(pid int) {
// runInit runs a bare-bones init process. This will return when firstborn
// exits. In case of truly unknown errors it will panic.
func runInit(firstborn int) {
sigs := make(chan os.Signal, 8)
signal.Notify(sigs)
for sig := range sigs {
if sig == syscall.SIGCHLD {
sigchld()
} else {
if sig != syscall.SIGCHLD {
// Pass it on to the real process.
syscall.Kill(pid, sig.(syscall.Signal))
syscall.Kill(firstborn, sig.(syscall.Signal))
}
// Always try to reap a child - empirically, sometimes this gets missed.
if sigchld(firstborn) {
return
}
}
}
// sigchld handles a SIGCHLD.
func sigchld() {
// sigchld handles a SIGCHLD. This will return true when firstborn exits. In
// case of truly unknown errors it will panic.
func sigchld(firstborn int) bool {
// Loop to handle multiple child processes.
for {
var status syscall.WaitStatus
@ -53,10 +56,15 @@ func sigchld() {
if err != nil {
panic(fmt.Sprintf("failed to wait4(): %v\n", err))
}
if pid == firstborn {
return true
}
if pid <= 0 {
// No more children to reap.
break
}
// Must have found one, see if there are more.
}
return false
}

1
pkg/pid1/test/fast-exit/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
fast-exit

View File

@ -0,0 +1,4 @@
# Image for the manual fast-exit test.
FROM debian
# Extra userland tools in the image (bash, procps, psmisc, psutils).
# NOTE(review): presumably for inspecting processes inside the container by
# hand - confirm they are still needed.
RUN apt-get update && apt-get install -y bash procps psmisc psutils
# The binary must already be built in the build context before the image is built.
COPY fast-exit /fast-exit
# Exec form: the binary is started directly rather than through a shell.
ENTRYPOINT ["/fast-exit"]

View File

@ -0,0 +1,28 @@
// A do-nothing app to test pid1.ReRun().
package main
import (
"fmt"
"os"
"os/exec"
"k8s.io/git-sync/pkg/pid1"
)
// main is the entry point of the do-nothing test app.
//
// When the process is not pid 1 it behaves as the "real" application: it
// prints a marker line and exits with status 42.  When it is pid 1 it hands
// control to pid1.ReRun() and translates the result into an exit status:
// 0 for a nil error, the wrapped exit code for an *exec.ExitError, and 127
// (after printing the error) for anything else.
func main() {
	// Ordinary case first: we are not init, so just be the main app.
	if os.Getpid() != 1 {
		fmt.Printf("main app\n")
		os.Exit(42)
	}

	// We came up as pid 1, so act as init.
	fmt.Printf("detected pid 1, running as init\n")
	switch err := pid1.ReRun().(type) {
	case nil:
		os.Exit(0)
	case *exec.ExitError:
		// Propagate the exit code carried by the error.
		os.Exit(err.ExitCode())
	default:
		fmt.Printf("unhandled pid1 error: %v\n", err)
		os.Exit(127)
	}
}

10
pkg/pid1/test/fast-exit/test.sh Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh

# Manual stress test for pid1: build the fast-exit binary and its container
# image, then run the container over and over.

# Stop right away if either build step fails; otherwise the loop below would
# spin forever against a missing or stale image.
go build || exit 1
docker build -t example.com/fast-exit . || exit 1

# In the past we have observed hangs and missed signals.  This *should* run
# forever.
while true; do
    docker run -ti --rm example.com/fast-exit
done