Make log-validator take glob patterns to monitor for log files (#7172)

To simplify deployment of the log validator, this allows wildcards (using go's filepath.Glob) to be included in the file paths. In order to detect new files, a new background goroutine polls the glob patterns every minute for matches. Because the "monitor" function is running in its own goroutine, a lock is needed to ensure it's not trying to add new tailers while shutdown is happening.
2023-11-27 15:48:46 -05:00 · 2023-11-27 15:48:46 -05:00 · 32adaf1846
parent a31429fb3e
commit 32adaf1846
4 changed files with 107 additions and 58 deletions
--- a/cmd/log-validator/main.go
+++ b/cmd/log-validator/main.go
@ -38,14 +38,9 @@ func main() {
 	defer oTelShutdown(context.Background())
 	logger.Info(cmd.VersionString())

-	v := validator.New(logger, stats)
+	v := validator.New(config.Files, logger, stats)
 	defer v.Shutdown()

-	for _, filename := range config.Files {
-		err := v.TailValidateFile(filename)
-		cmd.FailOnError(err, "failed to tail file")
-	}
-
 	cmd.WaitForSignal()
 }

--- a/log/validator/tail_logger.go
+++ b/log/validator/tail_logger.go
@ -6,7 +6,7 @@ import (
 	"github.com/letsencrypt/boulder/log"
 )

-// tailLogger is an adapter to the hpcloud/tail module's logging interface.
+// tailLogger is an adapter to the nxadm/tail module's logging interface.
 type tailLogger struct {
 	log.Logger
 }
--- a/log/validator/validator.go
+++ b/log/validator/validator.go
@ -1,11 +1,14 @@
 package validator

 import (
+	"context"
 	"encoding/base64"
 	"errors"
 	"fmt"
 	"os"
+	"path/filepath"
 	"strings"
+	"sync"
 	"time"

 	"github.com/nxadm/tail"
@ -17,73 +20,132 @@ import (
 var errInvalidChecksum = errors.New("invalid checksum length")

 type Validator struct {
-	tailers []*tail.Tail
+	// mu guards patterns and tailers to prevent Shutdown racing monitor
+	mu sync.Mutex
+
+	// patterns is the list of glob patterns to monitor with filepath.Glob for logs
+	patterns []string
+
+	// tailers is a map of filenames to the tailer which are currently being tailed
+	tailers map[string]*tail.Tail
+
+	// monitorCancel cancels the monitor's context, so it exits
+	monitorCancel context.CancelFunc

 	lineCounter *prometheus.CounterVec
 	log         log.Logger
 }

-func New(logger log.Logger, stats prometheus.Registerer) *Validator {
+// New Validator monitoring paths, which is a list of file globs.
+func New(patterns []string, logger log.Logger, stats prometheus.Registerer) *Validator {
 	lineCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
 		Name: "log_lines",
 		Help: "A counter of log lines processed, with status",
 	}, []string{"filename", "status"})
 	stats.MustRegister(lineCounter)

-	return &Validator{log: logger, lineCounter: lineCounter}
-}
+	monitorContext, monitorCancel := context.WithCancel(context.Background())

-// TailValidateFile takes a filename, and starts tailing it.
-// An error is returned if the file couldn't be opened.
-func (v *Validator) TailValidateFile(filename string) error {
-	t, err := tail.TailFile(filename, tail.Config{
-		ReOpen:        true,
-		MustExist:     false, // sometimes files won't exist, so we must tolerate that
-		Follow:        true,
-		Logger:        tailLogger{v.log},
-		CompleteLines: true,
-	})
-	if err != nil {
-		return err
+	v := &Validator{
+		patterns:      patterns,
+		tailers:       map[string]*tail.Tail{},
+		log:           logger,
+		monitorCancel: monitorCancel,
+		lineCounter:   lineCounter,
 	}

-	go func() {
-		// Emit no more than 1 error line per second. This prevents consuming large
-		// amounts of disk space in case there is problem that causes all log lines to
-		// be invalid.
-		outputLimiter := time.NewTicker(time.Second)
-		defer outputLimiter.Stop()
+	go v.monitor(monitorContext)

-		for line := range t.Lines {
-			if line.Err != nil {
-				v.log.Errf("error while tailing %s: %s", t.Filename, line.Err)
+	return v
+}
+
+// pollPaths expands v.patterns and calls v.tailValidateFile on each resulting file
+func (v *Validator) pollPaths() {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	for _, pattern := range v.patterns {
+		paths, err := filepath.Glob(pattern)
+		if err != nil {
+			v.log.Err(err.Error())
+		}
+
+		for _, path := range paths {
+			if _, ok := v.tailers[path]; ok {
+				// We are already tailing this file
 				continue
 			}
-			err := lineValid(line.Text)
+
+			t, err := tail.TailFile(path, tail.Config{
+				ReOpen:        true,
+				MustExist:     false, // sometimes files won't exist, so we must tolerate that
+				Follow:        true,
+				Logger:        tailLogger{v.log},
+				CompleteLines: true,
+			})
 			if err != nil {
-				if errors.Is(err, errInvalidChecksum) {
-					v.lineCounter.WithLabelValues(t.Filename, "invalid checksum length").Inc()
-				} else {
-					v.lineCounter.WithLabelValues(t.Filename, "bad").Inc()
-				}
-				select {
-				case <-outputLimiter.C:
-					v.log.Errf("%s: %s %q", t.Filename, err, line.Text)
-				default:
-				}
-			} else {
-				v.lineCounter.WithLabelValues(t.Filename, "ok").Inc()
+				// TailFile shouldn't error when MustExist is false
+				v.log.Errf("unexpected error from TailFile: %v", err)
 			}
+
+			go v.tailValidate(path, t.Lines)
+
+			v.tailers[path] = t
 		}
-	}()
+	}
+}

-	v.tailers = append(v.tailers, t)
+// Monitor calls v.pollPaths every minute until its context is cancelled
+func (v *Validator) monitor(ctx context.Context) {
+	for {
+		v.pollPaths()

-	return nil
+		// Wait a minute, unless cancelled
+		timer := time.NewTimer(time.Minute)
+		select {
+		case <-ctx.Done():
+			return
+		case <-timer.C:
+		}
+	}
+}
+
+func (v *Validator) tailValidate(filename string, lines chan *tail.Line) {
+	// Emit no more than 1 error line per second. This prevents consuming large
+	// amounts of disk space in case there is problem that causes all log lines to
+	// be invalid.
+	outputLimiter := time.NewTicker(time.Second)
+	defer outputLimiter.Stop()
+
+	for line := range lines {
+		if line.Err != nil {
+			v.log.Errf("error while tailing %s: %s", filename, line.Err)
+			continue
+		}
+		err := lineValid(line.Text)
+		if err != nil {
+			if errors.Is(err, errInvalidChecksum) {
+				v.lineCounter.WithLabelValues(filename, "invalid checksum length").Inc()
+			} else {
+				v.lineCounter.WithLabelValues(filename, "bad").Inc()
+			}
+			select {
+			case <-outputLimiter.C:
+				v.log.Errf("%s: %s %q", filename, err, line.Text)
+			default:
+			}
+		} else {
+			v.lineCounter.WithLabelValues(filename, "ok").Inc()
+		}
+	}
 }

 // Shutdown should be called before process shutdown
 func (v *Validator) Shutdown() {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+
+	v.monitorCancel()
+
 	for _, t := range v.tailers {
 		// The tail module seems to have a race condition that will generate
 		// errors like this on shutdown:
--- a/test/config-next/log-validator.json
+++ b/test/config-next/log-validator.json
@ -10,16 +10,8 @@
 	"files": [
 		"/var/log/akamai-purger.log",
 		"/var/log/bad-key-revoker.log",
-		"/var/log/boulder-ca.log",
-		"/var/log/boulder-observer.log",
-		"/var/log/boulder-publisher.log",
-		"/var/log/boulder-ra.log",
-		"/var/log/boulder-remoteva.log",
-		"/var/log/boulder-sa.log",
-		"/var/log/boulder-va.log",
-		"/var/log/boulder-wfe2.log",
-		"/var/log/crl-storer.log",
-		"/var/log/crl-updater.log",
+		"/var/log/boulder-*.log",
+		"/var/log/crl-*.log",
 		"/var/log/nonce-service.log",
 		"/var/log/ocsp-responder.log"
 	]