boulder/log/validator/validator.go

236 lines
6.2 KiB
Go

package validator
import (
"context"
"encoding/base64"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"time"
"github.com/nxadm/tail"
"github.com/prometheus/client_golang/prometheus"
"github.com/letsencrypt/boulder/log"
)
var errInvalidChecksum = errors.New("invalid checksum length")
type Validator struct {
// mu guards patterns and tailers to prevent Shutdown racing monitor
mu sync.Mutex
// patterns is the list of glob patterns to monitor with filepath.Glob for logs
patterns []string
// tailers is a map of filenames to the tailer which are currently being tailed
tailers map[string]*tail.Tail
// monitorCancel cancels the monitor's context, so it exits
monitorCancel context.CancelFunc
lineCounter *prometheus.CounterVec
log log.Logger
}
// New Validator monitoring paths, which is a list of file globs.
func New(patterns []string, logger log.Logger, stats prometheus.Registerer) *Validator {
lineCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "log_lines",
Help: "A counter of log lines processed, with status",
}, []string{"filename", "status"})
stats.MustRegister(lineCounter)
monitorContext, monitorCancel := context.WithCancel(context.Background())
v := &Validator{
patterns: patterns,
tailers: map[string]*tail.Tail{},
log: logger,
monitorCancel: monitorCancel,
lineCounter: lineCounter,
}
go v.monitor(monitorContext)
return v
}
// pollPaths expands v.patterns and calls v.tailValidateFile on each resulting file
func (v *Validator) pollPaths() {
v.mu.Lock()
defer v.mu.Unlock()
for _, pattern := range v.patterns {
paths, err := filepath.Glob(pattern)
if err != nil {
v.log.Err(err.Error())
}
for _, path := range paths {
if _, ok := v.tailers[path]; ok {
// We are already tailing this file
continue
}
t, err := tail.TailFile(path, tail.Config{
ReOpen: true,
MustExist: false, // sometimes files won't exist, so we must tolerate that
Follow: true,
Logger: tailLogger{v.log},
CompleteLines: true,
})
if err != nil {
// TailFile shouldn't error when MustExist is false
v.log.Errf("unexpected error from TailFile: %v", err)
}
go v.tailValidate(path, t.Lines)
v.tailers[path] = t
}
}
}
// Monitor calls v.pollPaths every minute until its context is cancelled
func (v *Validator) monitor(ctx context.Context) {
for {
v.pollPaths()
// Wait a minute, unless cancelled
timer := time.NewTimer(time.Minute)
select {
case <-ctx.Done():
return
case <-timer.C:
}
}
}
func (v *Validator) tailValidate(filename string, lines chan *tail.Line) {
// Emit no more than 1 error line per second. This prevents consuming large
// amounts of disk space in case there is problem that causes all log lines to
// be invalid.
outputLimiter := time.NewTicker(time.Second)
defer outputLimiter.Stop()
for line := range lines {
if line.Err != nil {
v.log.Errf("error while tailing %s: %s", filename, line.Err)
continue
}
err := lineValid(line.Text)
if err != nil {
if errors.Is(err, errInvalidChecksum) {
v.lineCounter.WithLabelValues(filename, "invalid checksum length").Inc()
} else {
v.lineCounter.WithLabelValues(filename, "bad").Inc()
}
select {
case <-outputLimiter.C:
v.log.Errf("%s: %s %q", filename, err, line.Text)
default:
}
} else {
v.lineCounter.WithLabelValues(filename, "ok").Inc()
}
}
}
// Shutdown should be called before process shutdown
func (v *Validator) Shutdown() {
v.mu.Lock()
defer v.mu.Unlock()
v.monitorCancel()
for _, t := range v.tailers {
// The tail module seems to have a race condition that will generate
// errors like this on shutdown:
// failed to stop tailing file: <filename>: Failed to detect creation of
// <filename>: inotify watcher has been closed
// This is probably related to the module's shutdown logic triggering the
// "reopen" code path for files that are removed and then recreated.
// These errors are harmless so we ignore them to allow clean shutdown.
_ = t.Stop()
t.Cleanup()
}
}
func lineValid(text string) error {
// Line format should match the following rsyslog omfile template:
//
// template( name="LELogFormat" type="list" ) {
// property(name="timereported" dateFormat="rfc3339")
// constant(value=" ")
// property(name="hostname" field.delimiter="46" field.number="1")
// constant(value=" datacenter ")
// property(name="syslogseverity")
// constant(value=" ")
// property(name="syslogtag")
// property(name="msg" spifno1stsp="on" )
// property(name="msg" droplastlf="on" )
// constant(value="\n")
// }
//
// This should result in a log line that looks like this:
// timestamp hostname datacenter syslogseverity binary-name[pid]: checksum msg
fields := strings.Split(text, " ")
const errorPrefix = "log-validator:"
// Extract checksum from line
if len(fields) < 6 {
return fmt.Errorf("%s line doesn't match expected format", errorPrefix)
}
checksum := fields[5]
_, err := base64.RawURLEncoding.DecodeString(checksum)
if err != nil || len(checksum) != 7 {
return fmt.Errorf(
"%s expected a 7 character base64 raw URL decodable string, got %q: %w",
errorPrefix,
checksum,
errInvalidChecksum,
)
}
// Reconstruct just the message portion of the line
line := strings.Join(fields[6:], " ")
// If we are fed our own output, treat it as always valid. This
// prevents runaway scenarios where we generate ever-longer output.
if strings.Contains(text, errorPrefix) {
return nil
}
// Check the extracted checksum against the computed checksum
if computedChecksum := log.LogLineChecksum(line); checksum != computedChecksum {
return fmt.Errorf("%s invalid checksum (expected %q, got %q)", errorPrefix, computedChecksum, checksum)
}
return nil
}
// ValidateFile validates a single file and returns
func ValidateFile(filename string) error {
file, err := os.ReadFile(filename)
if err != nil {
return err
}
badFile := false
for i, line := range strings.Split(string(file), "\n") {
if line == "" {
continue
}
err := lineValid(line)
if err != nil {
badFile = true
fmt.Fprintf(os.Stderr, "[line %d] %s: %s\n", i+1, err, line)
}
}
if badFile {
return errors.New("file contained invalid lines")
}
return nil
}