// Copyright The OpenTelemetry Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//       http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package memorylimiterprocessor // import "go.opentelemetry.io/collector/processor/memorylimiterprocessor"

import (
	"context"
	"errors"
	"fmt"
	"runtime"
	"sync"
	"time"

	"go.uber.org/atomic"
	"go.uber.org/zap"

	"go.opentelemetry.io/collector/component"
	"go.opentelemetry.io/collector/internal/iruntime"
	"go.opentelemetry.io/collector/obsreport"
	"go.opentelemetry.io/collector/pdata/plog"
	"go.opentelemetry.io/collector/pdata/pmetric"
	"go.opentelemetry.io/collector/pdata/ptrace"
)

const (
	mibBytes = 1024 * 1024
)

var (
	// errForcedDrop will be returned to callers of the Consume* functions to
	// indicate that data is being dropped due to high memory usage.
	errForcedDrop = errors.New("data dropped due to high memory usage")

	// Construction errors

	errCheckIntervalOutOfRange = errors.New(
		"checkInterval must be greater than zero")

	errLimitOutOfRange = errors.New(
		"memAllocLimit or memoryLimitPercentage must be greater than zero")

	errMemSpikeLimitOutOfRange = errors.New(
		"memSpikeLimit must be smaller than memAllocLimit")

	errPercentageLimitOutOfRange = errors.New(
		"memoryLimitPercentage and memorySpikePercentage must be greater than zero and less than or equal to one hundred",
	)

	errShutdownNotStarted = errors.New("no existing monitoring routine is running")
)

// getMemoryFn is defined as a variable to make it overridable by tests.
var getMemoryFn = iruntime.TotalMemory

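// memoryLimiter tracks memory usage and, while usage is above the soft limit,
// makes the process* functions refuse incoming data. A single instance can be
// shared by several pipelines; refCounter tracks how many users have started it.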
type memoryLimiter struct {
	usageChecker memUsageChecker

	memCheckWait time.Duration
	ballastSize  uint64

	// forceDrop is used atomically to indicate when data should be dropped.
	forceDrop *atomic.Bool

	ticker *time.Ticker

	lastGCDone time.Time

	// The function that reads memory stats is kept as a field so that tests
	// can substitute it to simulate different values.
	readMemStatsFn func(m *runtime.MemStats)

	// Fields used for logging.
	logger                 *zap.Logger
	configMismatchedLogged bool

	obsrep *obsreport.Processor

	refCounterLock sync.Mutex
	refCounter     int
}

// Minimum interval between forced GCs when in soft-limited mode. We don't want
// to run GC too frequently since it is a CPU-heavy operation.
const minGCIntervalWhenSoftLimited = 10 * time.Second

// newMemoryLimiter returns a new memorylimiter processor.
func newMemoryLimiter(set component.ProcessorCreateSettings, cfg *Config) (*memoryLimiter, error) {
	if cfg.CheckInterval <= 0 {
		return nil, errCheckIntervalOutOfRange
	}
	if cfg.MemoryLimitMiB == 0 && cfg.MemoryLimitPercentage == 0 {
		return nil, errLimitOutOfRange
	}

	logger := set.Logger
	usageChecker, err := getMemUsageChecker(cfg, logger)
	if err != nil {
		return nil, err
	}

	logger.Info("Memory limiter configured",
		zap.Uint64("limit_mib", usageChecker.memAllocLimit/mibBytes),
		zap.Uint64("spike_limit_mib", usageChecker.memSpikeLimit/mibBytes),
		zap.Duration("check_interval", cfg.CheckInterval))

	obsrep, err := obsreport.NewProcessor(obsreport.ProcessorSettings{
		ProcessorID:             set.ID,
		ProcessorCreateSettings: set,
	})
	if err != nil {
		return nil, err
	}

	ml := &memoryLimiter{
		usageChecker:   *usageChecker,
		memCheckWait:   cfg.CheckInterval,
		ticker:         time.NewTicker(cfg.CheckInterval),
		readMemStatsFn: runtime.ReadMemStats,
		logger:         logger,
		forceDrop:      atomic.NewBool(false),
		obsrep:         obsrep,
	}

	return ml, nil
}

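// getMemUsageChecker returns a fixed-limit checker when limit_mib is set, and
// otherwise a percentage-based checker computed from the total system memory.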
func getMemUsageChecker(cfg *Config, logger *zap.Logger) (*memUsageChecker, error) {
	memAllocLimit := uint64(cfg.MemoryLimitMiB) * mibBytes
	memSpikeLimit := uint64(cfg.MemorySpikeLimitMiB) * mibBytes
	if cfg.MemoryLimitMiB != 0 {
		return newFixedMemUsageChecker(memAllocLimit, memSpikeLimit)
	}
	totalMemory, err := getMemoryFn()
	if err != nil {
		return nil, fmt.Errorf("failed to get total memory; use fixed memory settings (limit_mib) instead: %w", err)
	}
	logger.Info("Using percentage memory limiter",
		zap.Uint64("total_memory_mib", totalMemory/mibBytes),
		zap.Uint32("limit_percentage", cfg.MemoryLimitPercentage),
		zap.Uint32("spike_limit_percentage", cfg.MemorySpikePercentage))
	return newPercentageMemUsageChecker(totalMemory, uint64(cfg.MemoryLimitPercentage), uint64(cfg.MemorySpikePercentage))
}

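// start reads the ballast size from the ballast extension, if one is present
// on the host, and starts the memory-monitoring goroutine.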
func (ml *memoryLimiter) start(_ context.Context, host component.Host) error {
	extensions := host.GetExtensions()
	for _, extension := range extensions {
		if ext, ok := extension.(interface{ GetBallastSize() uint64 }); ok {
			ml.ballastSize = ext.GetBallastSize()
			break
		}
	}
	ml.startMonitoring()
	return nil
}

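// shutdown decrements the monitoring reference count and stops the ticker
// once the last user of the shared limiter has shut down.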
func (ml *memoryLimiter) shutdown(context.Context) error {
	ml.refCounterLock.Lock()
	defer ml.refCounterLock.Unlock()

	if ml.refCounter == 0 {
		return errShutdownNotStarted
	} else if ml.refCounter == 1 {
		ml.ticker.Stop()
	}
	ml.refCounter--
	return nil
}

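// processTraces refuses the incoming spans while the limiter is in the
// dropping state; otherwise it passes the data through unchanged.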
func (ml *memoryLimiter) processTraces(ctx context.Context, td ptrace.Traces) (ptrace.Traces, error) {
	numSpans := td.SpanCount()
	if ml.forceDrop.Load() {
		// TODO: to be 100% sure that this is "refused" and not "dropped", it
		// is necessary to check the pipeline to see if this processor is
		// directly connected to a receiver (i.e., a receiver is on the call
		// stack). For now assume that the pipeline is properly configured and
		// a receiver is on the call stack.
		ml.obsrep.TracesRefused(ctx, numSpans)

		return td, errForcedDrop
	}

	// Even if the next consumer returns an error, record the data as accepted
	// by this processor.
	ml.obsrep.TracesAccepted(ctx, numSpans)
	return td, nil
}

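// processMetrics refuses the incoming data points while the limiter is in the
// dropping state; otherwise it passes the data through unchanged.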
func (ml *memoryLimiter) processMetrics(ctx context.Context, md pmetric.Metrics) (pmetric.Metrics, error) {
	numDataPoints := md.DataPointCount()
	if ml.forceDrop.Load() {
		// TODO: to be 100% sure that this is "refused" and not "dropped", it
		// is necessary to check the pipeline to see if this processor is
		// directly connected to a receiver (i.e., a receiver is on the call
		// stack). For now assume that the pipeline is properly configured and
		// a receiver is on the call stack.
		ml.obsrep.MetricsRefused(ctx, numDataPoints)
		return md, errForcedDrop
	}

	// Even if the next consumer returns an error, record the data as accepted
	// by this processor.
	ml.obsrep.MetricsAccepted(ctx, numDataPoints)
	return md, nil
}

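// processLogs refuses the incoming log records while the limiter is in the
// dropping state; otherwise it passes the data through unchanged.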
func (ml *memoryLimiter) processLogs(ctx context.Context, ld plog.Logs) (plog.Logs, error) {
	numRecords := ld.LogRecordCount()
	if ml.forceDrop.Load() {
		// TODO: to be 100% sure that this is "refused" and not "dropped", it
		// is necessary to check the pipeline to see if this processor is
		// directly connected to a receiver (i.e., a receiver is on the call
		// stack). For now assume that the pipeline is properly configured and
		// a receiver is on the call stack.
		ml.obsrep.LogsRefused(ctx, numRecords)

		return ld, errForcedDrop
	}

	// Even if the next consumer returns an error, record the data as accepted
	// by this processor.
	ml.obsrep.LogsAccepted(ctx, numRecords)
	return ld, nil
}

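// readMemStats reads the current memory stats and subtracts the configured
// ballast size so that limit checks apply to the memory actually in use.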
func (ml *memoryLimiter) readMemStats() *runtime.MemStats {
	ms := &runtime.MemStats{}
	ml.readMemStatsFn(ms)
	// If properly configured, ms.Alloc should be at least ml.ballastSize, but
	// since a misconfiguration is possible, check for that here.
	if ms.Alloc >= ml.ballastSize {
		ms.Alloc -= ml.ballastSize
	} else if !ml.configMismatchedLogged {
		// This indicates a misconfiguration. Log it once.
		ml.configMismatchedLogged = true
		ml.logger.Warn(`"size_mib" in ballast extension is likely incorrectly configured.`)
	}

	return ms
}

// startMonitoring starts a single ticker-driven goroutine per instance that
// checks memory usage once every checkInterval period.
func (ml *memoryLimiter) startMonitoring() {
	ml.refCounterLock.Lock()
	defer ml.refCounterLock.Unlock()

	ml.refCounter++
	if ml.refCounter == 1 {
		go func() {
			for range ml.ticker.C {
				ml.checkMemLimits()
			}
		}()
	}
}

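// memstatToZapField renders the current heap allocation in MiB as a zap field.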
func memstatToZapField(ms *runtime.MemStats) zap.Field {
	return zap.Uint64("cur_mem_mib", ms.Alloc/mibBytes)
}

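// doGCandReadMemStats forces a garbage collection, records when it finished,
// and returns fresh memory stats.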
func (ml *memoryLimiter) doGCandReadMemStats() *runtime.MemStats {
	runtime.GC()
	ml.lastGCDone = time.Now()
	ms := ml.readMemStats()
	ml.logger.Info("Memory usage after GC.", memstatToZapField(ms))
	return ms
}

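// checkMemLimits runs on every ticker tick. It forces a GC when usage is above
// the hard limit, then sets or clears the forceDrop flag depending on whether
// usage is above the soft limit, forcing at most one GC per
// minGCIntervalWhenSoftLimited while soft limited.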
func (ml *memoryLimiter) checkMemLimits() {
	ms := ml.readMemStats()

	ml.logger.Debug("Currently used memory.", memstatToZapField(ms))

	if ml.usageChecker.aboveHardLimit(ms) {
		ml.logger.Warn("Memory usage is above hard limit. Forcing a GC.", memstatToZapField(ms))
		ms = ml.doGCandReadMemStats()
	}

	// Remember current dropping state.
	wasForcingDrop := ml.forceDrop.Load()

	// Check if the memory usage is above the soft limit.
	mustForceDrop := ml.usageChecker.aboveSoftLimit(ms)

	if wasForcingDrop && !mustForceDrop {
		// Was previously dropping but enough memory is available now, no need to limit.
		ml.logger.Info("Memory usage back within limits. Resuming normal operation.", memstatToZapField(ms))
	}

	if !wasForcingDrop && mustForceDrop {
		// We are above soft limit, do a GC if it wasn't done recently and see if
		// it brings memory usage below the soft limit.
		if time.Since(ml.lastGCDone) > minGCIntervalWhenSoftLimited {
			ml.logger.Info("Memory usage is above soft limit. Forcing a GC.", memstatToZapField(ms))
			ms = ml.doGCandReadMemStats()
			// Check the limit again to see if GC helped.
			mustForceDrop = ml.usageChecker.aboveSoftLimit(ms)
		}

		if mustForceDrop {
			ml.logger.Warn("Memory usage is above soft limit. Dropping data.", memstatToZapField(ms))
		}
	}

	ml.forceDrop.Store(mustForceDrop)
}

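// memUsageChecker compares memory usage against a hard limit (memAllocLimit)
// and a soft limit (memAllocLimit minus memSpikeLimit).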
type memUsageChecker struct {
	memAllocLimit uint64
	memSpikeLimit uint64
}

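// aboveSoftLimit reports whether usage is within memSpikeLimit of the hard limit.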
func (d memUsageChecker) aboveSoftLimit(ms *runtime.MemStats) bool {
	return ms.Alloc >= d.memAllocLimit-d.memSpikeLimit
}

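// aboveHardLimit reports whether usage has reached memAllocLimit.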
func (d memUsageChecker) aboveHardLimit(ms *runtime.MemStats) bool {
	return ms.Alloc >= d.memAllocLimit
}

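// newFixedMemUsageChecker builds a checker from absolute byte limits; an
// unspecified spike limit defaults to 20% of the allocation limit.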
func newFixedMemUsageChecker(memAllocLimit, memSpikeLimit uint64) (*memUsageChecker, error) {
	if memSpikeLimit >= memAllocLimit {
		return nil, errMemSpikeLimitOutOfRange
	}
	if memSpikeLimit == 0 {
		// If spike limit is unspecified use 20% of mem limit.
		memSpikeLimit = memAllocLimit / 5
	}
	return &memUsageChecker{
		memAllocLimit: memAllocLimit,
		memSpikeLimit: memSpikeLimit,
	}, nil
}

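// newPercentageMemUsageChecker converts the percentage limits into absolute
// byte limits against the given total memory and delegates to
// newFixedMemUsageChecker.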
func newPercentageMemUsageChecker(totalMemory uint64, percentageLimit, percentageSpike uint64) (*memUsageChecker, error) {
	if percentageLimit > 100 || percentageLimit <= 0 || percentageSpike > 100 || percentageSpike <= 0 {
		return nil, errPercentageLimitOutOfRange
	}
	return newFixedMemUsageChecker(percentageLimit*totalMemory/100, percentageSpike*totalMemory/100)
}