// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package exporterhelper // import "go.opentelemetry.io/collector/exporter/exporterhelper"

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
	"go.uber.org/zap"

	"go.opentelemetry.io/collector/consumer/consumererror"
	"go.opentelemetry.io/collector/exporter"
	"go.opentelemetry.io/collector/exporter/exporterhelper/internal"
	"go.opentelemetry.io/collector/internal/obsreportconfig/obsmetrics"
)

// RetrySettings defines configuration for retrying batches in case of export failure.
// The current supported strategy is exponential backoff.
type RetrySettings struct {
	// Enabled indicates whether to retry sending batches in case of export failure.
	Enabled bool `mapstructure:"enabled"`
	// InitialInterval is the time to wait after the first failure before retrying.
	InitialInterval time.Duration `mapstructure:"initial_interval"`
	// RandomizationFactor is a random factor used to calculate the next backoff interval:
	// Randomized interval = RetryInterval * (1 ± RandomizationFactor)
	RandomizationFactor float64 `mapstructure:"randomization_factor"`
	// Multiplier is the value multiplied by the backoff interval bounds.
	Multiplier float64 `mapstructure:"multiplier"`
	// MaxInterval is the upper bound on backoff interval. Once this value is reached the delay between
	// consecutive retries will always be `MaxInterval`.
	MaxInterval time.Duration `mapstructure:"max_interval"`
	// MaxElapsedTime is the maximum amount of time (including retries) spent trying to send a request/batch.
	// Once this value is reached, the data is discarded. If set to 0, the retries are never stopped.
	MaxElapsedTime time.Duration `mapstructure:"max_elapsed_time"`
}

func (cfg *RetrySettings) Validate() error {
	if !cfg.Enabled {
		return nil
	}
	if cfg.InitialInterval < 0 {
		return errors.New("'initial_interval' must be non-negative")
	}
	if cfg.RandomizationFactor < 0 || cfg.RandomizationFactor > 1 {
		return errors.New("'randomization_factor' must be within [0, 1]")
	}
	if cfg.Multiplier <= 0 {
		return errors.New("'multiplier' must be positive")
	}
	if cfg.MaxInterval < 0 {
		return errors.New("'max_interval' must be non-negative")
	}
	if cfg.MaxElapsedTime < 0 {
		return errors.New("'max_elapsed_time' must be non-negative")
	}
	return nil
}
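
// A small illustration of Validate's behavior (the values are made up for the
// example): an out-of-range randomization factor is rejected, while a disabled
// config passes without further checks.
//
//	cfg := NewDefaultRetrySettings()
//	cfg.RandomizationFactor = 1.5
//	err := cfg.Validate() // error: 'randomization_factor' must be within [0, 1]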

// NewDefaultRetrySettings returns the default settings for RetrySettings.
func NewDefaultRetrySettings() RetrySettings {
	return RetrySettings{
		Enabled:             true,
		InitialInterval:     5 * time.Second,
		RandomizationFactor: backoff.DefaultRandomizationFactor,
		Multiplier:          backoff.DefaultMultiplier,
		MaxInterval:         30 * time.Second,
		MaxElapsedTime:      5 * time.Minute,
	}
}
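
// A hedged illustration of how these settings are typically populated: an
// exporter's `retry_on_failure` config block unmarshals into RetrySettings
// via the mapstructure tags above. The exporter name is an assumption for
// the example; the values shown match the defaults returned above:
//
//	exporters:
//	  otlp:
//	    retry_on_failure:
//	      enabled: true
//	      initial_interval: 5s
//	      randomization_factor: 0.5
//	      multiplier: 1.5
//	      max_interval: 30s
//	      max_elapsed_time: 5m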

// TODO: Clean this by forcing all exporters to return an internal error type that always includes the information about retries.
type throttleRetry struct {
	err   error
	delay time.Duration
}

func (t throttleRetry) Error() string {
	return "Throttle (" + t.delay.String() + "), error: " + t.err.Error()
}

func (t throttleRetry) Unwrap() error {
	return t.err
}

// NewThrottleRetry creates a new throttle retry error.
func NewThrottleRetry(err error, delay time.Duration) error {
	return throttleRetry{
		err:   err,
		delay: delay,
	}
}
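
// A sketch of the intended use, assuming a hypothetical HTTP exporter that
// maps a server's 429 response onto a throttle delay (resp and exportErr are
// illustrative, not part of this package):
//
//	if resp.StatusCode == http.StatusTooManyRequests {
//		if secs, perr := strconv.Atoi(resp.Header.Get("Retry-After")); perr == nil {
//			return NewThrottleRetry(exportErr, time.Duration(secs)*time.Second)
//		}
//	}
//
// retrySender.send then waits for the larger of the computed backoff delay
// and the requested throttle delay before the next attempt.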

type retrySender struct {
	baseRequestSender
	traceAttribute attribute.KeyValue
	cfg            RetrySettings
	stopCh         chan struct{}
	logger         *zap.Logger
}

func newRetrySender(config RetrySettings, set exporter.CreateSettings) *retrySender {
	return &retrySender{
		traceAttribute: attribute.String(obsmetrics.ExporterKey, set.ID.String()),
		cfg:            config,
		stopCh:         make(chan struct{}),
		logger:         set.Logger,
	}
}

// Shutdown signals any in-flight backoff waits to give up by closing stopCh.
func (rs *retrySender) Shutdown(context.Context) error {
	close(rs.stopCh)
	return nil
}

// send implements the requestSender interface.
func (rs *retrySender) send(ctx context.Context, req Request) error {
	// Do not use NewExponentialBackOff since it calls Reset and the code here must
	// call Reset after changing the InitialInterval (this saves an unnecessary call to Now).
	expBackoff := backoff.ExponentialBackOff{
		InitialInterval:     rs.cfg.InitialInterval,
		RandomizationFactor: rs.cfg.RandomizationFactor,
		Multiplier:          rs.cfg.Multiplier,
		MaxInterval:         rs.cfg.MaxInterval,
		MaxElapsedTime:      rs.cfg.MaxElapsedTime,
		Stop:                backoff.Stop,
		Clock:               backoff.SystemClock,
	}
	expBackoff.Reset()
	span := trace.SpanFromContext(ctx)
	retryNum := int64(0)
	for {
		span.AddEvent(
			"Sending request.",
			trace.WithAttributes(rs.traceAttribute, attribute.Int64("retry_num", retryNum)))

		err := rs.nextSender.send(ctx, req)
		if err == nil {
			return nil
		}

		// Immediately drop data on permanent errors.
		if consumererror.IsPermanent(err) {
			rs.logger.Error(
				"Exporting failed. The error is not retryable. Dropping data.",
				zap.Error(err),
				zap.Int("dropped_items", req.ItemsCount()),
			)
			return err
		}

		// Keep only the items that failed so the next attempt does not resend
		// data that was already accepted.
		req = extractPartialRequest(req, err)

		backoffDelay := expBackoff.NextBackOff()
		if backoffDelay == backoff.Stop {
			return fmt.Errorf("max elapsed time expired %w", err)
		}

		// If the server asked to be throttled, wait at least as long as it requested.
		throttleErr := throttleRetry{}
		if errors.As(err, &throttleErr) {
			backoffDelay = max(backoffDelay, throttleErr.delay)
		}

		backoffDelayStr := backoffDelay.String()
		span.AddEvent(
			"Exporting failed. Will retry the request after interval.",
			trace.WithAttributes(
				rs.traceAttribute,
				attribute.String("interval", backoffDelayStr),
				attribute.String("error", err.Error())))
		rs.logger.Info(
			"Exporting failed. Will retry the request after interval.",
			zap.Error(err),
			zap.String("interval", backoffDelayStr),
		)
		retryNum++

		// Back off, but get interrupted when shutting down or if the request is cancelled or timed out.
		select {
		case <-ctx.Done():
			return fmt.Errorf("request is cancelled or timed out %w", err)
		case <-rs.stopCh:
			return internal.NewShutdownErr(err)
		case <-time.After(backoffDelay):
		}
	}
}
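
// For intuition with the defaults above: the un-jittered delays grow as
// 5s, 7.5s, 11.25s, 16.875s, ... (multiplier 1.5), are capped at 30s per
// attempt (MaxInterval), and each is jittered by ±50% (randomization
// factor 0.5). Retrying stops entirely once roughly 5m (MaxElapsedTime)
// have elapsed.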

// max returns the larger of x or y.
func max(x, y time.Duration) time.Duration {
	if x < y {
		return y
	}
	return x
}