/*
Copyright 2024 The Karmada Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package coredns

import (
	"context"
	"fmt"
	"net"
	"os"
	"time"

	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1 "k8s.io/client-go/kubernetes/typed/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/leaderelection"
	"k8s.io/client-go/tools/leaderelection/resourcelock"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"
	componentbaseconfig "k8s.io/component-base/config"
	"k8s.io/klog/v2"

	karmada "github.com/karmada-io/karmada/pkg/generated/clientset/versioned"
	"github.com/karmada-io/karmada/pkg/servicenameresolutiondetector/store"
)

const (
	name                              = "coredns-detector"
	condType                          = "ServiceDomainNameResolutionReady"
	serviceDomainNameResolutionReady  = "ServiceDomainNameResolutionReady"
	serviceDomainNameResolutionFailed = "ServiceDomainNameResolutionFailed"
)
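// localReference identifies the Pod running this detector, resolved from the
// POD_NAME and POD_NAMESPACE environment variables; warning events emitted by
// the probe loop are recorded against it.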
var localReference = &corev1.ObjectReference{
	APIVersion: "v1",
	Kind:       "Pod",
	Name:       os.Getenv("POD_NAME"),
	Namespace:  os.Getenv("POD_NAMESPACE"),
}
// Config holds the probe interval and the thresholds used by the coredns detector.
type Config struct {
	PeriodSeconds    time.Duration
	SuccessThreshold time.Duration
	FailureThreshold time.Duration
	StaleThreshold   time.Duration
}

// Detector detects DNS failure and syncs conditions to the control plane periodically.
type Detector struct {
	memberClusterClient kubernetes.Interface
	karmadaClient       karmada.Interface

	lec leaderelection.LeaderElectionConfig

	periodSeconds time.Duration

	conditionCache store.ConditionCache
	conditionStore store.ConditionStore
	cacheSynced    []cache.InformerSynced

	nodeName    string
	clusterName string

	queue            workqueue.RateLimitingInterface
	eventBroadcaster record.EventBroadcaster
	eventRecorder    record.EventRecorder
}
// NewCorednsDetector returns an instance of the coredns detector.
func NewCorednsDetector(memberClusterClient kubernetes.Interface, karmadaClient karmada.Interface, informers informers.SharedInformerFactory,
	baselec componentbaseconfig.LeaderElectionConfiguration, cfg *Config, hostName, clusterName string) (*Detector, error) {
	broadcaster := record.NewBroadcaster()
	recorder := broadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: name})
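	// Create a resource lock only when leader election is enabled; otherwise
	// every detector instance syncs the aggregated condition on its own.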
	var rl resourcelock.Interface
	var err error
	if baselec.LeaderElect {
		rl, err = resourcelock.New(
			baselec.ResourceLock,
			baselec.ResourceNamespace,
			baselec.ResourceName+"-"+name,
			memberClusterClient.CoreV1(),
			memberClusterClient.CoordinationV1(),
			resourcelock.ResourceLockConfig{Identity: hostName},
		)
		if err != nil {
			return nil, err
		}
	}

	nodeInformer := informers.Core().V1().Nodes()
	return &Detector{
		memberClusterClient: memberClusterClient,
		karmadaClient:       karmadaClient,
		nodeName:            hostName,
		clusterName:         clusterName,
		periodSeconds:       cfg.PeriodSeconds,
		conditionCache:      store.NewConditionCache(cfg.SuccessThreshold, cfg.FailureThreshold),
		conditionStore:      store.NewNodeConditionStore(memberClusterClient.CoreV1().Nodes(), nodeInformer.Lister(), condType, cfg.StaleThreshold),
		cacheSynced:         []cache.InformerSynced{nodeInformer.Informer().HasSynced},
		eventBroadcaster:    broadcaster,
		eventRecorder:       recorder,
		queue:               workqueue.NewRateLimitingQueueWithConfig(workqueue.DefaultControllerRateLimiter(), workqueue.RateLimitingQueueConfig{Name: name}),
		lec: leaderelection.LeaderElectionConfig{
			Lock:          rl,
			LeaseDuration: baselec.LeaseDuration.Duration,
			RenewDeadline: baselec.RenewDeadline.Duration,
			RetryPeriod:   baselec.RetryPeriod.Duration,
		}}, nil
}

// Run starts the detector.
func (d *Detector) Run(ctx context.Context) {
	defer runtime.HandleCrash()

	d.eventBroadcaster.StartStructuredLogging(0)
	d.eventBroadcaster.StartRecordingToSink(&v1.EventSinkImpl{Interface: d.memberClusterClient.CoreV1().Events("")})
	defer d.eventBroadcaster.Shutdown()

	defer d.queue.ShutDown()

	logger := klog.FromContext(ctx)
	logger.Info("Starting coredns detector")
	defer logger.Info("Shutting down coredns detector")

	if !cache.WaitForCacheSync(ctx.Done(), d.cacheSynced...) {
		return
	}
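	// Probe loop: periodically resolve the in-cluster service domain name,
	// fold the observation into the threshold-adjusted condition for this
	// node, store it, and enqueue a sync. The queue always carries the same
	// fixed key, so repeated probes collapse into a single pending sync.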
	go func() {
		wait.Until(func() {
			defer runtime.HandleCrash()

			observed := lookupOnce(logger)
			curr, err := d.conditionStore.Load(d.nodeName)
			if err != nil {
				d.eventRecorder.Eventf(localReference, corev1.EventTypeWarning, "LoadCorednsConditionFailed", "failed to load condition: %v", err)
				return
			}

			cond := d.conditionCache.ThresholdAdjustedCondition(d.nodeName, curr, observed)
			if err = d.conditionStore.Store(d.nodeName, cond); err != nil {
				d.eventRecorder.Eventf(localReference, corev1.EventTypeWarning, "StoreCorednsConditionFailed", "failed to store condition: %v", err)
				return
			}
			d.queue.Add(0)
		}, d.periodSeconds, ctx.Done())
	}()
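	// Sync loop: when leader election is enabled, only the elected leader
	// pushes the aggregated condition to the control plane; otherwise every
	// instance runs the worker directly.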
	if d.lec.Lock != nil {
		d.lec.Callbacks = leaderelection.LeaderCallbacks{
			OnStartedLeading: func(ctx context.Context) {
				wait.UntilWithContext(ctx, d.worker, time.Second)
			},
			OnStoppedLeading: func() {
				logger.Error(nil, "leader election lost")
				klog.FlushAndExit(klog.ExitFlushTimeout, 1)
			},
		}
		leaderelection.RunOrDie(ctx, d.lec)
	} else {
		wait.UntilWithContext(ctx, d.worker, time.Second)
	}
}
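// worker keeps processing items from the work queue until the queue shuts down.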
func (d *Detector) worker(ctx context.Context) {
	for d.processNextWorkItem(ctx) {
	}
}
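// processNextWorkItem handles one queue item: it syncs the coredns condition to
// the control plane and requeues the item with rate limiting if the sync fails.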
func (d *Detector) processNextWorkItem(ctx context.Context) bool {
	key, quit := d.queue.Get()
	if quit {
		return false
	}
	defer d.queue.Done(key)

	if err := d.sync(ctx); err != nil {
		runtime.HandleError(fmt.Errorf("failed to sync coredns condition to control plane, requeuing: %v", err))
		d.queue.AddRateLimited(key)
	} else {
		d.queue.Forget(key)
	}
	return true
}
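// lookupOnce resolves the in-cluster service name "kubernetes.default" once and
// reports the result as a condition.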
func lookupOnce(logger klog.Logger) *metav1.Condition {
	logger.Info("Looking up service domain name")
	observed := &metav1.Condition{Type: condType, LastTransitionTime: metav1.Now()}
	if _, err := net.LookupHost("kubernetes.default"); err != nil {
		logger.Error(err, "Service domain name lookup failed")
		observed.Status = metav1.ConditionFalse
		observed.Reason = serviceDomainNameResolutionFailed
		observed.Message = err.Error()
	} else {
		observed.Status = metav1.ConditionTrue
	}
	return observed
}
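// sync aggregates the node-level coredns conditions and updates the
// ServiceDomainNameResolutionReady condition in the Cluster object's status.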
func (d *Detector) sync(ctx context.Context) error {
	logger := klog.FromContext(ctx)
	skip, alarm, err := d.shouldAlarm()
	if err != nil {
		return err
	}
	if skip {
		logger.Info("Skip syncing coredns condition to control plane: node conditions are absent or Unknown")
		return nil
	}

	cond, err := d.newClusterCondition(alarm)
	if err != nil {
		return err
	}

	cluster, err := d.karmadaClient.ClusterV1alpha1().Clusters().Get(ctx, d.clusterName, metav1.GetOptions{})
	if err != nil {
		if apierrors.IsNotFound(err) {
			logger.Info(fmt.Sprintf("cluster %s not found, skip syncing coredns condition", d.clusterName))
			return nil
		}
		return err
	}
	if !cluster.DeletionTimestamp.IsZero() {
		logger.Info(fmt.Sprintf("cluster %s is being deleted, skip syncing coredns condition", d.clusterName))
		return nil
	}
	meta.SetStatusCondition(&cluster.Status.Conditions, *cond)
	_, err = d.karmadaClient.ClusterV1alpha1().Clusters().UpdateStatus(ctx, cluster, metav1.UpdateOptions{})
	return err
}
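// newClusterCondition builds the cluster-scoped condition that reflects whether
// service domain name resolution is healthy across the member cluster's nodes.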
func (d *Detector) newClusterCondition(alarm bool) (*metav1.Condition, error) {
	cond := &metav1.Condition{Type: condType}
	if alarm {
		cond.Status = metav1.ConditionFalse
		cond.Reason = serviceDomainNameResolutionFailed
		cond.Message = "service domain name resolution is unready"
	} else {
		cond.Status = metav1.ConditionTrue
		cond.Reason = serviceDomainNameResolutionReady
		cond.Message = "service domain name resolution is ready"
	}
	return cond, nil
}
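// shouldAlarm inspects the node conditions collected so far: skip is true when
// no conditions exist yet or any node reports Unknown, and alarm is true when
// no node reports a healthy (True) condition.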
func (d *Detector) shouldAlarm() (skip bool, alarm bool, err error) {
	conditions, err := d.conditionStore.ListAll()
	if err != nil {
		return false, false, err
	}
	if len(conditions) == 0 {
		return true, false, nil
	}

	hasUnknown, allFalse := false, true
	for _, cond := range conditions {
		switch cond.Status {
		case metav1.ConditionUnknown:
			hasUnknown = true
		case metav1.ConditionFalse:
		default:
			allFalse = false
		}
	}
	return hasUnknown, allFalse, nil
}