karmada/pkg/estimator/server/server.go

335 lines
13 KiB
Go

package server
import (
"context"
"fmt"
"net"
"sync/atomic"
"time"
"github.com/kr/pretty"
"google.golang.org/grpc"
"google.golang.org/grpc/metadata"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/discovery"
cacheddiscovery "k8s.io/client-go/discovery/cached/memory"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/informers"
infov1 "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/kubernetes"
listv1 "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/restmapper"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2"
"github.com/karmada-io/karmada/cmd/scheduler-estimator/app/options"
"github.com/karmada-io/karmada/pkg/estimator/pb"
"github.com/karmada-io/karmada/pkg/estimator/server/metrics"
nodeutil "github.com/karmada-io/karmada/pkg/estimator/server/nodes"
"github.com/karmada-io/karmada/pkg/estimator/server/replica"
estimatorservice "github.com/karmada-io/karmada/pkg/estimator/service"
"github.com/karmada-io/karmada/pkg/util"
"github.com/karmada-io/karmada/pkg/util/fedinformer"
"github.com/karmada-io/karmada/pkg/util/fedinformer/genericmanager"
"github.com/karmada-io/karmada/pkg/util/fedinformer/keys"
"github.com/karmada-io/karmada/pkg/util/helper"
"github.com/karmada-io/karmada/pkg/util/lifted"
)
const (
nodeNameKeyIndex = "spec.nodeName"
)
var (
// TODO(Garrybest): make it as an option
supportedGVRs = []schema.GroupVersionResource{
appsv1.SchemeGroupVersion.WithResource("deployments"),
}
)
// AccurateSchedulerEstimatorServer is the gRPC server of a cluster accurate scheduler estimator.
// Please see https://github.com/karmada-io/karmada/pull/580 (#580).
type AccurateSchedulerEstimatorServer struct {
port int
clusterName string
kubeClient kubernetes.Interface
restMapper meta.RESTMapper
informerFactory informers.SharedInformerFactory
nodeInformer infov1.NodeInformer
podInformer infov1.PodInformer
nodeLister listv1.NodeLister
replicaLister *replica.ListerWrapper
getPodFunc func(nodeName string) ([]*corev1.Pod, error)
informerManager genericmanager.SingleClusterInformerManager
parallelizer lifted.Parallelizer
}
// NewEstimatorServer creates an instance of AccurateSchedulerEstimatorServer.
func NewEstimatorServer(
kubeClient kubernetes.Interface,
dynamicClient dynamic.Interface,
discoveryClient discovery.DiscoveryInterface,
opts *options.Options,
stopChan <-chan struct{},
) *AccurateSchedulerEstimatorServer {
cachedDiscoClient := cacheddiscovery.NewMemCacheClient(discoveryClient)
restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cachedDiscoClient)
informerFactory := informers.NewSharedInformerFactory(kubeClient, 0)
es := &AccurateSchedulerEstimatorServer{
port: opts.ServerPort,
clusterName: opts.ClusterName,
kubeClient: kubeClient,
restMapper: restMapper,
informerFactory: informerFactory,
nodeInformer: informerFactory.Core().V1().Nodes(),
podInformer: informerFactory.Core().V1().Pods(),
nodeLister: informerFactory.Core().V1().Nodes().Lister(),
replicaLister: &replica.ListerWrapper{
PodLister: informerFactory.Core().V1().Pods().Lister(),
ReplicaSetLister: informerFactory.Apps().V1().ReplicaSets().Lister(),
},
parallelizer: lifted.NewParallelizer(opts.Parallelism),
}
// ignore the error here because the informers haven't been started
_ = es.nodeInformer.Informer().SetTransform(fedinformer.StripUnusedFields)
_ = es.podInformer.Informer().SetTransform(fedinformer.StripUnusedFields)
_ = informerFactory.Apps().V1().ReplicaSets().Informer().SetTransform(fedinformer.StripUnusedFields)
// Establish a connection between the pods and their assigned nodes.
_ = es.podInformer.Informer().AddIndexers(cache.Indexers{
nodeNameKeyIndex: func(obj interface{}) ([]string, error) {
pod, ok := obj.(*corev1.Pod)
if !ok {
return []string{}, nil
}
if len(pod.Spec.NodeName) == 0 {
return []string{}, nil
}
return []string{pod.Spec.NodeName}, nil
},
})
// The indexer helps us get all the pods that assigned to a node.
podIndexer := es.podInformer.Informer().GetIndexer()
es.getPodFunc = func(nodeName string) ([]*corev1.Pod, error) {
objs, err := podIndexer.ByIndex(nodeNameKeyIndex, nodeName)
if err != nil {
return nil, err
}
pods := make([]*corev1.Pod, 0, len(objs))
for _, obj := range objs {
pod, ok := obj.(*corev1.Pod)
if !ok {
continue
}
// Succeeded and failed pods are not considered because they don't occupy any resource.
// See https://github.com/kubernetes/kubernetes/blob/f61ed439882e34d9dad28b602afdc852feb2337a/pkg/scheduler/scheduler.go#L756-L763
if pod.Status.Phase != corev1.PodSucceeded && pod.Status.Phase != corev1.PodFailed {
pods = append(pods, pod)
}
}
return pods, nil
}
es.informerManager = genericmanager.NewSingleClusterInformerManager(dynamicClient, 0, stopChan)
for _, gvr := range supportedGVRs {
es.informerManager.Lister(gvr)
}
return es
}
// Start runs the accurate replica estimator server.
func (es *AccurateSchedulerEstimatorServer) Start(ctx context.Context) error {
stopCh := ctx.Done()
klog.Infof("Starting karmada cluster(%s) accurate scheduler estimator", es.clusterName)
defer klog.Infof("Shutting down cluster(%s) accurate scheduler estimator", es.clusterName)
es.informerFactory.Start(stopCh)
if !es.waitForCacheSync(stopCh) {
return fmt.Errorf("failed to wait for cache sync")
}
es.informerManager.Start()
synced := es.informerManager.WaitForCacheSync()
if synced == nil {
return fmt.Errorf("informer factory for cluster does not exist")
}
// Listen a port and register the gRPC server.
l, err := net.Listen("tcp", fmt.Sprintf(":%d", es.port))
if err != nil {
return fmt.Errorf("failed to listen port %d: %v", es.port, err)
}
klog.Infof("Listening port: %d", es.port)
defer l.Close()
s := grpc.NewServer()
estimatorservice.RegisterEstimatorServer(s, es)
// Graceful stop when the context is cancelled.
go func() {
<-stopCh
s.GracefulStop()
}()
// Start the gRPC server.
if err := s.Serve(l); err != nil {
return err
}
// Should never reach here.
return nil
}
// MaxAvailableReplicas is the implementation of gRPC interface. It will return the
// max available replicas that a cluster could accommodate based on its requirements.
func (es *AccurateSchedulerEstimatorServer) MaxAvailableReplicas(ctx context.Context, request *pb.MaxAvailableReplicasRequest) (response *pb.MaxAvailableReplicasResponse, rerr error) {
md, ok := metadata.FromIncomingContext(ctx)
if !ok {
klog.Warningf("No metadata from context.")
}
var object string
if m := md.Get(string(util.ContextKeyObject)); len(m) != 0 {
object = m[0]
}
defer traceMaxAvailableReplicas(object, time.Now(), request)(&response, &rerr)
if request.Cluster != es.clusterName {
return nil, fmt.Errorf("cluster name does not match, got: %s, desire: %s", request.Cluster, es.clusterName)
}
// Step 1: Get all matched nodes by node claim
startTime := time.Now()
ncw, err := nodeutil.NewNodeClaimWrapper(request.ReplicaRequirements.NodeClaim)
if err != nil {
return nil, fmt.Errorf("failed to new node claim wrapper: %v", err)
}
nodes, err := ncw.ListNodesByLabelSelector(es.nodeLister)
if err != nil {
return nil, fmt.Errorf("failed to list matched nodes by label selector: %v", err)
}
metrics.UpdateEstimatingAlgorithmLatency(err, metrics.EstimatingTypeMaxAvailableReplicas, metrics.EstimatingStepListNodesByNodeClaim, startTime)
// Step 2: Calculate cluster max available replicas by filtered nodes concurrently
startTime = time.Now()
maxReplicas := es.maxAvailableReplicas(ctx, ncw, nodes, request.ReplicaRequirements.ResourceRequest)
metrics.UpdateEstimatingAlgorithmLatency(nil, metrics.EstimatingTypeMaxAvailableReplicas, metrics.EstimatingStepMaxAvailableReplicas, startTime)
return &pb.MaxAvailableReplicasResponse{MaxReplicas: maxReplicas}, nil
}
// GetUnschedulableReplicas is the implementation of gRPC interface. It will return the
// unschedulable replicas of a workload.
func (es *AccurateSchedulerEstimatorServer) GetUnschedulableReplicas(ctx context.Context, request *pb.UnschedulableReplicasRequest) (response *pb.UnschedulableReplicasResponse, rerr error) {
md, ok := metadata.FromIncomingContext(ctx)
if !ok {
klog.Warningf("No metadata from context.")
}
var object string
if m := md.Get(string(util.ContextKeyObject)); len(m) != 0 {
object = m[0]
}
defer traceGetUnschedulableReplicas(object, time.Now(), request)(&response, &rerr)
if request.Cluster != es.clusterName {
return nil, fmt.Errorf("cluster name does not match, got: %s, desire: %s", request.Cluster, es.clusterName)
}
// Get the workload.
startTime := time.Now()
gvk := schema.FromAPIVersionAndKind(request.Resource.APIVersion, request.Resource.Kind)
unstructObj, err := helper.GetObjectFromSingleClusterCache(es.restMapper, es.informerManager, &keys.ClusterWideKey{
Group: gvk.Group,
Version: gvk.Version,
Kind: gvk.Kind,
Namespace: request.Resource.Namespace,
Name: request.Resource.Name,
})
metrics.UpdateEstimatingAlgorithmLatency(err, metrics.EstimatingTypeGetUnschedulableReplicas, metrics.EstimatingStepGetObjectFromCache, startTime)
if err != nil {
return nil, err
}
// List all unschedulable replicas.
startTime = time.Now()
unschedulables, err := replica.GetUnschedulablePodsOfWorkload(unstructObj, request.UnschedulableThreshold, es.replicaLister)
metrics.UpdateEstimatingAlgorithmLatency(err, metrics.EstimatingTypeGetUnschedulableReplicas, metrics.EstimatingStepGetUnschedulablePodsOfWorkload, startTime)
if err != nil {
return nil, err
}
return &pb.UnschedulableReplicasResponse{UnschedulableReplicas: unschedulables}, err
}
func (es *AccurateSchedulerEstimatorServer) maxAvailableReplicas(
ctx context.Context,
ncw *nodeutil.NodeClaimWrapper,
nodes []*corev1.Node,
request corev1.ResourceList,
) int32 {
var res int32
processNode := func(i int) {
node := nodes[i]
if node == nil {
return
}
if !ncw.IsNodeMatched(node) {
return
}
maxReplica, err := es.nodeMaxAvailableReplica(node, request)
if err != nil {
klog.Errorf("Error: %v", err)
return
}
atomic.AddInt32(&res, maxReplica)
}
es.parallelizer.Until(ctx, len(nodes), processNode)
return res
}
func (es *AccurateSchedulerEstimatorServer) nodeMaxAvailableReplica(node *corev1.Node, request corev1.ResourceList) (int32, error) {
pods, err := es.getPodFunc(node.Name)
if err != nil {
return 0, fmt.Errorf("failed to get pods that assigned to node %s, err: %v", node.Name, err)
}
maxReplica, err := replica.NodeMaxAvailableReplica(node, pods, request)
if err != nil {
return 0, fmt.Errorf("failed to calculating max replica of node %s, err: %v", node.Name, err)
}
return maxReplica, nil
}
func (es *AccurateSchedulerEstimatorServer) waitForCacheSync(stopCh <-chan struct{}) bool {
return cache.WaitForCacheSync(stopCh, es.podInformer.Informer().HasSynced, es.nodeInformer.Informer().HasSynced)
}
func traceMaxAvailableReplicas(object string, start time.Time, request *pb.MaxAvailableReplicasRequest) func(response **pb.MaxAvailableReplicasResponse, err *error) {
klog.V(4).Infof("Begin calculating cluster available replicas of resource(%s), request: %s", object, pretty.Sprint(*request))
return func(response **pb.MaxAvailableReplicasResponse, err *error) {
metrics.CountRequests(*err, metrics.EstimatingTypeMaxAvailableReplicas)
metrics.UpdateEstimatingAlgorithmLatency(*err, metrics.EstimatingTypeMaxAvailableReplicas, metrics.EstimatingStepTotal, start)
if *err != nil {
klog.Errorf("Failed to calculate cluster available replicas: %v", *err)
return
}
klog.V(2).Infof("Finish calculating cluster available replicas of resource(%s), max replicas: %d, time elapsed: %s", object, (*response).MaxReplicas, time.Since(start))
}
}
func traceGetUnschedulableReplicas(object string, start time.Time, request *pb.UnschedulableReplicasRequest) func(response **pb.UnschedulableReplicasResponse, err *error) {
klog.V(4).Infof("Begin detecting cluster unschedulable replicas of resource(%s), request: %s", object, pretty.Sprint(*request))
return func(response **pb.UnschedulableReplicasResponse, err *error) {
metrics.CountRequests(*err, metrics.EstimatingTypeGetUnschedulableReplicas)
metrics.UpdateEstimatingAlgorithmLatency(*err, metrics.EstimatingTypeGetUnschedulableReplicas, metrics.EstimatingStepTotal, start)
if *err != nil {
klog.Errorf("Failed to detect cluster unschedulable replicas: %v", *err)
return
}
klog.V(2).Infof("Finish detecting cluster unschedulable replicas of resource(%s), unschedulable replicas: %d, time elapsed: %s", object, (*response).UnschedulableReplicas, time.Since(start))
}
}