Add ScaleExpectation for BroadcastJob

Guo, Fei 2020-11-18 14:23:35 -08:00 committed by Siyu Wang
parent 90bdd3b6d6
commit 68d501d8ca
3 changed files with 144 additions and 58 deletions


@@ -33,6 +33,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/client-go/tools/record"
@@ -48,6 +49,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"
"github.com/openkruise/kruise/pkg/util/expectations"
)
func init() {
@@ -62,6 +65,7 @@ const (
var (
concurrentReconciles = 3
controllerKind = appsv1alpha1.SchemeGroupVersion.WithKind("BroadcastJob")
scaleExpectations = expectations.NewScaleExpectations()
)
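The scaleExpectations variable added above is shared, package-level state: it tracks pod creates and deletes the controller has issued but the informer cache has not yet reflected. The calls used throughout this commit (ExpectScale, ObserveScale, SatisfiedExpectations, DeleteExpectations) suggest the following contract. This is a simplified, hypothetical sketch, not the real pkg/util/expectations implementation, and the timeout value is illustrative.

// Hypothetical, simplified sketch of the expectations contract relied on in
// this commit; the real implementation lives in pkg/util/expectations.
package expectations

import (
	"sync"
	"time"
)

type ScaleAction string

const (
	Create ScaleAction = "create"
	Delete ScaleAction = "delete"
)

// ExpectationTimeout mirrors the constant referenced in Reconcile below; the
// value here is illustrative.
const ExpectationTimeout = 5 * time.Minute

// ScaleExpectations records, per controller key, the pod operations that have
// been issued to the API server but not yet observed via the informer.
type ScaleExpectations struct {
	mu    sync.Mutex
	cache map[string]map[ScaleAction]map[string]time.Time // key -> action -> item -> recorded at
}

func NewScaleExpectations() *ScaleExpectations {
	return &ScaleExpectations{cache: map[string]map[ScaleAction]map[string]time.Time{}}
}

// ExpectScale is called immediately before a create/delete request is sent.
func (e *ScaleExpectations) ExpectScale(key string, action ScaleAction, item string) {
	e.mu.Lock()
	defer e.mu.Unlock()
	if e.cache[key] == nil {
		e.cache[key] = map[ScaleAction]map[string]time.Time{}
	}
	if e.cache[key][action] == nil {
		e.cache[key][action] = map[string]time.Time{}
	}
	e.cache[key][action][item] = time.Now()
}

// ObserveScale clears an expectation, either because the informer delivered
// the corresponding event or because the API request failed.
func (e *ScaleExpectations) ObserveScale(key string, action ScaleAction, item string) {
	e.mu.Lock()
	defer e.mu.Unlock()
	if items := e.cache[key][action]; items != nil {
		delete(items, item)
	}
}

// SatisfiedExpectations reports whether every recorded operation for key has
// been observed; if not, it also returns how long the oldest one has been
// pending and which items are still dirty.
func (e *ScaleExpectations) SatisfiedExpectations(key string) (bool, time.Duration, map[ScaleAction][]string) {
	e.mu.Lock()
	defer e.mu.Unlock()
	dirty := map[ScaleAction][]string{}
	var oldest time.Duration
	for action, items := range e.cache[key] {
		for item, recordedAt := range items {
			dirty[action] = append(dirty[action], item)
			if pending := time.Since(recordedAt); pending > oldest {
				oldest = pending
			}
		}
	}
	return len(dirty) == 0, oldest, dirty
}

// DeleteExpectations drops all state for a job that no longer exists.
func (e *ScaleExpectations) DeleteExpectations(key string) {
	e.mu.Lock()
	defer e.mu.Unlock()
	delete(e.cache, key)
}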
// Add creates a new BroadcastJob Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller
@@ -99,16 +103,18 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error {
return err
}
err = c.Watch(&source.Kind{Type: &corev1.Pod{}}, &handler.EnqueueRequestForOwner{
IsController: true,
OwnerType: &appsv1alpha1.BroadcastJob{},
// Watch for changes to Pod
err = c.Watch(&source.Kind{Type: &corev1.Pod{}}, &podEventHandler{
enqueueHandler: handler.EnqueueRequestForOwner{
IsController: true,
OwnerType: &appsv1alpha1.BroadcastJob{},
},
})
if err != nil {
return err
}
// Watch for changes to Pod
// Watch for changes to Node
if err = c.Watch(&source.Kind{Type: &corev1.Node{}}, &enqueueBroadcastJobForNode{client: mgr.GetClient()}); err != nil {
return err
}
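The pod watch now goes through podEventHandler (defined later in this commit) instead of the bare EnqueueRequestForOwner, so the controller can observe create and delete events against scaleExpectations before enqueueing. Anything passed to c.Watch must satisfy controller-runtime's handler.EventHandler; in the controller-runtime version used here its shape matches the four methods podEventHandler implements below:

package sketch

import (
	"k8s.io/client-go/util/workqueue"
	"sigs.k8s.io/controller-runtime/pkg/event"
)

// EventHandler as implemented by podEventHandler later in this commit; the
// signatures match the methods defined there.
type EventHandler interface {
	Create(event.CreateEvent, workqueue.RateLimitingInterface)
	Update(event.UpdateEvent, workqueue.RateLimitingInterface)
	Delete(event.DeleteEvent, workqueue.RateLimitingInterface)
	Generic(event.GenericEvent, workqueue.RateLimitingInterface)
}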
@@ -143,12 +149,23 @@ func (r *ReconcileBroadcastJob) Reconcile(request reconcile.Request) (reconcile.
if errors.IsNotFound(err) {
// Object not found, return. Created objects are automatically garbage collected.
// For additional cleanup logic use finalizers.
scaleExpectations.DeleteExpectations(request.String())
return reconcile.Result{}, nil
}
// Error reading the object - requeue the request.
klog.Errorf("failed to get job %s", job.Name)
return reconcile.Result{}, err
}
if scaleSatisfied, unsatisfiedDuration, scaleDirtyPods := scaleExpectations.SatisfiedExpectations(request.String()); !scaleSatisfied {
if unsatisfiedDuration >= expectations.ExpectationTimeout {
klog.Warningf("Expectation unsatisfied overtime for bcj %v, scaleDirtyPods=%v, overtime=%v", request.String(), scaleDirtyPods, unsatisfiedDuration)
return reconcile.Result{}, nil
}
klog.V(4).Infof("Not satisfied scale for bcj %v, scaleDirtyPods=%v", request.String(), scaleDirtyPods)
return reconcile.Result{RequeueAfter: expectations.ExpectationTimeout - unsatisfiedDuration}, nil
}
// Add pre-defined labels to pod template
addLabelToPodTemplate(job)
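The new block above gates reconciliation: while any expected create or delete is unobserved, the reconcile is skipped and requeued for exactly the time left before the expectation expires; once overdue, a warning is logged and the expectation abandoned (returning nil rather than an error, so the workqueue does not hot-retry). A hypothetical walk-through of one create expectation using those same calls; the output comments are illustrative:

package main

import (
	"fmt"

	"github.com/openkruise/kruise/pkg/util/expectations"
)

func main() {
	exp := expectations.NewScaleExpectations()
	key := "default/job1" // request.String() in the controller

	// Reconcile decides to create a pod on node1 and records that first.
	exp.ExpectScale(key, expectations.Create, "node1")

	// A reconcile that fires before the informer catches up is gated.
	ok, pending, dirty := exp.SatisfiedExpectations(key)
	fmt.Println(ok, pending, dirty) // false, <time since ExpectScale>, the dirty node

	// podEventHandler.Create sees the new pod and observes the expectation.
	exp.ObserveScale(key, expectations.Create, "node1")

	ok, _, _ = exp.SatisfiedExpectations(key)
	fmt.Println(ok) // true: reconciliation now runs against a fresh cache
}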
@@ -410,7 +427,7 @@ func (r *ReconcileBroadcastJob) reconcilePods(job *appsv1alpha1.BroadcastJob,
defer wait.Done()
// parallelize pod creation
klog.Infof("creating pod on node %s", nodeName)
err := r.createPodsOnNode(nodeName, job.Namespace, &job.Spec.Template, job, asOwner(job))
err := r.createPodOnNode(nodeName, job.Namespace, &job.Spec.Template, job, asOwner(job))
if err != nil && errors.IsTimeout(err) {
// Pod is created but its initialization has timed out.
// If the initialization is successful eventually, the
@@ -622,7 +639,10 @@ func (r *ReconcileBroadcastJob) deleteJobPods(job *appsv1alpha1.BroadcastJob, po
for i := int32(0); i < int32(nbPods); i++ {
go func(ix int32) {
defer wait.Done()
key := types.NamespacedName{Namespace: job.Namespace, Name: job.Name}.String()
scaleExpectations.ExpectScale(key, expectations.Delete, pods[ix].Spec.NodeName)
if err := r.Delete(context.TODO(), pods[ix]); err != nil {
scaleExpectations.ObserveScale(key, expectations.Delete, pods[ix].Spec.NodeName)
defer utilruntime.HandleError(err)
klog.Infof("Failed to delete %v, job %q/%q", pods[ix].Name, job.Namespace, job.Name)
errCh <- err
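Note the ordering in the delete path above: the expectation is recorded before r.Delete, then immediately observed back if the call fails, since no informer delete event will ever arrive for a pod that was not actually deleted. A standalone sketch of that failure path, where fakeDelete stands in for the API call:

package main

import (
	"errors"
	"fmt"

	"github.com/openkruise/kruise/pkg/util/expectations"
)

// fakeDelete stands in for r.Delete failing against the API server.
func fakeDelete() error { return errors.New("connection refused") }

func main() {
	exp := expectations.NewScaleExpectations()
	key := "default/job1"

	exp.ExpectScale(key, expectations.Delete, "node1")
	if err := fakeDelete(); err != nil {
		// Roll the record back so later reconciles are not gated until the
		// expectation times out.
		exp.ObserveScale(key, expectations.Delete, "node1")
	}
	ok, _, _ := exp.SatisfiedExpectations(key)
	fmt.Println(ok) // true: the failed delete left no dangling expectation
}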
@@ -644,14 +664,14 @@ func (r *ReconcileBroadcastJob) deleteJobPods(job *appsv1alpha1.BroadcastJob, po
return failed, active, manageJobErr
}
func (r *ReconcileBroadcastJob) createPodsOnNode(nodeName, namespace string, template *corev1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
func (r *ReconcileBroadcastJob) createPodOnNode(nodeName, namespace string, template *corev1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
if err := validateControllerRef(controllerRef); err != nil {
return err
}
return r.createPods(nodeName, namespace, template, object, controllerRef)
return r.createPod(nodeName, namespace, template, object, controllerRef)
}
func (r *ReconcileBroadcastJob) createPods(nodeName, namespace string, template *corev1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
func (r *ReconcileBroadcastJob) createPod(nodeName, namespace string, template *corev1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
pod, err := kubecontroller.GetPodFromTemplate(template, object, controllerRef)
if err != nil {
return err
@@ -668,7 +688,13 @@ func (r *ReconcileBroadcastJob) createPods(nodeName, namespace string, template
if r.podModifier != nil {
r.podModifier(pod)
}
key := types.NamespacedName{Namespace: namespace, Name: controllerRef.Name}.String()
// Pod.Name is empty since the Pod uses a generated name. We use nodeName as the unique identity
// since each node should only contain one job Pod.
scaleExpectations.ExpectScale(key, expectations.Create, nodeName)
if err := r.Client.Create(context.TODO(), pod); err != nil {
scaleExpectations.ObserveScale(key, expectations.Create, nodeName)
r.recorder.Eventf(object, corev1.EventTypeWarning, kubecontroller.FailedCreatePodReason, "Error creating: %v", err)
return err
}
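The comment above is the reason expectations for this controller are keyed by node rather than by pod name: the job's pods are created with metadata.generateName, so there is no final name to record at the moment ExpectScale must run. A small illustration; the names are hypothetical:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace:    "default",
			GenerateName: "job1-", // the API server appends a random suffix
		},
		Spec: corev1.PodSpec{NodeName: "node1"},
	}
	// Before the create call there is no stable pod name to key on, but the
	// target node is already fixed, and a BroadcastJob runs at most one pod
	// per node.
	fmt.Printf("name=%q node=%q\n", pod.Name, pod.Spec.NodeName) // name="" node="node1"
}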


@@ -114,10 +114,10 @@ func TestReconcileJobCreatePodPercentage(t *testing.T) {
p := intstr.FromString("40%")
// A job
job1 := createJob("job1", p)
job := createJob("job2", p)
// A POD for job1 running on node1
job1Pod1onNode1 := createPod(job1, "job1pod1node1", "node1", v1.PodRunning)
// A POD for job running on node1
jobPod1onNode1 := createPod(job, "jobpod1node1", "node1", v1.PodRunning)
// Node1 has 1 pod running
node1 := createNode("node1")
@@ -130,11 +130,11 @@ func TestReconcileJobCreatePodPercentage(t *testing.T) {
// Node5 does not have a pod running
node5 := createNode("node5")
reconcileJob := createReconcileJob(scheme, job1, job1Pod1onNode1, node1, node2, node3, node4, node5)
reconcileJob := createReconcileJob(scheme, job, jobPod1onNode1, node1, node2, node3, node4, node5)
request := reconcile.Request{
NamespacedName: types.NamespacedName{
Name: "job1",
Name: "job2",
Namespace: "default",
},
}
@@ -155,7 +155,7 @@ func TestReconcileJobCreatePodPercentage(t *testing.T) {
// 1 new pod created, because parallelism is 2
assert.Equal(t, 2, len(podList.Items))
// The new pod has the job-name label
assert.Equal(t, "job1", podList.Items[0].Labels[JobNameLabelKey])
assert.Equal(t, "job2", podList.Items[0].Labels[JobNameLabelKey])
// 5 desired pods, one for each node
assert.Equal(t, int32(5), retrievedJob.Status.Desired)
assert.NotNil(t, retrievedJob.Status.StartTime)
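The renames running through this test file ("job1" becomes "job2" through "job10", one unique name per test) are not cosmetic: scaleExpectations is a package-level global keyed by namespace/name, so tests reusing the same job name would inherit each other's unobserved expectations and have their Reconcile gated or dropped. The key is just the NamespacedName string:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/types"
)

func main() {
	// Two tests that both used "job1" in "default" would collide on this key
	// in the shared scaleExpectations cache.
	key := types.NamespacedName{Namespace: "default", Name: "job1"}.String()
	fmt.Println(key) // default/job1
}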
@@ -172,7 +172,7 @@ func TestPodsOnUnschedulableNodes(t *testing.T) {
p := intstr.FromInt(2)
// A job
job1 := createJob("job1", p)
job := createJob("job3", p)
// Create Node1 with Unschedulable set to true
node1 := createNode("node1")
@@ -181,10 +181,10 @@ func TestPodsOnUnschedulableNodes(t *testing.T) {
// Create node2
node2 := createNode("node2")
reconcileJob := createReconcileJob(scheme, job1, node1, node2)
reconcileJob := createReconcileJob(scheme, job, node1, node2)
request := reconcile.Request{
NamespacedName: types.NamespacedName{
Name: "job1",
Name: "job3",
Namespace: "default",
},
}
@@ -218,17 +218,17 @@ func TestReconcileJobMultipleBatches(t *testing.T) {
p := intstr.FromInt(20)
// A job
job1 := createJob("job1", p)
job := createJob("job4", p)
var objList []runtime.Object
objList = append(objList, job1)
objList = append(objList, job)
for i := 0; i < 10; i++ {
objList = append(objList, createNode(fmt.Sprintf("node-%d", i)))
}
reconcileJob := createReconcileJob(scheme, objList...)
request := reconcile.Request{
NamespacedName: types.NamespacedName{
Name: "job1",
Name: "job4",
Namespace: "default",
},
}
@@ -245,7 +245,7 @@ func TestReconcileJobMultipleBatches(t *testing.T) {
podList := &v1.PodList{}
listOptions := &client.ListOptions{
Namespace: request.Namespace,
LabelSelector: labels.SelectorFromSet(labelsAsMap(job1)),
LabelSelector: labels.SelectorFromSet(labelsAsMap(job)),
}
err = reconcileJob.List(context.TODO(), podList, listOptions)
assert.NoError(t, err)
@@ -263,8 +263,8 @@ func TestJobFailed(t *testing.T) {
// A job
p := intstr.FromInt(10)
job1 := createJob("job1", p)
job1.Spec.FailurePolicy.Type = appsv1alpha1.FailurePolicyTypeFailFast
job := createJob("job5", p)
job.Spec.FailurePolicy.Type = appsv1alpha1.FailurePolicyTypeFailFast
// Create 3 nodes
// Node1 has 1 pod running
@@ -275,15 +275,15 @@ func TestJobFailed(t *testing.T) {
node3 := createNode("node3")
// Create 3 pods, 2 succeeded, 1 failed
pod1onNode1 := createPod(job1, "pod1node1", "node1", v1.PodSucceeded)
pod2onNode2 := createPod(job1, "pod2node2", "node2", v1.PodSucceeded)
pod3onNode3 := createPod(job1, "pod3node3", "node3", v1.PodFailed)
pod1onNode1 := createPod(job, "pod1node1", "node1", v1.PodSucceeded)
pod2onNode2 := createPod(job, "pod2node2", "node2", v1.PodSucceeded)
pod3onNode3 := createPod(job, "pod3node3", "node3", v1.PodFailed)
reconcileJob := createReconcileJob(scheme, job1, pod1onNode1, pod2onNode2, pod3onNode3, node1, node2, node3)
reconcileJob := createReconcileJob(scheme, job, pod1onNode1, pod2onNode2, pod3onNode3, node1, node2, node3)
request := reconcile.Request{
NamespacedName: types.NamespacedName{
Name: "job1",
Name: "job5",
Namespace: "default",
},
}
@@ -315,8 +315,8 @@ func TestJobFailurePolicyTypeContinue(t *testing.T) {
// A job
p := intstr.FromInt(10)
job1 := createJob("job1", p)
job1.Spec.FailurePolicy.Type = appsv1alpha1.FailurePolicyTypeContinue
job := createJob("job6", p)
job.Spec.FailurePolicy.Type = appsv1alpha1.FailurePolicyTypeContinue
// Create 3 nodes
// Node1 has 1 pod running
@@ -327,14 +327,14 @@ func TestJobFailurePolicyTypeContinue(t *testing.T) {
node3 := createNode("node3")
// Create 2 pods, 1 succeeded, 1 failed
pod1onNode1 := createPod(job1, "pod1node1", "node1", v1.PodSucceeded)
pod3onNode3 := createPod(job1, "pod3node3", "node3", v1.PodFailed)
pod1onNode1 := createPod(job, "pod1node1", "node1", v1.PodSucceeded)
pod3onNode3 := createPod(job, "pod3node3", "node3", v1.PodFailed)
reconcileJob := createReconcileJob(scheme, job1, pod1onNode1, pod3onNode3, node1, node2, node3)
reconcileJob := createReconcileJob(scheme, job, pod1onNode1, pod3onNode3, node1, node2, node3)
request := reconcile.Request{
NamespacedName: types.NamespacedName{
Name: "job1",
Name: "job6",
Namespace: "default",
},
}
@@ -362,8 +362,8 @@ func TestJobFailurePolicyTypeFailFast(t *testing.T) {
// A job
p := intstr.FromInt(10)
job1 := createJob("job1", p)
job1.Spec.FailurePolicy.Type = appsv1alpha1.FailurePolicyTypeFailFast
job := createJob("job7", p)
job.Spec.FailurePolicy.Type = appsv1alpha1.FailurePolicyTypeFailFast
// Create 3 nodes
// Node1 has 1 pod running
@@ -374,14 +374,14 @@ func TestJobFailurePolicyTypeFailFast(t *testing.T) {
node3 := createNode("node3")
// Create 2 pods, 1 succeeded, 1 failed
pod1onNode1 := createPod(job1, "pod1node1", "node1", v1.PodSucceeded)
pod3onNode3 := createPod(job1, "pod3node3", "node3", v1.PodFailed)
pod1onNode1 := createPod(job, "pod1node1", "node1", v1.PodSucceeded)
pod3onNode3 := createPod(job, "pod3node3", "node3", v1.PodFailed)
reconcileJob := createReconcileJob(scheme, job1, pod1onNode1, pod3onNode3, node1, node2, node3)
reconcileJob := createReconcileJob(scheme, job, pod1onNode1, pod3onNode3, node1, node2, node3)
request := reconcile.Request{
NamespacedName: types.NamespacedName{
Name: "job1",
Name: "job7",
Namespace: "default",
},
}
@@ -409,8 +409,8 @@ func TestJobFailurePolicyPause(t *testing.T) {
// A job
p := intstr.FromInt(10)
job1 := createJob("job1", p)
job1.Spec.FailurePolicy.Type = appsv1alpha1.FailurePolicyTypePause
job := createJob("job8", p)
job.Spec.FailurePolicy.Type = appsv1alpha1.FailurePolicyTypePause
// Create 3 nodes
// Node1 has 1 pod running
@@ -421,14 +421,14 @@ func TestJobFailurePolicyPause(t *testing.T) {
node3 := createNode("node3")
// Create 2 pods, 1 succeeded, 1 failed
pod1onNode1 := createPod(job1, "pod1node1", "node1", v1.PodSucceeded)
pod2onNode2 := createPod(job1, "pod2node2", "node2", v1.PodFailed)
pod1onNode1 := createPod(job, "pod1node1", "node1", v1.PodSucceeded)
pod2onNode2 := createPod(job, "pod2node2", "node2", v1.PodFailed)
reconcileJob := createReconcileJob(scheme, job1, pod1onNode1, pod2onNode2, node1, node2, node3)
reconcileJob := createReconcileJob(scheme, job, pod1onNode1, pod2onNode2, node1, node2, node3)
request := reconcile.Request{
NamespacedName: types.NamespacedName{
Name: "job1",
Name: "job8",
Namespace: "default",
},
}
@@ -457,23 +457,23 @@ func TestJobSetPaused(t *testing.T) {
p := intstr.FromString("50%")
// A job
job1 := createJob("job1", p)
job1.Spec.Paused = true
job := createJob("job9", p)
job.Spec.Paused = true
var objList []runtime.Object
objList = append(objList, job1)
objList = append(objList, job)
for i := 0; i < 10; i++ {
objList = append(objList, createNode(fmt.Sprintf("node-%d", i)))
}
// Create 3 running pods
for i := 0; i < 3; i++ {
objList = append(objList, createPod(job1, fmt.Sprintf("pod%dnode%d", i, i), fmt.Sprintf("node%d", i), v1.PodRunning))
objList = append(objList, createPod(job, fmt.Sprintf("pod%dnode%d", i, i), fmt.Sprintf("node%d", i), v1.PodRunning))
}
reconcileJob := createReconcileJob(scheme, objList...)
request := reconcile.Request{
NamespacedName: types.NamespacedName{
Name: "job1",
Name: "job9",
Namespace: "default",
},
}
@@ -503,9 +503,9 @@ func TestJobFailedAfterActiveDeadline(t *testing.T) {
// activeDeadline is set to 0 to make the job fail
activeDeadline := int64(0)
now := metav1.Now()
job1 := &appsv1alpha1.BroadcastJob{
job := &appsv1alpha1.BroadcastJob{
ObjectMeta: metav1.ObjectMeta{
Name: "job1",
Name: "job10",
Namespace: "default",
UID: "12345",
SelfLink: "/apis/apps.kruise.io/v1alpha1/namespaces/default/broadcastjobs/test",
@@ -528,14 +528,14 @@ func TestJobFailedAfterActiveDeadline(t *testing.T) {
node2 := createNode("node2")
// Two pods for the job running on node1 and node2
job1Pod1onNode1 := createPod(job1, "job1pod1node1", "node1", v1.PodRunning)
job1Pod2onNode1 := createPod(job1, "job1pod2node2", "node2", v1.PodRunning)
job1Pod1onNode1 := createPod(job, "job1pod1node1", "node1", v1.PodRunning)
job1Pod2onNode1 := createPod(job, "job1pod2node2", "node2", v1.PodRunning)
reconcileJob := createReconcileJob(scheme, job1, job1Pod1onNode1, job1Pod2onNode1, node1, node2)
reconcileJob := createReconcileJob(scheme, job, job1Pod1onNode1, job1Pod2onNode1, node1, node2)
request := reconcile.Request{
NamespacedName: types.NamespacedName{
Name: "job1",
Name: "job10",
Namespace: "default",
},
}


@@ -5,17 +5,77 @@ import (
v1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/runtime/inject"
"github.com/openkruise/kruise/apis/apps/v1alpha1"
"github.com/openkruise/kruise/pkg/util/expectations"
)
type podEventHandler struct {
enqueueHandler handler.EnqueueRequestForOwner
}
func isBroadcastJobController(controllerRef *metav1.OwnerReference) bool {
refGV, err := schema.ParseGroupVersion(controllerRef.APIVersion)
if err != nil {
klog.Errorf("Could not parse OwnerReference %v APIVersion: %v", controllerRef, err)
return false
}
return controllerRef.Kind == controllerKind.Kind && refGV.Group == controllerKind.Group
}
func (p *podEventHandler) Create(evt event.CreateEvent, q workqueue.RateLimitingInterface) {
pod := evt.Object.(*v1.Pod)
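// A pod that is already being deleted can still arrive as a Create event
// (for example on the initial informer sync after a controller restart);
// treat it as a delete so the expectation bookkeeping stays consistent.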
if pod.DeletionTimestamp != nil {
p.Delete(event.DeleteEvent{Meta: evt.Meta, Object: evt.Object}, q)
return
}
if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil && isBroadcastJobController(controllerRef) {
key := types.NamespacedName{Namespace: pod.Namespace, Name: controllerRef.Name}.String()
scaleExpectations.ObserveScale(key, expectations.Create, pod.Spec.NodeName)
p.enqueueHandler.Create(evt, q)
}
}
func (p *podEventHandler) Delete(evt event.DeleteEvent, q workqueue.RateLimitingInterface) {
pod := evt.Object.(*v1.Pod)
if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil && isBroadcastJobController(controllerRef) {
key := types.NamespacedName{Namespace: pod.Namespace, Name: controllerRef.Name}.String()
scaleExpectations.ObserveScale(key, expectations.Delete, pod.Spec.NodeName)
p.enqueueHandler.Delete(evt, q)
}
}
func (p *podEventHandler) Update(evt event.UpdateEvent, q workqueue.RateLimitingInterface) {
p.enqueueHandler.Update(evt, q)
}
func (p *podEventHandler) Generic(evt event.GenericEvent, q workqueue.RateLimitingInterface) {
}
var _ inject.Scheme = &podEventHandler{}
func (p *podEventHandler) InjectScheme(s *runtime.Scheme) error {
return p.enqueueHandler.InjectScheme(s)
}
var _ inject.Mapper = &podEventHandler{}
func (p *podEventHandler) InjectMapper(m meta.RESTMapper) error {
return p.enqueueHandler.InjectMapper(m)
}
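EnqueueRequestForOwner resolves the owner's GroupVersionKind from the scheme and REST mapper, which controller-runtime provides by injection; because podEventHandler wraps it, the wrapper must accept both injections and forward them. The two interfaces being satisfied above have this shape in the controller-runtime version used here:

package sketch

import (
	"k8s.io/apimachinery/pkg/api/meta"
	"k8s.io/apimachinery/pkg/runtime"
)

// Shapes of inject.Scheme and inject.Mapper from
// sigs.k8s.io/controller-runtime/pkg/runtime/inject, matching the methods
// forwarded above.
type Scheme interface {
	InjectScheme(scheme *runtime.Scheme) error
}

type Mapper interface {
	InjectMapper(mapper meta.RESTMapper) error
}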
type enqueueBroadcastJobForNode struct {
client client.Client
}