Merge pull request #7696 from macsko/fix_data_race_while_setting_delta_cluster_state_in_parallel

Fix data race while setting delta cluster state in parallel
Authored by Kubernetes Prow Robot 2025-01-15 05:56:33 -08:00, committed by GitHub
commit adda3d40bb
1 changed file with 13 additions and 39 deletions
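The race, as the diff below shows: each ParallelizeUntil worker in setClusterStatePodsParallelized called snapshot.data.addPodsToNode, which ended with data.clearCaches(), an unsynchronized write to snapshot-wide state, so multiple goroutines wrote the shared caches concurrently. The fix has each worker call AddPod directly on its own NodeInfo and moves a single clearCaches() call into SetClusterState, after all workers have finished. A minimal sketch of the pattern, using hypothetical stand-in types rather than the autoscaler's internalDeltaSnapshotData:

// Sketch only: snapshotData and nodeInfo stand in for the real
// internalDeltaSnapshotData and schedulerframework.NodeInfo types.
package main

import "sync"

type nodeInfo struct{ pods []string }

type snapshotData struct {
	nodes     []*nodeInfo
	nodeCache []*nodeInfo // derived state shared by all workers
}

// clearCaches writes shared state; calling it from many goroutines races.
func (d *snapshotData) clearCaches() { d.nodeCache = nil }

func main() {
	d := &snapshotData{nodes: make([]*nodeInfo, 16)}
	for i := range d.nodes {
		d.nodes[i] = &nodeInfo{}
	}

	var wg sync.WaitGroup
	for i := range d.nodes {
		wg.Add(1)
		go func(idx int) {
			defer wg.Done()
			// Safe: every goroutine mutates a distinct nodeInfo.
			d.nodes[idx].pods = append(d.nodes[idx].pods, "pod")
			// Pre-fix: calling d.clearCaches() here raced, since all
			// goroutines would write d.nodeCache at once.
		}(i)
	}
	wg.Wait()

	// Post-fix: clear shared caches once, after the parallel section.
	d.clearCaches()
}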


@@ -248,24 +248,6 @@ func (data *internalDeltaSnapshotData) addPod(pod *apiv1.Pod, nodeName string) e
 	return nil
 }
 
-func (data *internalDeltaSnapshotData) addPodToNode(pod *apiv1.Pod, ni *schedulerframework.NodeInfo) error {
-	ni.AddPod(pod)
-	// Maybe consider deleting from the list in the future. Maybe not.
-	data.clearCaches()
-	return nil
-}
-
-func (data *internalDeltaSnapshotData) addPodsToNode(pods []*apiv1.Pod, ni *schedulerframework.NodeInfo) error {
-	for _, pod := range pods {
-		ni.AddPod(pod)
-	}
-	// Maybe consider deleting from the list in the future. Maybe not.
-	data.clearCaches()
-	return nil
-}
-
 func (data *internalDeltaSnapshotData) removePod(namespace, name, nodeName string) error {
 	// This always clones node info, even if the pod is actually missing.
 	// Not sure if we mind, since removing non-existent pod
@@ -456,19 +438,17 @@ func (snapshot *DeltaSnapshotStore) AddSchedulerNodeInfo(nodeInfo *schedulerfram
 }
 
 // setClusterStatePodsSequential sets the pods in cluster state in a sequential way.
-func (snapshot *DeltaSnapshotStore) setClusterStatePodsSequential(nodeInfos []*schedulerframework.NodeInfo, nodeNameToIdx map[string]int, scheduledPods []*apiv1.Pod) error {
+func (snapshot *DeltaSnapshotStore) setClusterStatePodsSequential(nodeInfos []*schedulerframework.NodeInfo, nodeNameToIdx map[string]int, scheduledPods []*apiv1.Pod) {
 	for _, pod := range scheduledPods {
 		if nodeIdx, ok := nodeNameToIdx[pod.Spec.NodeName]; ok {
-			if err := snapshot.data.addPodToNode(pod, nodeInfos[nodeIdx]); err != nil {
-				return err
-			}
+			// Can add pod directly. Cache will be cleared afterwards.
+			nodeInfos[nodeIdx].AddPod(pod)
 		}
 	}
-	return nil
 }
 
 // setClusterStatePodsParallelized sets the pods in cluster state in parallel based on snapshot.parallelism value.
-func (snapshot *DeltaSnapshotStore) setClusterStatePodsParallelized(nodeInfos []*schedulerframework.NodeInfo, nodeNameToIdx map[string]int, scheduledPods []*apiv1.Pod) error {
+func (snapshot *DeltaSnapshotStore) setClusterStatePodsParallelized(nodeInfos []*schedulerframework.NodeInfo, nodeNameToIdx map[string]int, scheduledPods []*apiv1.Pod) {
 	podsForNode := make([][]*apiv1.Pod, len(nodeInfos))
 	for _, pod := range scheduledPods {
 		nodeIdx, ok := nodeNameToIdx[pod.Spec.NodeName]
@@ -479,16 +459,13 @@ func (snapshot *DeltaSnapshotStore) setClusterStatePodsParallelized(nodeInfos []
 	}
 
 	ctx := context.Background()
-	ctx, cancel := context.WithCancelCause(ctx)
 	workqueue.ParallelizeUntil(ctx, snapshot.parallelism, len(nodeInfos), func(nodeIdx int) {
-		err := snapshot.data.addPodsToNode(podsForNode[nodeIdx], nodeInfos[nodeIdx])
-		if err != nil {
-			cancel(err)
+		nodeInfo := nodeInfos[nodeIdx]
+		for _, pod := range podsForNode[nodeIdx] {
+			// Can add pod directly. Cache will be cleared afterwards.
+			nodeInfo.AddPod(pod)
 		}
 	})
-	return context.Cause(ctx)
 }
 
 // SetClusterState sets the cluster state.
@@ -507,19 +484,16 @@ func (snapshot *DeltaSnapshotStore) SetClusterState(nodes []*apiv1.Node, schedul
 	}
 
 	if snapshot.parallelism > 1 {
-		err := snapshot.setClusterStatePodsParallelized(nodeInfos, nodeNameToIdx, scheduledPods)
-		if err != nil {
-			return err
-		}
+		snapshot.setClusterStatePodsParallelized(nodeInfos, nodeNameToIdx, scheduledPods)
 	} else {
 		// TODO(macsko): Migrate to setClusterStatePodsParallelized for parallelism == 1
 		// after making sure the implementation is always correct in CA 1.33.
-		err := snapshot.setClusterStatePodsSequential(nodeInfos, nodeNameToIdx, scheduledPods)
-		if err != nil {
-			return err
-		}
+		snapshot.setClusterStatePodsSequential(nodeInfos, nodeNameToIdx, scheduledPods)
 	}
+
+	// Clear caches after adding pods.
+	snapshot.data.clearCaches()
 
 	// TODO(DRA): Save DRA snapshot.
 	return nil
 }
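For reference, this class of bug is what Go's race detector exists to catch: concurrent unsynchronized writes to the same field, like the old per-worker clearCaches() calls, fail a test run under go test -race. A standalone sketch (hypothetical test, not part of this commit) that the detector flags:

// Sketch only: data is a hypothetical stand-in with a shared cache field.
package main

import (
	"sync"
	"testing"
)

type data struct{ cache []int }

func (d *data) clearCaches() { d.cache = nil }

// TestConcurrentClearCaches reproduces the racy shape: run it with
// `go test -race` and the detector reports the unsynchronized writes.
func TestConcurrentClearCaches(t *testing.T) {
	d := &data{cache: []int{1}}
	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			d.clearCaches() // concurrent write to d.cache: data race
		}()
	}
	wg.Wait()
}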