Merge pull request #14771 from johngmyers/sqs-rebalance

Don't drain on rebalance recommendations in SQS mode unless configured
This commit is contained in:
Kubernetes Prow Robot 2022-12-17 07:15:42 -08:00 committed by GitHub
commit 0f5fc06cec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 107 additions and 122 deletions

View File

@ -70,7 +70,8 @@ type integrationTest struct {
bastionUserData bool
ciliumEtcd bool
// nth is true if we should check for files created by the NTH queue-processor add-on
nth bool
nth bool
nthRebalance bool
}
func newIntegrationTest(clusterName, srcDir string) *integrationTest {
@ -149,6 +150,11 @@ func (i *integrationTest) withNTH() *integrationTest {
return i
}
// withNTHRebalance sets the nthRebalance flag and returns the receiver so
// that it can be chained with the other with* options. When the flag is set,
// runTestTerraformAWS additionally expects the RebalanceRecommendation
// event-pattern file among the generated outputs.
func (i *integrationTest) withNTHRebalance() *integrationTest {
i.nthRebalance = true
return i
}
func (i *integrationTest) withOIDCDiscovery() *integrationTest {
i.discovery = true
return i
@ -814,6 +820,7 @@ func TestCCM(t *testing.T) {
metricsServerAddon,
).
withNTH().
withNTHRebalance().
runTestTerraformAWS(t)
}
@ -1403,13 +1410,15 @@ func (i *integrationTest) runTestTerraformAWS(t *testing.T) {
expectedFilenames = append(expectedFilenames, []string{
"aws_s3_object_" + i.clusterName + "-addons-node-termination-handler.aws-k8s-1.11_content",
"aws_cloudwatch_event_rule_" + awsup.GetClusterName40(i.clusterName) + "-ASGLifecycle_event_pattern",
"aws_cloudwatch_event_rule_" + awsup.GetClusterName40(i.clusterName) + "-RebalanceRecommendation_event_pattern",
"aws_cloudwatch_event_rule_" + awsup.GetClusterName40(i.clusterName) + "-SpotInterruption_event_pattern",
"aws_cloudwatch_event_rule_" + awsup.GetClusterName40(i.clusterName) + "-InstanceStateChange_event_pattern",
"aws_cloudwatch_event_rule_" + awsup.GetClusterName40(i.clusterName) + "-InstanceScheduledChange_event_pattern",
"aws_sqs_queue_" + strings.Replace(i.clusterName, ".", "-", -1) + "-nth_policy",
}...)
}
if i.nthRebalance {
expectedFilenames = append(expectedFilenames, "aws_cloudwatch_event_rule_"+awsup.GetClusterName40(i.clusterName)+"-RebalanceRecommendation_event_pattern")
}
}
expectedFilenames = append(expectedFilenames, i.expectServiceAccountRolePolicies...)

View File

@ -31,6 +31,8 @@ with "control-plane-". The names of groups for existing clusters are unchanged.
* Node Termination Handler now defaults to Queue-Processor mode. It also now enables Scheduled Event Draining by default.
* Node Termination Handler, when in Queue-Processor mode, no longer drains on rebalance recommendations unless configured to do so.
## GCP
* The default instance type is now `e2-medium` for control-plane and worker nodes, and `e2-micro` for bastions.

View File

@ -17,17 +17,16 @@ limitations under the License.
package awsmodel
import (
"fmt"
"strings"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/eventbridge"
"k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/pkg/model"
"k8s.io/kops/upup/pkg/fi"
"k8s.io/kops/upup/pkg/fi/cloudup/awstasks"
"k8s.io/kops/upup/pkg/fi/cloudup/awsup"
"github.com/aws/aws-sdk-go/aws"
"k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/upup/pkg/fi"
)
const (
@ -52,8 +51,9 @@ type event struct {
var (
_ fi.ModelBuilder = &NodeTerminationHandlerBuilder{}
_ fi.HasDeletions = &NodeTerminationHandlerBuilder{}
events = []event{
fixedEvents = []event{
{
name: "ASGLifecycle",
pattern: `{"source":["aws.autoscaling"],"detail-type":["EC2 Instance-terminate Lifecycle Action"]}`,
@ -62,10 +62,6 @@ var (
name: "SpotInterruption",
pattern: `{"source": ["aws.ec2"],"detail-type": ["EC2 Spot Instance Interruption Warning"]}`,
},
{
name: "RebalanceRecommendation",
pattern: `{"source": ["aws.ec2"],"detail-type": ["EC2 Instance Rebalance Recommendation"]}`,
},
{
name: "InstanceStateChange",
pattern: `{"source": ["aws.ec2"],"detail-type": ["EC2 Instance State-change Notification"]}`,
@ -75,6 +71,11 @@ var (
pattern: `{"source": ["aws.health"],"detail-type": ["AWS Health Event"],"detail": {"service": ["EC2"],"eventTypeCategory": ["scheduledChange"]}}`,
},
}
// rebalanceEvent describes the EventBridge rule that matches
// "EC2 Instance Rebalance Recommendation" events from aws.ec2.
// It is kept separate from fixedEvents because build only adds it when
// NodeTerminationHandler.EnableRebalanceDraining is set, and FindDeletions
// uses its name to locate a rule that should be removed otherwise.
rebalanceEvent = event{
name: "RebalanceRecommendation",
pattern: `{"source": ["aws.ec2"],"detail-type": ["EC2 Instance Rebalance Recommendation"]}`,
}
)
type NodeTerminationHandlerBuilder struct {
@ -140,6 +141,12 @@ func (b *NodeTerminationHandlerBuilder) build(c *fi.ModelBuilderContext) error {
clusterName := b.ClusterName()
clusterNamePrefix := awsup.GetClusterName40(clusterName)
events := append([]event(nil), fixedEvents...)
if b.Cluster.Spec.NodeTerminationHandler != nil && fi.ValueOf(b.Cluster.Spec.NodeTerminationHandler.EnableRebalanceDraining) {
events = append(events, rebalanceEvent)
}
for _, event := range events {
// build rule
ruleName := aws.String(clusterNamePrefix + "-" + event.name)
@ -170,3 +177,55 @@ func (b *NodeTerminationHandlerBuilder) build(c *fi.ModelBuilderContext) error {
return nil
}
// FindDeletions schedules removal of the RebalanceRecommendation EventBridge
// rule when rebalance draining is not enabled for the cluster.
//
// If NodeTerminationHandler.EnableRebalanceDraining is set, the rule is wanted
// and nothing is done. Otherwise the rule named
// "<GetClusterName40(cluster name)>-RebalanceRecommendation" is looked up; if
// it exists and carries the tag "kubernetes.io/cluster/<cluster name>" with
// value "owned", an awstasks.EventBridgeRule task carrying only Name and
// Lifecycle is added to c. EventBridgeRule.RenderAWS treats a task without an
// EventPattern as a request to delete the rule.
//
// Rules that are missing or not tagged as owned by this cluster are left
// alone. An error is returned only when an EventBridge API call fails.
func (b *NodeTerminationHandlerBuilder) FindDeletions(c *fi.ModelBuilderContext, cloud fi.Cloud) error {
	nth := b.Cluster.Spec.NodeTerminationHandler
	if nth != nil && fi.ValueOf(nth.EnableRebalanceDraining) {
		return nil
	}

	clusterName := b.ClusterName()
	clusterNamePrefix := awsup.GetClusterName40(clusterName)
	ruleName := aws.String(clusterNamePrefix + "-" + rebalanceEvent.name)

	eventBridge := cloud.(awsup.AWSCloud).EventBridge()
	request := &eventbridge.ListRulesInput{
		NamePrefix: ruleName,
	}
	response, err := eventBridge.ListRules(request)
	if err != nil {
		return fmt.Errorf("listing EventBridge rules: %w", err)
	}
	if response == nil {
		return nil
	}

	// NamePrefix is a prefix filter, so the response may contain unrelated
	// rules whose names merely start with ruleName. Pick the exact match
	// instead of failing when more than one rule shares the prefix.
	var rule *eventbridge.Rule
	for _, candidate := range response.Rules {
		if candidate != nil && fi.ValueOf(candidate.Name) == *ruleName {
			rule = candidate
			break
		}
	}
	if rule == nil {
		return nil
	}

	tagResponse, err := eventBridge.ListTagsForResource(&eventbridge.ListTagsForResourceInput{ResourceARN: rule.Arn})
	if err != nil {
		return fmt.Errorf("listing tags for EventBridge rule: %w", err)
	}
	if tagResponse == nil {
		// No tag information means ownership cannot be established.
		return nil
	}

	// Only delete rules that this cluster owns.
	owned := false
	ownershipTag := "kubernetes.io/cluster/" + b.Cluster.ObjectMeta.Name
	for _, tag := range tagResponse.Tags {
		if tag == nil {
			continue
		}
		if fi.ValueOf(tag.Key) == ownershipTag && fi.ValueOf(tag.Value) == "owned" {
			owned = true
			break
		}
	}
	if !owned {
		return nil
	}

	ruleTask := &awstasks.EventBridgeRule{
		Name:      ruleName,
		Lifecycle: b.Lifecycle,
	}
	c.AddTask(ruleTask)

	return nil
}

View File

@ -39,37 +39,42 @@ func DumpEventBridgeRule(op *resources.DumpOperation, r *resources.Resource) err
return nil
}
func DeleteEventBridgeRule(cloud fi.Cloud, r *resources.Resource) error {
// EventBridgeRuleDeleter deletes the EventBridge rule identified by r.Name by
// delegating to DeleteEventBridgeRule. It has the (cloud, *resources.Resource)
// signature used for the Deleter field of the resources built by
// ListEventBridgeRules.
func EventBridgeRuleDeleter(cloud fi.Cloud, r *resources.Resource) error {
return DeleteEventBridgeRule(cloud, r.Name)
}
// DeleteEventBridgeRule deletes the EventBridge rule named ruleName.
//
// It first lists the rule's targets and, if there are any, removes them all in
// a single RemoveTargets call before issuing DeleteRule. cloud is asserted to
// be an awsup.AWSCloud. Any failing API call is returned wrapped with the rule
// name for context.
func DeleteEventBridgeRule(cloud fi.Cloud, ruleName string) error {
c := cloud.(awsup.AWSCloud)
targets, err := c.EventBridge().ListTargetsByRule(&eventbridge.ListTargetsByRuleInput{
Rule: aws.String(r.Name),
Rule: aws.String(ruleName),
})
if err != nil {
return fmt.Errorf("error listing targets for EventBridge rule %q: %v", r.Name, err)
return fmt.Errorf("listing targets for EventBridge rule %q: %w", ruleName, err)
}
if len(targets.Targets) > 0 {
// Collect the IDs of every target so they can be detached in one request.
var ids []*string
for _, target := range targets.Targets {
ids = append(ids, target.Id)
}
klog.V(2).Infof("Removing EventBridge Targets for rule %q", r.Name)
klog.V(2).Infof("Removing EventBridge Targets for rule %q", ruleName)
_, err = c.EventBridge().RemoveTargets(&eventbridge.RemoveTargetsInput{
Ids: ids,
Rule: aws.String(r.Name),
Rule: aws.String(ruleName),
})
if err != nil {
return fmt.Errorf("error removing targets for EventBridge rule %q: %v", r.Name, err)
return fmt.Errorf("removing targets for EventBridge rule %q: %w", ruleName, err)
}
}
klog.V(2).Infof("Deleting EventBridge rule %q", r.Name)
klog.V(2).Infof("Deleting EventBridge rule %q", ruleName)
request := &eventbridge.DeleteRuleInput{
Name: aws.String(r.Name),
Name: aws.String(ruleName),
}
_, err = c.EventBridge().DeleteRule(request)
if err != nil {
return fmt.Errorf("error deleting EventBridge rule %q: %v", r.Name, err)
return fmt.Errorf("deleting EventBridge rule %q: %w", ruleName, err)
}
return nil
}
@ -101,7 +106,7 @@ func ListEventBridgeRules(cloud fi.Cloud, clusterName string) ([]*resources.Reso
Name: *rule.Name,
ID: *rule.Name,
Type: "eventbridge",
Deleter: DeleteEventBridgeRule,
Deleter: EventBridgeRuleDeleter,
Dumper: DumpEventBridgeRule,
Obj: rule,
}

View File

@ -1 +0,0 @@
{"source": ["aws.ec2"],"detail-type": ["EC2 Instance Rebalance Recommendation"]}

View File

@ -341,16 +341,6 @@ resource "aws_cloudwatch_event_rule" "minimal-example-com-InstanceStateChange" {
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-RebalanceRecommendation" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-RebalanceRecommendation_event_pattern")
name = "minimal.example.com-RebalanceRecommendation"
tags = {
"KubernetesCluster" = "minimal.example.com"
"Name" = "minimal.example.com-RebalanceRecommendation"
"kubernetes.io/cluster/minimal.example.com" = "owned"
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-SpotInterruption" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-SpotInterruption_event_pattern")
name = "minimal.example.com-SpotInterruption"
@ -376,11 +366,6 @@ resource "aws_cloudwatch_event_target" "minimal-example-com-InstanceStateChange-
rule = aws_cloudwatch_event_rule.minimal-example-com-InstanceStateChange.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-RebalanceRecommendation-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-RebalanceRecommendation.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-SpotInterruption-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-SpotInterruption.id

View File

@ -1 +0,0 @@
{"source": ["aws.ec2"],"detail-type": ["EC2 Instance Rebalance Recommendation"]}

View File

@ -341,16 +341,6 @@ resource "aws_cloudwatch_event_rule" "minimal-example-com-InstanceStateChange" {
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-RebalanceRecommendation" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-RebalanceRecommendation_event_pattern")
name = "minimal.example.com-RebalanceRecommendation"
tags = {
"KubernetesCluster" = "minimal.example.com"
"Name" = "minimal.example.com-RebalanceRecommendation"
"kubernetes.io/cluster/minimal.example.com" = "owned"
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-SpotInterruption" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-SpotInterruption_event_pattern")
name = "minimal.example.com-SpotInterruption"
@ -376,11 +366,6 @@ resource "aws_cloudwatch_event_target" "minimal-example-com-InstanceStateChange-
rule = aws_cloudwatch_event_rule.minimal-example-com-InstanceStateChange.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-RebalanceRecommendation-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-RebalanceRecommendation.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-SpotInterruption-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-SpotInterruption.id

View File

@ -1 +0,0 @@
{"source": ["aws.ec2"],"detail-type": ["EC2 Instance Rebalance Recommendation"]}

View File

@ -326,16 +326,6 @@ resource "aws_cloudwatch_event_rule" "minimal-example-com-InstanceStateChange" {
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-RebalanceRecommendation" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-RebalanceRecommendation_event_pattern")
name = "minimal.example.com-RebalanceRecommendation"
tags = {
"KubernetesCluster" = "minimal.example.com"
"Name" = "minimal.example.com-RebalanceRecommendation"
"kubernetes.io/cluster/minimal.example.com" = "owned"
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-SpotInterruption" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-SpotInterruption_event_pattern")
name = "minimal.example.com-SpotInterruption"
@ -361,11 +351,6 @@ resource "aws_cloudwatch_event_target" "minimal-example-com-InstanceStateChange-
rule = aws_cloudwatch_event_rule.minimal-example-com-InstanceStateChange.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-RebalanceRecommendation-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-RebalanceRecommendation.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-SpotInterruption-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-SpotInterruption.id

View File

@ -1 +0,0 @@
{"source": ["aws.ec2"],"detail-type": ["EC2 Instance Rebalance Recommendation"]}

View File

@ -326,16 +326,6 @@ resource "aws_cloudwatch_event_rule" "minimal-example-com-InstanceStateChange" {
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-RebalanceRecommendation" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-RebalanceRecommendation_event_pattern")
name = "minimal.example.com-RebalanceRecommendation"
tags = {
"KubernetesCluster" = "minimal.example.com"
"Name" = "minimal.example.com-RebalanceRecommendation"
"kubernetes.io/cluster/minimal.example.com" = "owned"
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-SpotInterruption" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-SpotInterruption_event_pattern")
name = "minimal.example.com-SpotInterruption"
@ -361,11 +351,6 @@ resource "aws_cloudwatch_event_target" "minimal-example-com-InstanceStateChange-
rule = aws_cloudwatch_event_rule.minimal-example-com-InstanceStateChange.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-RebalanceRecommendation-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-RebalanceRecommendation.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-SpotInterruption-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-SpotInterruption.id

View File

@ -1 +0,0 @@
{"source": ["aws.ec2"],"detail-type": ["EC2 Instance Rebalance Recommendation"]}

View File

@ -326,16 +326,6 @@ resource "aws_cloudwatch_event_rule" "minimal-example-com-InstanceStateChange" {
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-RebalanceRecommendation" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-RebalanceRecommendation_event_pattern")
name = "minimal.example.com-RebalanceRecommendation"
tags = {
"KubernetesCluster" = "minimal.example.com"
"Name" = "minimal.example.com-RebalanceRecommendation"
"kubernetes.io/cluster/minimal.example.com" = "owned"
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-SpotInterruption" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-SpotInterruption_event_pattern")
name = "minimal.example.com-SpotInterruption"
@ -361,11 +351,6 @@ resource "aws_cloudwatch_event_target" "minimal-example-com-InstanceStateChange-
rule = aws_cloudwatch_event_rule.minimal-example-com-InstanceStateChange.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-RebalanceRecommendation-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-RebalanceRecommendation.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-SpotInterruption-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-SpotInterruption.id

View File

@ -210,7 +210,7 @@ spec:
amazonvpc: {}
nodeTerminationHandler:
cpuRequest: 50m
enableRebalanceDraining: false
enableRebalanceDraining: true
enableRebalanceMonitoring: false
enableScheduledEventDraining: true
enableSpotInterruptionDraining: true

View File

@ -62,7 +62,7 @@ spec:
version: 9.99.0
- id: k8s-1.11
manifest: node-termination-handler.aws/k8s-1.11.yaml
manifestHash: 031dd270cecb6247a363cebba3efb61ebb940315373f929a9ba2aee8e249ddfd
manifestHash: 7cd05c5a73220c7db013f0c2e6322b23c989affed13804a221e98b82c11a60c6
name: node-termination-handler.aws
prune:
kinds:

View File

@ -206,7 +206,7 @@ spec:
- name: ENABLE_REBALANCE_MONITORING
value: "false"
- name: ENABLE_REBALANCE_DRAINING
value: "false"
value: "true"
- name: ENABLE_SQS_TERMINATION_DRAINING
value: "true"
- name: QUEUE_URL

View File

@ -41,6 +41,7 @@ spec:
amazonvpc: {}
nodeTerminationHandler:
enabled: true
enableRebalanceDraining: true
nonMasqueradeCIDR: 172.20.0.0/16
snapshotController:
enabled: true

View File

@ -1 +0,0 @@
{"source": ["aws.ec2"],"detail-type": ["EC2 Instance Rebalance Recommendation"]}

View File

@ -271,16 +271,6 @@ resource "aws_cloudwatch_event_rule" "minimal-example-com-InstanceStateChange" {
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-RebalanceRecommendation" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-RebalanceRecommendation_event_pattern")
name = "minimal.example.com-RebalanceRecommendation"
tags = {
"KubernetesCluster" = "minimal.example.com"
"Name" = "minimal.example.com-RebalanceRecommendation"
"kubernetes.io/cluster/minimal.example.com" = "owned"
}
}
resource "aws_cloudwatch_event_rule" "minimal-example-com-SpotInterruption" {
event_pattern = file("${path.module}/data/aws_cloudwatch_event_rule_minimal.example.com-SpotInterruption_event_pattern")
name = "minimal.example.com-SpotInterruption"
@ -306,11 +296,6 @@ resource "aws_cloudwatch_event_target" "minimal-example-com-InstanceStateChange-
rule = aws_cloudwatch_event_rule.minimal-example-com-InstanceStateChange.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-RebalanceRecommendation-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-RebalanceRecommendation.id
}
resource "aws_cloudwatch_event_target" "minimal-example-com-SpotInterruption-Target" {
arn = aws_sqs_queue.minimal-example-com-nth.arn
rule = aws_cloudwatch_event_rule.minimal-example-com-SpotInterruption.id

View File

@ -22,6 +22,7 @@ import (
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/eventbridge"
"k8s.io/apimachinery/pkg/util/validation/field"
awsResources "k8s.io/kops/pkg/resources/aws"
"k8s.io/kops/upup/pkg/fi"
"k8s.io/kops/upup/pkg/fi/cloudup/awsup"
"k8s.io/kops/upup/pkg/fi/cloudup/terraform"
@ -100,6 +101,10 @@ func (_ *EventBridgeRule) CheckChanges(a, e, changes *EventBridgeRule) error {
}
func (eb *EventBridgeRule) RenderAWS(t *awsup.AWSAPITarget, a, e, changes *EventBridgeRule) error {
if e.EventPattern == nil {
return awsResources.DeleteEventBridgeRule(t.Cloud, *e.Name)
}
if a == nil {
var tags []*eventbridge.Tag
for k, v := range eb.Tags {