Add cloud provider for Rancher with RKE2

Signed-off-by: Cyrill Troxler <cyrill@nine.ch>
Cyrill Troxler 2022-06-13 11:12:03 +02:00
parent 2ec7600e79
commit 52ab8ed545
15 changed files with 1962 additions and 2 deletions

View File

@@ -33,6 +33,7 @@ You should also take a look at the notes and "gotchas" for your specific cloud p
* [Vultr](./cloudprovider/vultr/README.md)
* [TencentCloud](./cloudprovider/tencentcloud/README.md)
* [Scaleway](./cloudprovider/scaleway/README.md)
* [Rancher](./cloudprovider/rancher/README.md)
# Releases
@@ -180,3 +181,4 @@ Supported cloud providers:
* TencentCloud https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/tencentcloud/README.md
* BaiduCloud https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/baiducloud/README.md
* Huawei Cloud https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/huaweicloud/README.md
* Rancher https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/rancher/README.md

View File

@@ -1,5 +1,5 @@
//go:build !gce && !aws && !azure && !kubemark && !alicloud && !magnum && !digitalocean && !clusterapi && !huaweicloud && !ionoscloud && !linode && !hetzner && !bizflycloud && !brightbox && !packet && !oci && !vultr && !tencentcloud && !scaleway && !externalgrpc && !civo
// +build !gce,!aws,!azure,!kubemark,!alicloud,!magnum,!digitalocean,!clusterapi,!huaweicloud,!ionoscloud,!linode,!hetzner,!bizflycloud,!brightbox,!packet,!oci,!vultr,!tencentcloud,!scaleway,!externalgrpc,!civo
//go:build !gce && !aws && !azure && !kubemark && !alicloud && !magnum && !digitalocean && !clusterapi && !huaweicloud && !ionoscloud && !linode && !hetzner && !bizflycloud && !brightbox && !packet && !oci && !vultr && !tencentcloud && !scaleway && !externalgrpc && !civo && !rancher
// +build !gce,!aws,!azure,!kubemark,!alicloud,!magnum,!digitalocean,!clusterapi,!huaweicloud,!ionoscloud,!linode,!hetzner,!bizflycloud,!brightbox,!packet,!oci,!vultr,!tencentcloud,!scaleway,!externalgrpc,!civo,!rancher
/*
Copyright 2018 The Kubernetes Authors.
@@ -43,6 +43,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/ovhcloud"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/packet"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/rancher"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/scaleway"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/tencentcloud"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/vultr"
@@ -76,6 +77,7 @@ var AvailableCloudProviders = []string{
cloudprovider.TencentcloudProviderName,
cloudprovider.CivoProviderName,
cloudprovider.ScalewayProviderName,
cloudprovider.RancherProviderName,
}
// DefaultCloudProvider is GCE.
@@ -133,6 +135,8 @@ func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGro
return civo.BuildCivo(opts, do, rl)
case cloudprovider.ScalewayProviderName:
return scaleway.BuildScaleway(opts, do, rl)
case cloudprovider.RancherProviderName:
return rancher.BuildRancher(opts, do, rl)
}
return nil
}

View File

@@ -0,0 +1,43 @@
//go:build rancher
// +build rancher
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package builder
import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/rancher"
"k8s.io/autoscaler/cluster-autoscaler/config"
)
// AvailableCloudProviders supported by the cloud provider builder.
var AvailableCloudProviders = []string{
cloudprovider.RancherProviderName,
}
// DefaultCloudProvider for rancher-only build is rancher.
const DefaultCloudProvider = cloudprovider.RancherProviderName
func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider {
switch opts.CloudProviderName {
case cloudprovider.RancherProviderName:
return rancher.BuildRancher(opts, do, rl)
}
return nil
}

View File

@@ -80,6 +80,8 @@ const (
ExternalGrpcProviderName = "externalgrpc"
// CivoProviderName gets the provider name of civo
CivoProviderName = "civo"
// RancherProviderName gets the provider name of rancher
RancherProviderName = "rancher"
)
// CloudProvider contains configuration info and functions for interacting with

View File

@@ -0,0 +1,10 @@
approvers:
#- ctrox
#- gajicdev
#- pawelkuc
#- thirdeyenick
reviewers:
#- ctrox
#- gajicdev
#- pawelkuc
#- thirdeyenick

View File

@@ -0,0 +1,74 @@
# Cluster Autoscaler for Rancher with RKE2
This cluster autoscaler for Rancher scales nodes in clusters which use RKE2
provisioning (Rancher v2.6+). It uses a combination of the Rancher API and the
underlying cluster-api types of RKE2.
## Configuration
The `cluster-autoscaler` for Rancher needs a configuration file, which is
passed via the `--cloud-config` parameter. An up-to-date example can be found
in [examples/config.yaml](./examples/config.yaml).
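For quick reference, that example config looks like this (all values are
placeholders for your own Rancher server and cluster):

```yaml
# rancher server credentials
url: https://rancher.example.org
token: <rancher token>

# name and namespace of the clusters.provisioning.cattle.io resource on the
# rancher server
clusterName: my-cluster
clusterNamespace: fleet-default

# optional, will be auto-discovered if not specified
#clusterAPIVersion: v1alpha4
```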
### Permissions
The Rancher server account provided in the `cloud-config` requires the
following permissions on the Rancher server:
* Get/Update of the `clusters.provisioning.cattle.io` resource to autoscale
* List/Get/Update of `machines.cluster.x-k8s.io` in the namespace of the cluster resource (the autoscaler annotates machines to mark them for deletion during scale-down)
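On the Rancher (local) management cluster these roughly correspond to the
following Kubernetes RBAC rules. This is only an illustrative sketch: the role
name and how it is bound to the account behind the token depend on your
Rancher setup.

```yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: cluster-autoscaler-rke2   # hypothetical name
  namespace: fleet-default        # namespace of the cluster resource
rules:
- apiGroups: ["provisioning.cattle.io"]
  resources: ["clusters"]
  verbs: ["get", "update"]
- apiGroups: ["cluster.x-k8s.io"]
  resources: ["machines"]
  verbs: ["list", "get", "update"]
```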
## Running the Autoscaler
The `cluster-autoscaler` can be run inside the RKE2 cluster, on the Rancher
server cluster or on a completely separate machine. To run it outside the RKE2
cluster, make sure to provide a kubeconfig with `--kubeconfig`.
To start the autoscaler with the Rancher provider, both the cloud provider and
the config file need to be specified:
```bash
cluster-autoscaler --cloud-provider=rancher --cloud-config=config.yaml
```
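When running it in-cluster, a Deployment along these lines can be used. This
is a minimal sketch: the image tag, service account and secret name are
assumptions, and the config file from above is expected to be mounted at
`/config/config.yaml`.

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cluster-autoscaler
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cluster-autoscaler
  template:
    metadata:
      labels:
        app: cluster-autoscaler
    spec:
      serviceAccountName: cluster-autoscaler   # hypothetical, needs suitable RBAC
      containers:
      - name: cluster-autoscaler
        image: k8s.gcr.io/autoscaling/cluster-autoscaler:v1.24.0   # example tag
        command:
        - ./cluster-autoscaler
        - --cloud-provider=rancher
        - --cloud-config=/config/config.yaml
        volumeMounts:
        - name: cloud-config
          mountPath: /config
          readOnly: true
      volumes:
      - name: cloud-config
        secret:
          secretName: rancher-cloud-config   # hypothetical secret holding config.yaml
```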
## Enabling Autoscaling
For the autoscaler to function, the RKE2 cluster needs to be configured
accordingly. The autoscaler works by dynamically adjusting the `quantity` of a
`machinePool`. For the autoscaler to know the min/max size of a `machinePool`,
a few annotations need to be set using the `machineDeploymentAnnotations`
field. That field has been chosen because updating it does not trigger a full
rollout of a `machinePool`.
```yaml
apiVersion: provisioning.cattle.io/v1
kind: Cluster
spec:
rkeConfig:
machinePools:
- name: pool-1
quantity: 1
workerRole: true
machineDeploymentAnnotations:
cluster.provisioning.cattle.io/autoscaler-min-size: "1"
cluster.provisioning.cattle.io/autoscaler-max-size: "3"
```
Optionally, to enable scaling a `machinePool` from and to 0 nodes, add a few
more annotations that let the autoscaler know which resources a single node in
the pool provides:
```yaml
apiVersion: provisioning.cattle.io/v1
kind: Cluster
spec:
rkeConfig:
machinePools:
- name: pool-1
machineDeploymentAnnotations:
cluster.provisioning.cattle.io/autoscaler-min-size: "0"
cluster.provisioning.cattle.io/autoscaler-max-size: "3"
cluster.provisioning.cattle.io/autoscaler-resource-cpu: "1"
cluster.provisioning.cattle.io/autoscaler-resource-ephemeral-storage: "50Gi"
cluster.provisioning.cattle.io/autoscaler-resource-memory: "4Gi"
```

View File

@@ -0,0 +1,9 @@
# rancher server credentials
url: https://rancher.example.org
token: <rancher token>
# name and namespace of the clusters.provisioning.cattle.io resource on the
# rancher server
clusterName: my-cluster
clusterNamespace: fleet-default
# optional, will be auto-discovered if not specified
#clusterAPIVersion: v1alpha4

View File

@@ -0,0 +1,88 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)
// RKEMachinePool configures a RKE machine pool
type RKEMachinePool struct {
RKECommonNodeConfig `json:",inline"`
Paused bool `json:"paused,omitempty"`
EtcdRole bool `json:"etcdRole,omitempty"`
ControlPlaneRole bool `json:"controlPlaneRole,omitempty"`
WorkerRole bool `json:"workerRole,omitempty"`
DrainBeforeDelete bool `json:"drainBeforeDelete,omitempty"`
DrainBeforeDeleteTimeout *metav1.Duration `json:"drainBeforeDeleteTimeout,omitempty"`
NodeConfig *corev1.ObjectReference `json:"machineConfigRef,omitempty" wrangler:"required"`
Name string `json:"name,omitempty" wrangler:"required"`
DisplayName string `json:"displayName,omitempty"`
Quantity *int32 `json:"quantity,omitempty"`
RollingUpdate *RKEMachinePoolRollingUpdate `json:"rollingUpdate,omitempty"`
MachineDeploymentLabels map[string]string `json:"machineDeploymentLabels,omitempty"`
MachineDeploymentAnnotations map[string]string `json:"machineDeploymentAnnotations,omitempty"`
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
UnhealthyNodeTimeout *metav1.Duration `json:"unhealthyNodeTimeout,omitempty"`
MaxUnhealthy *string `json:"maxUnhealthy,omitempty"`
UnhealthyRange *string `json:"unhealthyRange,omitempty"`
MachineOS string `json:"machineOS,omitempty"`
}
// RKEMachinePoolRollingUpdate configures the rolling update of a machine pool
type RKEMachinePoolRollingUpdate struct {
// The maximum number of machines that can be unavailable during the update.
// Value can be an absolute number (ex: 5) or a percentage of desired
// machines (ex: 10%).
// Absolute number is calculated from percentage by rounding down.
// This can not be 0 if MaxSurge is 0.
// Defaults to 0.
// Example: when this is set to 30%, the old MachineSet can be scaled
// down to 70% of desired machines immediately when the rolling update
// starts. Once new machines are ready, old MachineSet can be scaled
// down further, followed by scaling up the new MachineSet, ensuring
// that the total number of machines available at all times
// during the update is at least 70% of desired machines.
// +optional
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
// The maximum number of machines that can be scheduled above the
// desired number of machines.
// Value can be an absolute number (ex: 5) or a percentage of
// desired machines (ex: 10%).
// This can not be 0 if MaxUnavailable is 0.
// Absolute number is calculated from percentage by rounding up.
// Defaults to 1.
// Example: when this is set to 30%, the new MachineSet can be scaled
// up immediately when the rolling update starts, such that the total
// number of old and new machines do not exceed 130% of desired
// machines. Once old machines have been killed, new MachineSet can
// be scaled up further, ensuring that total number of machines running
// at any time during the update is at most 130% of desired machines.
// +optional
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}
// RKECommonNodeConfig contains common node configuration
type RKECommonNodeConfig struct {
Labels map[string]string `json:"labels,omitempty"`
Taints []corev1.Taint `json:"taints,omitempty"`
CloudCredentialSecretName string `json:"cloudCredentialSecretName,omitempty"`
}

View File

@@ -0,0 +1,57 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package rancher
import (
"fmt"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/discovery"
)
const (
clusterAPIGroup = "cluster.x-k8s.io"
machineDeleteAnnotationKey = clusterAPIGroup + "/delete-machine"
machinePhaseProvisioning = "Provisioning"
machinePhasePending = "Pending"
machinePhaseDeleting = "Deleting"
machineDeploymentNameLabelKey = clusterAPIGroup + "/deployment-name"
machineResourceName = "machines"
)
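// getAPIGroupPreferredVersion looks up the preferred version of the given API
// group via the discovery client.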
func getAPIGroupPreferredVersion(client discovery.DiscoveryInterface, apiGroup string) (string, error) {
groupList, err := client.ServerGroups()
if err != nil {
return "", fmt.Errorf("failed to get ServerGroups: %v", err)
}
for _, group := range groupList.Groups {
if group.Name == apiGroup {
return group.PreferredVersion.Version, nil
}
}
return "", fmt.Errorf("failed to find API group %q", apiGroup)
}
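// machineGVR returns the GroupVersionResource of cluster-api machines for the
// given API version.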
func machineGVR(version string) schema.GroupVersionResource {
return schema.GroupVersionResource{
Group: clusterAPIGroup,
Version: version,
Resource: machineResourceName,
}
}

View File

@@ -0,0 +1,46 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package rancher
import (
"fmt"
"os"
"gopkg.in/yaml.v2"
)
type cloudConfig struct {
URL string `yaml:"url"`
Token string `yaml:"token"`
ClusterName string `yaml:"clusterName"`
ClusterNamespace string `yaml:"clusterNamespace"`
ClusterAPIVersion string `yaml:"clusterAPIVersion"`
}
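// newConfig reads the cloud config file at the given path and unmarshals it
// into a cloudConfig.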
func newConfig(file string) (*cloudConfig, error) {
b, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("unable to read cloud config file: %w", err)
}
config := &cloudConfig{}
if err := yaml.Unmarshal(b, config); err != nil {
return nil, fmt.Errorf("unable to unmarshal config file: %w", err)
}
return config, nil
}

View File

@@ -0,0 +1,42 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package rancher
import "testing"
func TestNewConfig(t *testing.T) {
cfg, err := newConfig("./examples/config.yaml")
if err != nil {
t.Fatal(err)
}
if len(cfg.URL) == 0 {
t.Fatal("expected url to be set")
}
if len(cfg.Token) == 0 {
t.Fatal("expected token to be set")
}
if len(cfg.ClusterName) == 0 {
t.Fatal("expected cluster name to be set")
}
if len(cfg.ClusterNamespace) == 0 {
t.Fatal("expected cluster namespace to be set")
}
}

View File

@@ -0,0 +1,491 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package rancher
import (
"context"
"errors"
"fmt"
"math/rand"
"strconv"
"time"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
provisioningv1 "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/rancher/provisioning.cattle.io/v1"
"k8s.io/autoscaler/cluster-autoscaler/config"
klog "k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/utils/pointer"
)
// nodeGroup implements nodeGroup for rancher machine pools.
type nodeGroup struct {
provider *RancherCloudProvider
name string
labels map[string]string
taints []corev1.Taint
minSize int
maxSize int
resources corev1.ResourceList
replicas int
}
type node struct {
instance cloudprovider.Instance
machine unstructured.Unstructured
}
var (
// errMissingMinSizeAnnotation is the error returned when a machine pool does
// not have the min size annotations attached.
errMissingMinSizeAnnotation = errors.New("missing min size annotation")
// errMissingMaxSizeAnnotation is the error returned when a machine pool does
// not have the max size annotations attached.
errMissingMaxSizeAnnotation = errors.New("missing max size annotation")
// errMissingResourceAnnotation is the error returned when a machine pool does
// not have all the resource annotations attached.
errMissingResourceAnnotation = errors.New("missing resource annotation")
)
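// podCapacity is the pod capacity advertised for template nodes. It matches
// the default kubelet limit of 110 pods per node.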
const podCapacity = 110
// Id returns node group id/name.
func (ng *nodeGroup) Id() string {
return ng.name
}
// MinSize returns minimum size of the node group.
func (ng *nodeGroup) MinSize() int {
return ng.minSize
}
// MaxSize returns maximum size of the node group.
func (ng *nodeGroup) MaxSize() int {
return ng.maxSize
}
// Debug returns a debug string for the node group.
func (ng *nodeGroup) Debug() string {
return fmt.Sprintf("%s (%d:%d)", ng.Id(), ng.MinSize(), ng.MaxSize())
}
// Nodes returns a list of all nodes that belong to this node group.
func (ng *nodeGroup) Nodes() ([]cloudprovider.Instance, error) {
nodes, err := ng.nodes()
if err != nil {
return nil, err
}
instances := make([]cloudprovider.Instance, 0, len(nodes))
for _, node := range nodes {
instances = append(instances, node.instance)
}
return instances, nil
}
// DeleteNodes deletes the specified nodes from the node group.
func (ng *nodeGroup) DeleteNodes(toDelete []*corev1.Node) error {
if ng.replicas-len(toDelete) < ng.MinSize() {
return fmt.Errorf("node group size would be below minimum size - desired: %d, min: %d",
ng.replicas-len(toDelete), ng.MinSize())
}
for _, del := range toDelete {
node, err := ng.findNodeByProviderID(rke2ProviderIDPrefix + del.Name)
if err != nil {
return err
}
klog.V(4).Infof("marking machine for deletion: %v", node.instance.Id)
if err := node.markMachineForDeletion(ng); err != nil {
return fmt.Errorf("unable to mark machine %s for deletion: %w", del.Name, err)
}
if err := ng.setSize(ng.replicas - 1); err != nil {
// rollback deletion mark
_ = node.unmarkMachineForDeletion(ng)
return fmt.Errorf("unable to set node group size: %w", err)
}
}
return nil
}
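// findNodeByProviderID returns the node of this node group that has the given
// provider ID, if any.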
func (ng *nodeGroup) findNodeByProviderID(providerID string) (*node, error) {
nodes, err := ng.nodes()
if err != nil {
return nil, err
}
for _, node := range nodes {
if node.instance.Id == providerID {
return &node, nil
}
}
return nil, fmt.Errorf("node with providerID %s not found in node group %s", providerID, ng.name)
}
// IncreaseSize increases NodeGroup size.
func (ng *nodeGroup) IncreaseSize(delta int) error {
if delta <= 0 {
return fmt.Errorf("size increase must be positive")
}
newSize := ng.replicas + delta
if newSize > ng.MaxSize() {
return fmt.Errorf("size increase too large, desired: %d max: %d", newSize, ng.MaxSize())
}
return ng.setSize(newSize)
}
// TargetSize returns the current TARGET size of the node group. It is possible that the
// number is different from the number of nodes registered in Kubernetes.
func (ng *nodeGroup) TargetSize() (int, error) {
return ng.replicas, nil
}
// DecreaseTargetSize decreases the target size of the node group. This function
// doesn't permit to delete any existing node and can be used only to reduce the
// request for new nodes that have not been yet fulfilled. Delta should be negative.
func (ng *nodeGroup) DecreaseTargetSize(delta int) error {
if delta >= 0 {
return fmt.Errorf("size decrease must be negative")
}
nodes, err := ng.Nodes()
if err != nil {
return fmt.Errorf("failed to get node group nodes: %w", err)
}
if ng.replicas+delta < len(nodes) {
return fmt.Errorf("attempt to delete existing nodes targetSize: %d delta: %d existingNodes: %d",
ng.replicas, delta, len(nodes))
}
return ng.setSize(ng.replicas + delta)
}
// TemplateNodeInfo returns a node template for this node group.
func (ng *nodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("%s-%s-%d", ng.provider.config.ClusterName, ng.Id(), rand.Int63()),
Labels: ng.labels,
},
Spec: corev1.NodeSpec{
Taints: ng.taints,
},
Status: corev1.NodeStatus{
Capacity: ng.resources,
Conditions: cloudprovider.BuildReadyConditions(),
},
}
node.Status.Capacity[corev1.ResourcePods] = *resource.NewQuantity(podCapacity, resource.DecimalSI)
node.Status.Allocatable = node.Status.Capacity
// Setup node info template
nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(ng.Id()))
nodeInfo.SetNode(node)
return nodeInfo, nil
}
// Exist checks if the node group really exists on the cloud provider side.
func (ng *nodeGroup) Exist() bool {
return ng.Id() != ""
}
// Create creates the node group on the cloud provider side.
func (ng *nodeGroup) Create() (cloudprovider.NodeGroup, error) {
return nil, cloudprovider.ErrNotImplemented
}
// Delete deletes the node group on the cloud provider side.
func (ng *nodeGroup) Delete() error {
return cloudprovider.ErrNotImplemented
}
// Autoprovisioned returns true if the node group is autoprovisioned.
func (ng *nodeGroup) Autoprovisioned() bool {
return false
}
// GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular
// NodeGroup. Returning a nil will result in using default options.
func (ng *nodeGroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
return nil, cloudprovider.ErrNotImplemented
}
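// setSize updates the quantity of the machine pool backing this node group
// and records the new replica count.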
func (ng *nodeGroup) setSize(size int) error {
machinePools, err := ng.provider.getMachinePools()
if err != nil {
return err
}
found := false
for i := range machinePools {
if machinePools[i].Name == ng.name {
machinePools[i].Quantity = pointer.Int32(int32(size))
found = true
break
}
}
if !found {
return fmt.Errorf("unable to set size of group %s of cluster %s: group not found",
ng.name, ng.provider.config.ClusterName)
}
if err := ng.provider.updateMachinePools(machinePools); err != nil {
return err
}
ng.replicas = size
return nil
}
// nodes returns all nodes of this node group that have a provider ID set by
// getting the underlying machines and extracting the providerID, which
// corresponds to the name of the k8s node object.
func (ng *nodeGroup) nodes() ([]node, error) {
machines, err := ng.machines()
if err != nil {
return nil, err
}
nodes := make([]node, 0, len(machines))
for _, machine := range machines {
phase, found, err := unstructured.NestedString(machine.UnstructuredContent(), "status", "phase")
if err != nil {
return nil, err
}
if !found {
return nil, fmt.Errorf("machine %s/%s does not have status.phase field", machine.GetName(), machine.GetNamespace())
}
providerID, found, err := unstructured.NestedString(machine.UnstructuredContent(), "spec", "providerID")
if err != nil {
return nil, err
}
if !found {
if phase == machinePhaseProvisioning {
// if the provider ID is missing during provisioning, we
// ignore this node to avoid errors in the autoscaler.
continue
}
return nil, fmt.Errorf("could not find providerID in machine: %s/%s", machine.GetName(), machine.GetNamespace())
}
state := cloudprovider.InstanceRunning
switch phase {
case machinePhasePending, machinePhaseProvisioning:
state = cloudprovider.InstanceCreating
case machinePhaseDeleting:
state = cloudprovider.InstanceDeleting
}
nodes = append(nodes, node{
machine: machine,
instance: cloudprovider.Instance{
Id: providerID,
Status: &cloudprovider.InstanceStatus{
State: state,
},
},
})
}
return nodes, nil
}
// machines returns the unstructured objects of all cluster-api machines in a
// node group. The machines are found using the deployment name label.
func (ng *nodeGroup) machines() ([]unstructured.Unstructured, error) {
machinesList, err := ng.provider.client.Resource(machineGVR(ng.provider.config.ClusterAPIVersion)).
Namespace(ng.provider.config.ClusterNamespace).List(
context.TODO(), metav1.ListOptions{
// we find all machines belonging to an rke2 machinePool by the
// deployment name, since it is just <cluster name>-<machinePool name>
LabelSelector: fmt.Sprintf("%s=%s-%s", machineDeploymentNameLabelKey, ng.provider.config.ClusterName, ng.name),
},
)
if err != nil {
return nil, err
}
return machinesList.Items, nil
}
// markMachineForDeletion sets an annotation on the cluster-api machine
object, indicating that this node is a candidate to be removed on scale
// down of the controlling resource (machineSet/machineDeployment).
func (n *node) markMachineForDeletion(ng *nodeGroup) error {
u, err := ng.provider.client.Resource(machineGVR(ng.provider.config.ClusterAPIVersion)).Namespace(n.machine.GetNamespace()).
Get(context.TODO(), n.machine.GetName(), metav1.GetOptions{})
if err != nil {
return err
}
u = u.DeepCopy()
annotations := u.GetAnnotations()
if annotations == nil {
annotations = map[string]string{}
}
annotations[machineDeleteAnnotationKey] = time.Now().String()
u.SetAnnotations(annotations)
_, err = ng.provider.client.Resource(machineGVR(ng.provider.config.ClusterAPIVersion)).Namespace(u.GetNamespace()).
Update(context.TODO(), u, metav1.UpdateOptions{})
return err
}
// unmarkMachineForDeletion removes the machine delete annotation.
func (n *node) unmarkMachineForDeletion(ng *nodeGroup) error {
u, err := ng.provider.client.Resource(machineGVR(ng.provider.config.ClusterAPIVersion)).Namespace(n.machine.GetNamespace()).
Get(context.TODO(), n.machine.GetName(), metav1.GetOptions{})
if err != nil {
return err
}
u = u.DeepCopy()
annotations := u.GetAnnotations()
if annotations == nil {
annotations = map[string]string{}
}
delete(annotations, machineDeleteAnnotationKey)
u.SetAnnotations(annotations)
_, err = ng.provider.client.Resource(machineGVR(ng.provider.config.ClusterAPIVersion)).Namespace(u.GetNamespace()).
Update(context.TODO(), u, metav1.UpdateOptions{})
return err
}
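// newNodeGroupFromMachinePool builds a nodeGroup from an RKE machine pool,
// reading the min/max size and optional resource annotations.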
func newNodeGroupFromMachinePool(provider *RancherCloudProvider, machinePool provisioningv1.RKEMachinePool) (*nodeGroup, error) {
if machinePool.Quantity == nil {
return nil, errors.New("machine pool quantity is not set")
}
minSize, maxSize, err := parseScalingAnnotations(machinePool.MachineDeploymentAnnotations)
if err != nil {
return nil, fmt.Errorf("error parsing scaling annotations: %w", err)
}
resources, err := parseResourceAnnotations(machinePool.MachineDeploymentAnnotations)
if err != nil {
if !errors.Is(err, errMissingResourceAnnotation) {
return nil, fmt.Errorf("error parsing resource annotations: %w", err)
}
// if the resource labels are missing, we simply initialize an empty
// list. The autoscaler can still work but won't scale up from 0 if a
// pod requests any resources.
resources = corev1.ResourceList{}
}
return &nodeGroup{
provider: provider,
name: machinePool.Name,
labels: machinePool.Labels,
taints: machinePool.Taints,
minSize: minSize,
maxSize: maxSize,
replicas: int(*machinePool.Quantity),
resources: resources,
}, nil
}
func parseResourceAnnotations(annotations map[string]string) (corev1.ResourceList, error) {
cpu, ok := annotations[resourceCPUAnnotation]
if !ok {
return nil, errMissingResourceAnnotation
}
cpuResources, err := resource.ParseQuantity(cpu)
if err != nil {
return nil, fmt.Errorf("unable to parse cpu resources: %s", cpu)
}
memory, ok := annotations[resourceMemoryAnnotation]
if !ok {
return nil, errMissingResourceAnnotation
}
memoryResources, err := resource.ParseQuantity(memory)
if err != nil {
return nil, fmt.Errorf("unable to parse memory resources: %s", memory)
}
ephemeralStorage, ok := annotations[resourceEphemeralStorageAnnotation]
if !ok {
return nil, errMissingResourceAnnotation
}
ephemeralStorageResources, err := resource.ParseQuantity(ephemeralStorage)
if err != nil {
return nil, fmt.Errorf("unable to parse ephemeral storage resources: %s", ephemeralStorage)
}
return corev1.ResourceList{
corev1.ResourceCPU: cpuResources,
corev1.ResourceMemory: memoryResources,
corev1.ResourceEphemeralStorage: ephemeralStorageResources,
}, nil
}
func parseScalingAnnotations(annotations map[string]string) (int, int, error) {
min, ok := annotations[minSizeAnnotation]
if !ok {
return 0, 0, errMissingMinSizeAnnotation
}
minSize, err := strconv.Atoi(min)
if err != nil {
return 0, 0, fmt.Errorf("unable to parse min size: %s", min)
}
max, ok := annotations[maxSizeAnnotation]
if !ok {
return 0, 0, errMissingMaxSizeAnnotation
}
maxSize, err := strconv.Atoi(max)
if err != nil {
return 0, 0, fmt.Errorf("unable to parse max size: %s", max)
}
if minSize < 0 || maxSize < 0 {
return 0, 0, fmt.Errorf("invalid min or max size supplied: %v/%v", minSize, maxSize)
}
return minSize, maxSize, nil
}

View File

@@ -0,0 +1,567 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package rancher
import (
"fmt"
"reflect"
"strings"
"testing"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
provisioningv1 "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/rancher/provisioning.cattle.io/v1"
fakedynamic "k8s.io/client-go/dynamic/fake"
"k8s.io/utils/pointer"
)
const (
testCluster = "test-cluster"
testNamespace = "default"
nodeGroupDev = "dev"
nodeGroupProd = "ng-long-prod"
)
func TestNodeGroupNodes(t *testing.T) {
tests := []struct {
name string
nodeGroup nodeGroup
expectedNodes int
expectedErrContains string
machines func() []runtime.Object
}{
{
name: "normal",
nodeGroup: nodeGroup{name: nodeGroupDev},
expectedNodes: 2,
machines: func() []runtime.Object {
return []runtime.Object{
newMachine(nodeGroupDev, 0),
newMachine(nodeGroupDev, 1),
}
},
},
{
name: "mixed machines",
nodeGroup: nodeGroup{name: nodeGroupDev},
expectedNodes: 3,
machines: func() []runtime.Object {
return []runtime.Object{
newMachine(nodeGroupDev, 0),
newMachine(nodeGroupDev, 1),
newMachine(nodeGroupDev, 2),
newMachine(nodeGroupProd, 0),
newMachine(nodeGroupProd, 1),
}
},
},
{
name: "no matching machines",
nodeGroup: nodeGroup{name: nodeGroupDev},
expectedNodes: 0,
machines: func() []runtime.Object {
return []runtime.Object{
newMachine(nodeGroupProd, 0),
newMachine(nodeGroupProd, 1),
}
},
},
{
name: "machine without provider id",
nodeGroup: nodeGroup{name: nodeGroupDev},
expectedNodes: 0,
expectedErrContains: "could not find providerID in machine",
machines: func() []runtime.Object {
machine := newMachine(nodeGroupDev, 0)
_ = unstructured.SetNestedMap(machine.Object, map[string]interface{}{}, "spec")
return []runtime.Object{machine}
},
},
{
name: "machine without provider id during provisioning",
nodeGroup: nodeGroup{name: nodeGroupDev},
expectedNodes: 1,
machines: func() []runtime.Object {
machineProvisioning := newMachine(nodeGroupDev, 0)
_ = unstructured.SetNestedMap(machineProvisioning.Object, map[string]interface{}{}, "spec")
_ = unstructured.SetNestedField(machineProvisioning.Object, machinePhaseProvisioning, "status", "phase")
return []runtime.Object{
machineProvisioning,
newMachine(nodeGroupDev, 1),
}
},
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
provider, err := setup(tc.machines())
if err != nil {
t.Fatal(err)
}
tc.nodeGroup.provider = provider
nodes, err := tc.nodeGroup.Nodes()
if err != nil {
if tc.expectedErrContains == "" || !strings.Contains(err.Error(), tc.expectedErrContains) {
t.Fatalf("expected err to contain %q, got %q", tc.expectedErrContains, err)
}
}
if len(nodes) != tc.expectedNodes {
t.Fatalf("expected %v nodes, got %v", tc.expectedNodes, len(nodes))
}
})
}
}
func TestNodeGroupDeleteNodes(t *testing.T) {
tests := []struct {
name string
nodeGroup nodeGroup
expectedTargetSize int
expectedErrContains string
machines []runtime.Object
toDelete []*corev1.Node
}{
{
name: "delete node",
nodeGroup: nodeGroup{
name: nodeGroupDev,
replicas: 1,
minSize: 0,
maxSize: 2,
},
expectedTargetSize: 0,
machines: []runtime.Object{newMachine(nodeGroupDev, 0)},
toDelete: []*corev1.Node{
{ObjectMeta: v1.ObjectMeta{Name: nodeName(nodeGroupDev, 0)}},
},
},
{
name: "delete multiple nodes",
nodeGroup: nodeGroup{
name: nodeGroupDev,
replicas: 3,
minSize: 0,
maxSize: 3,
},
expectedTargetSize: 1,
machines: []runtime.Object{newMachine(nodeGroupDev, 0), newMachine(nodeGroupDev, 1), newMachine(nodeGroupDev, 2)},
toDelete: []*corev1.Node{
{ObjectMeta: v1.ObjectMeta{Name: nodeName(nodeGroupDev, 0)}},
{ObjectMeta: v1.ObjectMeta{Name: nodeName(nodeGroupDev, 2)}},
},
},
{
name: "delete unknown node",
nodeGroup: nodeGroup{
name: nodeGroupDev,
replicas: 1,
minSize: 0,
maxSize: 2,
},
expectedTargetSize: 1,
expectedErrContains: fmt.Sprintf("node with providerID rke2://%s not found in node group %s", nodeName(nodeGroupDev, 42), nodeGroupDev),
machines: []runtime.Object{newMachine(nodeGroupDev, 0)},
toDelete: []*corev1.Node{
{ObjectMeta: v1.ObjectMeta{Name: nodeName(nodeGroupDev, 42)}},
},
},
{
name: "delete more nodes than min size",
nodeGroup: nodeGroup{
name: nodeGroupDev,
replicas: 2,
minSize: 1,
maxSize: 2,
},
expectedTargetSize: 2,
expectedErrContains: "node group size would be below minimum size - desired: 0, min: 1",
machines: []runtime.Object{newMachine(nodeGroupDev, 0), newMachine(nodeGroupDev, 1)},
toDelete: []*corev1.Node{
{ObjectMeta: v1.ObjectMeta{Name: nodeName(nodeGroupDev, 0)}},
{ObjectMeta: v1.ObjectMeta{Name: nodeName(nodeGroupDev, 1)}},
},
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
provider, err := setup(tc.machines)
if err != nil {
t.Fatal(err)
}
tc.nodeGroup.provider = provider
if err := provider.Refresh(); err != nil {
t.Fatal(err)
}
// store delta before deleting nodes
delta := tc.nodeGroup.replicas - tc.expectedTargetSize
if err := tc.nodeGroup.DeleteNodes(tc.toDelete); err != nil {
if tc.expectedErrContains == "" || !strings.Contains(err.Error(), tc.expectedErrContains) {
t.Fatalf("expected err to contain %q, got %q", tc.expectedErrContains, err)
}
}
targetSize, err := tc.nodeGroup.TargetSize()
if err != nil {
t.Fatal(err)
}
if tc.expectedTargetSize != targetSize {
t.Fatalf("expected target size %v, got %v", tc.expectedTargetSize, targetSize)
}
machines, err := tc.nodeGroup.machines()
if err != nil {
t.Fatal(err)
}
annotationCount := 0
for _, machine := range machines {
if _, ok := machine.GetAnnotations()[machineDeleteAnnotationKey]; ok {
annotationCount++
}
}
if annotationCount != delta {
t.Fatalf("expected %v machines to have the deleted annotation, got %v", delta, annotationCount)
}
})
}
}
func TestIncreaseTargetSize(t *testing.T) {
tests := []struct {
name string
delta int
nodeGroup nodeGroup
expectedErrContains string
}{
{
name: "valid increase",
nodeGroup: nodeGroup{
name: nodeGroupDev,
replicas: 0,
minSize: 0,
maxSize: 2,
},
delta: 2,
},
{
name: "too large",
nodeGroup: nodeGroup{
name: nodeGroupDev,
replicas: 1,
minSize: 0,
maxSize: 2,
},
delta: 2,
expectedErrContains: "size increase too large, desired: 3 max: 2",
},
{
name: "negative",
nodeGroup: nodeGroup{
name: nodeGroupDev,
replicas: 2,
minSize: 0,
maxSize: 2,
},
delta: -2,
expectedErrContains: "size increase must be positive",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
provider, err := setup(nil)
if err != nil {
t.Fatal(err)
}
tc.nodeGroup.provider = provider
if err := tc.nodeGroup.IncreaseSize(tc.delta); err != nil {
if tc.expectedErrContains == "" || !strings.Contains(err.Error(), tc.expectedErrContains) {
t.Fatalf("expected err to contain %q, got %q", tc.expectedErrContains, err)
}
}
})
}
}
func TestDecreaseTargetSize(t *testing.T) {
tests := []struct {
name string
delta int
nodeGroup nodeGroup
expectedErrContains string
}{
{
name: "valid decrease",
nodeGroup: nodeGroup{
name: nodeGroupDev,
replicas: 2,
minSize: 0,
maxSize: 2,
},
delta: -2,
},
{
name: "too large",
nodeGroup: nodeGroup{
name: nodeGroupDev,
replicas: 1,
minSize: 0,
maxSize: 2,
},
delta: -2,
expectedErrContains: "attempt to delete existing nodes targetSize: 1 delta: -2 existingNodes: 0",
},
{
name: "positive",
nodeGroup: nodeGroup{
name: nodeGroupDev,
replicas: 2,
minSize: 0,
maxSize: 2,
},
delta: 2,
expectedErrContains: "size decrease must be negative",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
provider, err := setup(nil)
if err != nil {
t.Fatal(err)
}
tc.nodeGroup.provider = provider
if err := tc.nodeGroup.DecreaseTargetSize(tc.delta); err != nil {
if tc.expectedErrContains == "" || !strings.Contains(err.Error(), tc.expectedErrContains) {
t.Fatalf("expected err to contain %q, got %q", tc.expectedErrContains, err)
}
}
})
}
}
func TestTemplateNodeInfo(t *testing.T) {
provider, err := setup(nil)
if err != nil {
t.Fatal(err)
}
ng := nodeGroup{
name: nodeGroupDev,
replicas: 2,
minSize: 0,
maxSize: 2,
provider: provider,
resources: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("2"),
corev1.ResourceMemory: resource.MustParse("2Gi"),
corev1.ResourceEphemeralStorage: resource.MustParse("100Gi"),
},
}
nodeInfo, err := ng.TemplateNodeInfo()
if err != nil {
t.Fatal(err)
}
if nodeInfo.Allocatable.MilliCPU != ng.resources.Cpu().MilliValue() {
t.Fatalf("expected nodeInfo to have %v MilliCPU, got %v",
ng.resources.Cpu().MilliValue(), nodeInfo.Allocatable.MilliCPU)
}
if nodeInfo.Allocatable.Memory != ng.resources.Memory().Value() {
t.Fatalf("expected nodeInfo to have %v Memory, got %v",
ng.resources.Memory().Value(), nodeInfo.Allocatable.Memory)
}
if nodeInfo.Allocatable.EphemeralStorage != ng.resources.StorageEphemeral().Value() {
t.Fatalf("expected nodeInfo to have %v ephemeral storage, got %v",
ng.resources.StorageEphemeral().Value(), nodeInfo.Allocatable.EphemeralStorage)
}
}
func TestNewNodeGroupFromMachinePool(t *testing.T) {
provider, err := setup(nil)
if err != nil {
t.Fatal(err)
}
tests := []struct {
name string
machinePool provisioningv1.RKEMachinePool
expectedErrContains string
expectedResources corev1.ResourceList
}{
{
name: "valid",
machinePool: provisioningv1.RKEMachinePool{
Name: nodeGroupDev,
Quantity: pointer.Int32(1),
MachineDeploymentAnnotations: map[string]string{
minSizeAnnotation: "0",
maxSizeAnnotation: "3",
resourceCPUAnnotation: "2",
resourceMemoryAnnotation: "4Gi",
resourceEphemeralStorageAnnotation: "50Gi",
},
},
expectedResources: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("2"),
corev1.ResourceMemory: resource.MustParse("4Gi"),
corev1.ResourceEphemeralStorage: resource.MustParse("50Gi"),
},
},
{
name: "missing size annotations",
expectedErrContains: "missing min size annotation",
machinePool: provisioningv1.RKEMachinePool{
Name: nodeGroupDev,
Quantity: pointer.Int32(1),
MachineDeploymentAnnotations: map[string]string{
resourceCPUAnnotation: "2",
resourceMemoryAnnotation: "4Gi",
resourceEphemeralStorageAnnotation: "50Gi",
},
},
},
{
name: "missing resource annotations",
machinePool: provisioningv1.RKEMachinePool{
Name: nodeGroupDev,
Quantity: pointer.Int32(1),
MachineDeploymentAnnotations: map[string]string{
minSizeAnnotation: "0",
maxSizeAnnotation: "3",
},
},
expectedResources: corev1.ResourceList{},
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
ng, err := newNodeGroupFromMachinePool(provider, tc.machinePool)
if err != nil {
if tc.expectedErrContains == "" || !strings.Contains(err.Error(), tc.expectedErrContains) {
t.Fatalf("expected err to contain %q, got %q", tc.expectedErrContains, err)
}
return
}
if ng.replicas != int(*tc.machinePool.Quantity) {
t.Fatalf("expected nodegroup replicas %v, got %v", *tc.machinePool.Quantity, ng.replicas)
}
if !reflect.DeepEqual(tc.expectedResources, ng.resources) {
t.Fatalf("expected resources %v do not match node group resources %v", tc.expectedResources, ng.resources)
}
})
}
}
func setup(machines []runtime.Object) (*RancherCloudProvider, error) {
config := &cloudConfig{
ClusterName: testCluster,
ClusterNamespace: testNamespace,
ClusterAPIVersion: "v1alpha4",
}
machinePools := []provisioningv1.RKEMachinePool{
{
Name: nodeGroupDev,
Quantity: pointer.Int32(1),
MachineDeploymentAnnotations: map[string]string{
minSizeAnnotation: "0",
maxSizeAnnotation: "3",
resourceCPUAnnotation: "2",
resourceMemoryAnnotation: "4Gi",
resourceEphemeralStorageAnnotation: "50Gi",
},
},
{
Name: nodeGroupProd,
Quantity: pointer.Int32(3),
MachineDeploymentAnnotations: map[string]string{
minSizeAnnotation: "0",
maxSizeAnnotation: "3",
resourceCPUAnnotation: "2",
resourceMemoryAnnotation: "4Gi",
resourceEphemeralStorageAnnotation: "50Gi",
},
},
}
pools, err := machinePoolsToUnstructured(machinePools)
if err != nil {
return nil, err
}
return &RancherCloudProvider{
resourceLimiter: &cloudprovider.ResourceLimiter{},
client: fakedynamic.NewSimpleDynamicClientWithCustomListKinds(
runtime.NewScheme(),
map[schema.GroupVersionResource]string{
machineGVR(config.ClusterAPIVersion): "kindList",
},
append(machines, newCluster(testCluster, testNamespace, pools))...,
),
config: config,
}, nil
}
func newMachine(nodeGroupName string, num int) *unstructured.Unstructured {
return &unstructured.Unstructured{
Object: map[string]interface{}{
"kind": "Machine",
"apiVersion": "cluster.x-k8s.io/v1alpha4",
"metadata": map[string]interface{}{
"name": nodeName(nodeGroupName, num),
"namespace": testNamespace,
"labels": map[string]interface{}{
machineDeploymentNameLabelKey: fmt.Sprintf("%s-%s", testCluster, nodeGroupName),
},
},
"spec": map[string]interface{}{
"clusterName": testCluster,
"providerID": rke2ProviderIDPrefix + nodeName(nodeGroupName, num),
},
"status": map[string]interface{}{
"phase": "Running",
},
},
}
}
func nodeName(nodeGroupName string, num int) string {
return fmt.Sprintf("%s-%s-123456-%v", testCluster, nodeGroupName, num)
}

View File

@@ -0,0 +1,312 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package rancher
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
autoscalererrors "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/client-go/discovery"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/rest"
provisioningv1 "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/rancher/provisioning.cattle.io/v1"
klog "k8s.io/klog/v2"
)
const (
// providerName is the cloud provider name for rancher
providerName = "rancher"
// rke2ProviderID identifies nodes that are using RKE2
rke2ProviderID = "rke2"
rke2ProviderIDPrefix = rke2ProviderID + "://"
rancherProvisioningGroup = "provisioning.cattle.io"
rancherProvisioningVersion = "v1"
rancherLocalClusterPath = "/k8s/clusters/local"
minSizeAnnotation = "cluster.provisioning.cattle.io/autoscaler-min-size"
maxSizeAnnotation = "cluster.provisioning.cattle.io/autoscaler-max-size"
resourceCPUAnnotation = "cluster.provisioning.cattle.io/autoscaler-resource-cpu"
resourceMemoryAnnotation = "cluster.provisioning.cattle.io/autoscaler-resource-memory"
resourceEphemeralStorageAnnotation = "cluster.provisioning.cattle.io/autoscaler-resource-ephemeral-storage"
)
// RancherCloudProvider implements CloudProvider interface for rancher
type RancherCloudProvider struct {
resourceLimiter *cloudprovider.ResourceLimiter
client dynamic.Interface
nodeGroups []*nodeGroup
config *cloudConfig
}
// BuildRancher builds rancher cloud provider.
func BuildRancher(opts config.AutoscalingOptions, _ cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider {
provider, err := newRancherCloudProvider(opts.CloudConfig, rl)
if err != nil {
klog.Fatalf("failed to create rancher cloud provider: %v", err)
}
return provider
}
func newRancherCloudProvider(cloudConfig string, resourceLimiter *cloudprovider.ResourceLimiter) (*RancherCloudProvider, error) {
config, err := newConfig(cloudConfig)
if err != nil {
return nil, fmt.Errorf("unable to create cloud config: %w", err)
}
restConfig := &rest.Config{
Host: config.URL,
APIPath: rancherLocalClusterPath,
BearerToken: config.Token,
}
client, err := dynamic.NewForConfig(restConfig)
if err != nil {
return nil, fmt.Errorf("unable to create dynamic client: %w", err)
}
discovery, err := discovery.NewDiscoveryClientForConfig(restConfig)
if err != nil {
return nil, fmt.Errorf("unable to create discovery client: %w", err)
}
if config.ClusterAPIVersion == "" {
// automatically discover cluster API version
clusterAPIVersion, err := getAPIGroupPreferredVersion(discovery, clusterAPIGroup)
if err != nil {
return nil, err
}
config.ClusterAPIVersion = clusterAPIVersion
}
return &RancherCloudProvider{
resourceLimiter: resourceLimiter,
client: client,
config: config,
}, nil
}
// Name returns name of the cloud provider.
func (provider *RancherCloudProvider) Name() string {
return providerName
}
// GPULabel returns the label added to nodes with GPU resource.
func (provider *RancherCloudProvider) GPULabel() string {
return ""
}
// GetAvailableGPUTypes return all available GPU types cloud provider supports
func (provider *RancherCloudProvider) GetAvailableGPUTypes() map[string]struct{} {
// TODO: implement GPU support
return nil
}
// NodeGroups returns all node groups configured for this cloud provider.
func (provider *RancherCloudProvider) NodeGroups() []cloudprovider.NodeGroup {
nodeGroups := make([]cloudprovider.NodeGroup, len(provider.nodeGroups))
for i, ng := range provider.nodeGroups {
nodeGroups[i] = ng
}
return nodeGroups
}
// Pricing returns pricing model for this cloud provider or error if not available.
func (provider *RancherCloudProvider) Pricing() (cloudprovider.PricingModel, autoscalererrors.AutoscalerError) {
return nil, cloudprovider.ErrNotImplemented
}
// NodeGroupForNode returns the node group for the given node.
func (provider *RancherCloudProvider) NodeGroupForNode(node *corev1.Node) (cloudprovider.NodeGroup, error) {
// skip nodes that are not managed by rke2.
if !strings.HasPrefix(node.Spec.ProviderID, rke2ProviderID) {
return nil, nil
}
for _, group := range provider.nodeGroups {
// the node name is expected to have the following format:
// <cluster-name>-<node group name>-<random int>-<random string>
// so we trim the cluster name and then cut off the last two parts to
// leave us with the node group name
parts := strings.Split(strings.TrimPrefix(node.Name, provider.config.ClusterName), "-")
if len(parts) < 4 {
return nil, fmt.Errorf("unable to get node group name out of node %s: unexpected node name format", node.Name)
}
groupName := strings.Join(parts[1:len(parts)-2], "-")
if group.name == groupName {
return group, nil
}
}
// if node is not in one of our scalable nodeGroups, we return nil so it
// won't be processed further by the CA.
return nil, nil
}
// GetAvailableMachineTypes get all machine types that can be requested from the cloud provider.
// Implementation optional.
func (provider *RancherCloudProvider) GetAvailableMachineTypes() ([]string, error) {
return []string{}, cloudprovider.ErrNotImplemented
}
// NewNodeGroup builds a theoretical node group based on the node definition provided.
func (provider *RancherCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
taints []corev1.Taint,
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
return nil, cloudprovider.ErrNotImplemented
}
// GetResourceLimiter returns struct containing limits (max, min) for resources (cores, memory etc.).
func (provider *RancherCloudProvider) GetResourceLimiter() (*cloudprovider.ResourceLimiter, error) {
return provider.resourceLimiter, nil
}
// Refresh is called before every main loop and can be used to dynamically update cloud provider state.
// In particular the list of node groups returned by NodeGroups can change as a result of CloudProvider.Refresh().
func (provider *RancherCloudProvider) Refresh() error {
nodeGroups, err := provider.scalableNodeGroups()
if err != nil {
return fmt.Errorf("unable to get node groups from cluster: %w", err)
}
provider.nodeGroups = nodeGroups
return nil
}
// Cleanup cleans up all resources before the cloud provider is removed
func (provider *RancherCloudProvider) Cleanup() error {
return nil
}
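// scalableNodeGroups returns a node group for every machine pool of the
// cluster that has the min/max size annotations set.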
func (provider *RancherCloudProvider) scalableNodeGroups() ([]*nodeGroup, error) {
var result []*nodeGroup
pools, err := provider.getMachinePools()
if err != nil {
return nil, err
}
for _, pool := range pools {
nodeGroup, err := newNodeGroupFromMachinePool(provider, pool)
if err != nil {
if isNotScalable(err) {
klog.V(4).Infof("ignoring machine pool %s as it does not have min/max annotations", pool.Name)
continue
}
return nil, fmt.Errorf("error getting node group from machine pool: %w", err)
}
klog.V(4).Infof("scalable node group found: %s", nodeGroup.Debug())
result = append(result, nodeGroup)
}
return result, nil
}
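// clusterGVR returns the GroupVersionResource of rancher provisioning clusters.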
func clusterGVR() schema.GroupVersionResource {
return schema.GroupVersionResource{
Group: rancherProvisioningGroup,
Version: rancherProvisioningVersion,
Resource: "clusters",
}
}
func (provider *RancherCloudProvider) getMachinePools() ([]provisioningv1.RKEMachinePool, error) {
res, err := provider.client.Resource(clusterGVR()).
Namespace(provider.config.ClusterNamespace).
Get(context.TODO(), provider.config.ClusterName, metav1.GetOptions{})
if err != nil {
return nil, fmt.Errorf("error getting cluster: %w", err)
}
machinePools, ok, err := unstructured.NestedFieldNoCopy(res.Object, "spec", "rkeConfig", "machinePools")
if err != nil {
return nil, err
}
if !ok {
return nil, fmt.Errorf("unable to find machinePools of cluster %s", provider.config.ClusterName)
}
data, err := json.Marshal(machinePools)
if err != nil {
return nil, err
}
var pools []provisioningv1.RKEMachinePool
err = json.Unmarshal(data, &pools)
return pools, err
}
func (provider *RancherCloudProvider) updateMachinePools(machinePools []provisioningv1.RKEMachinePool) error {
cluster, err := provider.client.Resource(clusterGVR()).
Namespace(provider.config.ClusterNamespace).
Get(context.TODO(), provider.config.ClusterName, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("error getting cluster: %w", err)
}
pools, err := machinePoolsToUnstructured(machinePools)
if err != nil {
return err
}
if err := unstructured.SetNestedSlice(cluster.Object, pools, "spec", "rkeConfig", "machinePools"); err != nil {
return err
}
_, err = provider.client.Resource(clusterGVR()).Namespace(provider.config.ClusterNamespace).
Update(context.TODO(), &unstructured.Unstructured{Object: cluster.Object}, metav1.UpdateOptions{})
return err
}
// converts machinePools into a usable form for the unstructured client.
// unstructured.SetNestedSlice expects types produced by json.Unmarshal(),
// so we marshal and unmarshal again before passing it on.
func machinePoolsToUnstructured(machinePools []provisioningv1.RKEMachinePool) ([]interface{}, error) {
data, err := json.Marshal(machinePools)
if err != nil {
return nil, err
}
var pools []interface{}
if err := json.Unmarshal(data, &pools); err != nil {
return nil, err
}
return pools, nil
}
func isNotScalable(err error) bool {
return errors.Is(err, errMissingMinSizeAnnotation) || errors.Is(err, errMissingMaxSizeAnnotation)
}

View File

@@ -0,0 +1,213 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package rancher
import (
"strings"
"testing"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
provisioningv1 "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/rancher/provisioning.cattle.io/v1"
fakedynamic "k8s.io/client-go/dynamic/fake"
"k8s.io/utils/pointer"
)
func TestNodeGroups(t *testing.T) {
tests := []struct {
name string
machinePools []provisioningv1.RKEMachinePool
expectedGroups int
expectedErrContains string
expectedResources corev1.ResourceList
clusterNameOverride string
}{
{
name: "normal",
machinePools: []provisioningv1.RKEMachinePool{
{
Name: nodeGroupDev,
Quantity: pointer.Int32(1),
MachineDeploymentAnnotations: map[string]string{
minSizeAnnotation: "0",
maxSizeAnnotation: "3",
},
},
},
expectedGroups: 1,
},
{
name: "without size annotations",
machinePools: []provisioningv1.RKEMachinePool{
{
Name: nodeGroupDev,
Quantity: pointer.Int32(1),
},
},
expectedGroups: 0,
},
{
name: "missing quantity",
expectedGroups: 0,
expectedErrContains: "machine pool quantity is not set",
},
{
name: "missing cluster",
expectedGroups: 0,
expectedErrContains: "clusters.provisioning.cattle.io \"some-other-cluster\" not found",
clusterNameOverride: "some-other-cluster",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
pools, err := machinePoolsToUnstructured(tc.machinePools)
if err != nil {
t.Fatal(err)
}
config := &cloudConfig{
ClusterName: "test-cluster",
ClusterNamespace: "default",
}
cluster := newCluster(config.ClusterName, config.ClusterNamespace, pools)
if tc.clusterNameOverride != "" {
config.ClusterName = tc.clusterNameOverride
}
provider := RancherCloudProvider{
resourceLimiter: &cloudprovider.ResourceLimiter{},
client: fakedynamic.NewSimpleDynamicClientWithCustomListKinds(
runtime.NewScheme(),
map[schema.GroupVersionResource]string{
clusterGVR(): "kindList",
},
cluster,
),
config: config,
}
if err := provider.Refresh(); err != nil {
if tc.expectedErrContains == "" || !strings.Contains(err.Error(), tc.expectedErrContains) {
t.Fatalf("expected err to contain %q, got %q", tc.expectedErrContains, err)
}
}
if len(provider.NodeGroups()) != tc.expectedGroups {
t.Fatalf("expected %q groups, got %q", tc.expectedGroups, len(provider.NodeGroups()))
}
})
}
}
func TestNodeGroupForNode(t *testing.T) {
provider, err := setup(nil)
if err != nil {
t.Fatal(err)
}
if err := provider.Refresh(); err != nil {
t.Fatal(err)
}
tests := []struct {
name string
node *corev1.Node
nodeGroupId string
expectedNodeGroupNil bool
}{
{
name: "match dev",
node: &corev1.Node{
ObjectMeta: v1.ObjectMeta{Name: nodeName(nodeGroupDev, 0)},
Spec: corev1.NodeSpec{
ProviderID: rke2ProviderIDPrefix + nodeName(nodeGroupDev, 0),
},
},
nodeGroupId: nodeGroupDev,
},
{
name: "match prod",
node: &corev1.Node{
ObjectMeta: v1.ObjectMeta{Name: nodeName(nodeGroupProd, 0)},
Spec: corev1.NodeSpec{
ProviderID: rke2ProviderIDPrefix + nodeName(nodeGroupProd, 0),
},
},
nodeGroupId: nodeGroupProd,
},
{
name: "not rke2 node",
node: &corev1.Node{
ObjectMeta: v1.ObjectMeta{Name: nodeName(nodeGroupDev, 0)},
Spec: corev1.NodeSpec{
ProviderID: "whatever://" + nodeName(nodeGroupDev, 0),
},
},
expectedNodeGroupNil: true,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
ng, err := provider.NodeGroupForNode(tc.node)
if err != nil {
t.Fatal(err)
}
if !tc.expectedNodeGroupNil {
if ng == nil {
t.Fatalf("expected node group from node %s", tc.node.Name)
}
if tc.nodeGroupId != ng.Id() {
t.Fatalf("expected node group id %s, got %s", tc.nodeGroupId, ng.Id())
}
} else {
if ng != nil {
t.Fatalf("expected node group to be nil, got %v", ng)
}
}
})
}
}
func newCluster(name, namespace string, machinePools interface{}) *unstructured.Unstructured {
return &unstructured.Unstructured{
Object: map[string]interface{}{
"kind": "Cluster",
"apiVersion": rancherProvisioningGroup + "/" + rancherProvisioningVersion,
"metadata": map[string]interface{}{
"name": name,
"namespace": namespace,
},
"spec": map[string]interface{}{
"rkeConfig": map[string]interface{}{
"machinePools": machinePools,
},
},
"status": map[string]interface{}{},
},
}
}