Merge pull request #4237 from DataDog/autoscaling-options-azure
implement GetOptions for Azure
This commit is contained in:
commit
a498045443
|
|
@ -54,6 +54,26 @@ k8s.io_cluster-autoscaler_node-template_resources_cpu: 3800m
|
||||||
k8s.io_cluster-autoscaler_node-template_resources_memory: 11Gi
|
k8s.io_cluster-autoscaler_node-template_resources_memory: 11Gi
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Autoscaling options
|
||||||
|
|
||||||
|
Some autoscaling options can be defined per VM Scale Set, with tags.
|
||||||
|
Those tags values have the format as the respective cluster-autoscaler flags they override: floats or durations encoded as strings.
|
||||||
|
|
||||||
|
Supported options tags (with example values) are:
|
||||||
|
```
|
||||||
|
# overrides --scale-down-utilization-threshold global value for that specific VM Scale Set
|
||||||
|
k8s.io_cluster-autoscaler_node-template_autoscaling-options_scaledownutilizationthreshold: "0.5"
|
||||||
|
|
||||||
|
# overrides --scale-down-gpu-utilization-threshold global value for that specific VM Scale Set
|
||||||
|
k8s.io_cluster-autoscaler_node-template_autoscaling-options_scaledowngpuutilizationthreshold: "0.5"
|
||||||
|
|
||||||
|
# overrides --scale-down-unneeded-time global value for that specific VM Scale Set
|
||||||
|
k8s.io_cluster-autoscaler_node-template_autoscaling-options_scaledownunneededtime: "10m0s"
|
||||||
|
|
||||||
|
# overrides --scale-down-unready-time global value for that specific VM Scale Set
|
||||||
|
k8s.io_cluster-autoscaler_node-template_autoscaling-options_scaledownunreadytime: "20m0s"
|
||||||
|
```
|
||||||
|
|
||||||
## Deployment manifests
|
## Deployment manifests
|
||||||
|
|
||||||
Cluster autoscaler supports four Kubernetes cluster options on Azure:
|
Cluster autoscaler supports four Kubernetes cluster options on Azure:
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ limitations under the License.
|
||||||
package azure
|
package azure
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
@ -53,6 +54,7 @@ type azureCache struct {
|
||||||
registeredNodeGroups []cloudprovider.NodeGroup
|
registeredNodeGroups []cloudprovider.NodeGroup
|
||||||
instanceToNodeGroup map[azureRef]cloudprovider.NodeGroup
|
instanceToNodeGroup map[azureRef]cloudprovider.NodeGroup
|
||||||
unownedInstances map[azureRef]bool
|
unownedInstances map[azureRef]bool
|
||||||
|
autoscalingOptions map[azureRef]map[string]string
|
||||||
}
|
}
|
||||||
|
|
||||||
func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmType string) (*azureCache, error) {
|
func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmType string) (*azureCache, error) {
|
||||||
|
|
@ -67,6 +69,7 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmTy
|
||||||
registeredNodeGroups: make([]cloudprovider.NodeGroup, 0),
|
registeredNodeGroups: make([]cloudprovider.NodeGroup, 0),
|
||||||
instanceToNodeGroup: make(map[azureRef]cloudprovider.NodeGroup),
|
instanceToNodeGroup: make(map[azureRef]cloudprovider.NodeGroup),
|
||||||
unownedInstances: make(map[azureRef]bool),
|
unownedInstances: make(map[azureRef]bool),
|
||||||
|
autoscalingOptions: make(map[azureRef]map[string]string),
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := cache.regenerate(); err != nil {
|
if err := cache.regenerate(); err != nil {
|
||||||
|
|
@ -117,10 +120,22 @@ func (m *azureCache) regenerate() error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Regenerate VMSS to autoscaling options mapping.
|
||||||
|
newAutoscalingOptions := make(map[azureRef]map[string]string)
|
||||||
|
for _, vmss := range m.scaleSets {
|
||||||
|
ref := azureRef{Name: *vmss.Name}
|
||||||
|
options := extractAutoscalingOptionsFromScaleSetTags(vmss.Tags)
|
||||||
|
if !reflect.DeepEqual(m.getAutoscalingOptions(ref), options) {
|
||||||
|
klog.V(4).Infof("Extracted autoscaling options from %q ScaleSet tags: %v", *vmss.Name, options)
|
||||||
|
}
|
||||||
|
newAutoscalingOptions[ref] = options
|
||||||
|
}
|
||||||
|
|
||||||
m.mutex.Lock()
|
m.mutex.Lock()
|
||||||
defer m.mutex.Unlock()
|
defer m.mutex.Unlock()
|
||||||
|
|
||||||
m.instanceToNodeGroup = newInstanceToNodeGroupCache
|
m.instanceToNodeGroup = newInstanceToNodeGroupCache
|
||||||
|
m.autoscalingOptions = newAutoscalingOptions
|
||||||
|
|
||||||
// Reset unowned instances cache.
|
// Reset unowned instances cache.
|
||||||
m.unownedInstances = make(map[azureRef]bool)
|
m.unownedInstances = make(map[azureRef]bool)
|
||||||
|
|
@ -256,6 +271,13 @@ func (m *azureCache) getRegisteredNodeGroups() []cloudprovider.NodeGroup {
|
||||||
return m.registeredNodeGroups
|
return m.registeredNodeGroups
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *azureCache) getAutoscalingOptions(ref azureRef) map[string]string {
|
||||||
|
m.mutex.Lock()
|
||||||
|
defer m.mutex.Unlock()
|
||||||
|
|
||||||
|
return m.autoscalingOptions[ref]
|
||||||
|
}
|
||||||
|
|
||||||
// FindForInstance returns node group of the given Instance
|
// FindForInstance returns node group of the given Instance
|
||||||
func (m *azureCache) FindForInstance(instance *azureRef, vmType string) (cloudprovider.NodeGroup, error) {
|
func (m *azureCache) FindForInstance(instance *azureRef, vmType string) (cloudprovider.NodeGroup, error) {
|
||||||
m.mutex.Lock()
|
m.mutex.Lock()
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,7 @@ import (
|
||||||
|
|
||||||
"github.com/Azure/go-autorest/autorest/azure"
|
"github.com/Azure/go-autorest/autorest/azure"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
||||||
|
"k8s.io/autoscaler/cluster-autoscaler/config"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
|
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
|
||||||
klog "k8s.io/klog/v2"
|
klog "k8s.io/klog/v2"
|
||||||
)
|
)
|
||||||
|
|
@ -246,6 +247,29 @@ func (m *AzureManager) GetNodeGroupForInstance(instance *azureRef) (cloudprovide
|
||||||
return m.azureCache.FindForInstance(instance, m.config.VMType)
|
return m.azureCache.FindForInstance(instance, m.config.VMType)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetScaleSetOptions parse options extracted from VMSS tags and merges them with provided defaults
|
||||||
|
func (m *AzureManager) GetScaleSetOptions(scaleSetName string, defaults config.NodeGroupAutoscalingOptions) *config.NodeGroupAutoscalingOptions {
|
||||||
|
options := m.azureCache.getAutoscalingOptions(azureRef{Name: scaleSetName})
|
||||||
|
if options == nil || len(options) == 0 {
|
||||||
|
return &defaults
|
||||||
|
}
|
||||||
|
|
||||||
|
if opt, ok := getFloat64Option(options, scaleSetName, config.DefaultScaleDownUtilizationThresholdKey); ok {
|
||||||
|
defaults.ScaleDownUtilizationThreshold = opt
|
||||||
|
}
|
||||||
|
if opt, ok := getFloat64Option(options, scaleSetName, config.DefaultScaleDownGpuUtilizationThresholdKey); ok {
|
||||||
|
defaults.ScaleDownGpuUtilizationThreshold = opt
|
||||||
|
}
|
||||||
|
if opt, ok := getDurationOption(options, scaleSetName, config.DefaultScaleDownUnneededTimeKey); ok {
|
||||||
|
defaults.ScaleDownUnneededTime = opt
|
||||||
|
}
|
||||||
|
if opt, ok := getDurationOption(options, scaleSetName, config.DefaultScaleDownUnreadyTimeKey); ok {
|
||||||
|
defaults.ScaleDownUnreadyTime = opt
|
||||||
|
}
|
||||||
|
|
||||||
|
return &defaults
|
||||||
|
}
|
||||||
|
|
||||||
// Cleanup the cache.
|
// Cleanup the cache.
|
||||||
func (m *AzureManager) Cleanup() {
|
func (m *AzureManager) Cleanup() {
|
||||||
m.azureCache.Cleanup()
|
m.azureCache.Cleanup()
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,7 @@ import (
|
||||||
"github.com/golang/mock/gomock"
|
"github.com/golang/mock/gomock"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
||||||
|
"k8s.io/autoscaler/cluster-autoscaler/config"
|
||||||
azclients "k8s.io/legacy-cloud-providers/azure/clients"
|
azclients "k8s.io/legacy-cloud-providers/azure/clients"
|
||||||
"k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient"
|
"k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient"
|
||||||
"k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient"
|
"k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient"
|
||||||
|
|
@ -791,3 +792,47 @@ func TestManagerRefreshAndCleanup(t *testing.T) {
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
manager.Cleanup()
|
manager.Cleanup()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestGetScaleSetOptions(t *testing.T) {
|
||||||
|
manager := &AzureManager{
|
||||||
|
azureCache: &azureCache{
|
||||||
|
autoscalingOptions: make(map[azureRef]map[string]string),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
defaultOptions := config.NodeGroupAutoscalingOptions{
|
||||||
|
ScaleDownUtilizationThreshold: 0.1,
|
||||||
|
ScaleDownGpuUtilizationThreshold: 0.2,
|
||||||
|
ScaleDownUnneededTime: time.Second,
|
||||||
|
ScaleDownUnreadyTime: time.Minute,
|
||||||
|
}
|
||||||
|
|
||||||
|
tags := map[string]string{
|
||||||
|
config.DefaultScaleDownUtilizationThresholdKey: "0.2",
|
||||||
|
config.DefaultScaleDownGpuUtilizationThresholdKey: "0.3",
|
||||||
|
config.DefaultScaleDownUnneededTimeKey: "30m",
|
||||||
|
config.DefaultScaleDownUnreadyTimeKey: "1h",
|
||||||
|
}
|
||||||
|
manager.azureCache.autoscalingOptions[azureRef{Name: "test1"}] = tags
|
||||||
|
opts := manager.GetScaleSetOptions("test1", defaultOptions)
|
||||||
|
assert.Equal(t, opts.ScaleDownUtilizationThreshold, 0.2)
|
||||||
|
assert.Equal(t, opts.ScaleDownGpuUtilizationThreshold, 0.3)
|
||||||
|
assert.Equal(t, opts.ScaleDownUnneededTime, 30*time.Minute)
|
||||||
|
assert.Equal(t, opts.ScaleDownUnreadyTime, time.Hour)
|
||||||
|
|
||||||
|
tags = map[string]string{
|
||||||
|
//config.DefaultScaleDownUtilizationThresholdKey: ... // not specified (-> default)
|
||||||
|
config.DefaultScaleDownGpuUtilizationThresholdKey: "not-a-float",
|
||||||
|
config.DefaultScaleDownUnneededTimeKey: "1m",
|
||||||
|
config.DefaultScaleDownUnreadyTimeKey: "not-a-duration",
|
||||||
|
}
|
||||||
|
manager.azureCache.autoscalingOptions[azureRef{Name: "test2"}] = tags
|
||||||
|
opts = manager.GetScaleSetOptions("test2", defaultOptions)
|
||||||
|
assert.Equal(t, opts.ScaleDownUtilizationThreshold, defaultOptions.ScaleDownUtilizationThreshold)
|
||||||
|
assert.Equal(t, opts.ScaleDownGpuUtilizationThreshold, defaultOptions.ScaleDownGpuUtilizationThreshold)
|
||||||
|
assert.Equal(t, opts.ScaleDownUnneededTime, time.Minute)
|
||||||
|
assert.Equal(t, opts.ScaleDownUnreadyTime, defaultOptions.ScaleDownUnreadyTime)
|
||||||
|
|
||||||
|
manager.azureCache.autoscalingOptions[azureRef{Name: "test3"}] = map[string]string{}
|
||||||
|
opts = manager.GetScaleSetOptions("test3", defaultOptions)
|
||||||
|
assert.Equal(t, *opts, defaultOptions)
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -116,7 +116,11 @@ func (scaleSet *ScaleSet) Autoprovisioned() bool {
|
||||||
// GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular
|
// GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular
|
||||||
// NodeGroup. Returning a nil will result in using default options.
|
// NodeGroup. Returning a nil will result in using default options.
|
||||||
func (scaleSet *ScaleSet) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
|
func (scaleSet *ScaleSet) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
|
||||||
return nil, cloudprovider.ErrNotImplemented
|
template, err := scaleSet.getVMSSFromCache()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err.Error()
|
||||||
|
}
|
||||||
|
return scaleSet.manager.GetScaleSetOptions(*template.Name, defaults), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// MaxSize returns maximum size of the node group.
|
// MaxSize returns maximum size of the node group.
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,9 @@ import (
|
||||||
"k8s.io/klog/v2"
|
"k8s.io/klog/v2"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func buildInstanceOS(template compute.VirtualMachineScaleSet) string {
|
func buildInstanceOS(template compute.VirtualMachineScaleSet) string {
|
||||||
|
|
@ -183,6 +185,53 @@ func extractTaintsFromScaleSet(tags map[string]*string) []apiv1.Taint {
|
||||||
return taints
|
return taints
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func extractAutoscalingOptionsFromScaleSetTags(tags map[string]*string) map[string]string {
|
||||||
|
options := make(map[string]string)
|
||||||
|
for tagName, tagValue := range tags {
|
||||||
|
if !strings.HasPrefix(tagName, nodeOptionsTagName) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
resourceName := strings.Split(tagName, nodeOptionsTagName)
|
||||||
|
if len(resourceName) < 2 || resourceName[1] == "" || tagValue == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
options[resourceName[1]] = strings.ToLower(*tagValue)
|
||||||
|
}
|
||||||
|
return options
|
||||||
|
}
|
||||||
|
|
||||||
|
func getFloat64Option(options map[string]string, vmssName, name string) (float64, bool) {
|
||||||
|
raw, ok := options[strings.ToLower(name)]
|
||||||
|
if !ok {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
option, err := strconv.ParseFloat(raw, 64)
|
||||||
|
if err != nil {
|
||||||
|
klog.Warningf("failed to convert VMSS %q tag %s_%s value %q to float: %v",
|
||||||
|
vmssName, nodeOptionsTagName, name, raw, err)
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
return option, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func getDurationOption(options map[string]string, vmssName, name string) (time.Duration, bool) {
|
||||||
|
raw, ok := options[strings.ToLower(name)]
|
||||||
|
if !ok {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
option, err := time.ParseDuration(raw)
|
||||||
|
if err != nil {
|
||||||
|
klog.Warningf("failed to convert VMSS %q tag %s_%s value %q to duration: %v",
|
||||||
|
vmssName, nodeOptionsTagName, name, raw, err)
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
return option, true
|
||||||
|
}
|
||||||
|
|
||||||
func extractAllocatableResourcesFromScaleSet(tags map[string]*string) map[string]*resource.Quantity {
|
func extractAllocatableResourcesFromScaleSet(tags map[string]*string) map[string]*resource.Quantity {
|
||||||
resources := make(map[string]*resource.Quantity)
|
resources := make(map[string]*resource.Quantity)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -81,6 +81,7 @@ const (
|
||||||
nodeLabelTagName = "k8s.io_cluster-autoscaler_node-template_label_"
|
nodeLabelTagName = "k8s.io_cluster-autoscaler_node-template_label_"
|
||||||
nodeTaintTagName = "k8s.io_cluster-autoscaler_node-template_taint_"
|
nodeTaintTagName = "k8s.io_cluster-autoscaler_node-template_taint_"
|
||||||
nodeResourcesTagName = "k8s.io_cluster-autoscaler_node-template_resources_"
|
nodeResourcesTagName = "k8s.io_cluster-autoscaler_node-template_resources_"
|
||||||
|
nodeOptionsTagName = "k8s.io_cluster-autoscaler_node-template_autoscaling-options_"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue