autoscaler/cluster-autoscaler/cloudprovider/azure/azure_template.go

434 lines
15 KiB
Go

/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package azure
import (
"fmt"
"math/rand"
"regexp"
"strconv"
"strings"
"time"
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/klog/v2"
kubeletapis "k8s.io/kubelet/pkg/apis"
)
const (
// AKSLabelPrefixValue represents the constant prefix for AKSLabelKeyPrefixValue
AKSLabelPrefixValue = "kubernetes.azure.com"
// AKSLabelKeyPrefixValue represents the prefix for AKS label keys
// (AKSLabelPrefixValue followed by "/").
AKSLabelKeyPrefixValue = AKSLabelPrefixValue + "/"
// azureDiskTopologyKey is the zone label key that buildGenericLabels sets
// alongside the standard Kubernetes zone labels.
azureDiskTopologyKey = "topology.disk.csi.azure.com/zone"
// For NP-series SKUs, the Xilinx device plugin uses this resource name.
// https://github.com/Xilinx/FPGA_as_a_Service/tree/master/k8s-fpga-device-plugin
xilinxFpgaResourceName = "xilinx.com/fpga-xilinx_u250_gen3x16_xdma_shell_2_1-0"
// legacyPoolNameTag is the legacy tag that AKS adds to the VMSS, with its value
// being the agentpool name.
legacyPoolNameTag = "poolName"
// poolNameTag is the new tag that replaces legacyPoolNameTag.
// Newly created pools and clusters will have this one on the VMSS
// instead of the legacy one. We'll have to live with both tags for a while.
poolNameTag = "aks-managed-poolName"
// legacyAgentPoolNodeLabelKey is the legacy label added by agentbaker
// (agentpool={poolName}). We want to predict that a node added to this
// agentpool will carry it as a node label. The value is fetched from the
// VMSS tag with key poolNameTag/legacyPoolNameTag.
legacyAgentPoolNodeLabelKey = "agentpool"
// agentPoolNodeLabelKey is the new, AKS-prefixed label that replaces
// legacyAgentPoolNodeLabelKey.
agentPoolNodeLabelKey = AKSLabelKeyPrefixValue + "agentpool"
// Storage profile node labels (legacy and AKS-prefixed).
legacyStorageProfileNodeLabelKey = "storageprofile"
storageProfileNodeLabelKey = AKSLabelKeyPrefixValue + "storageprofile"
// Storage tier node labels (legacy and AKS-prefixed).
legacyStorageTierNodeLabelKey = "storagetier"
storageTierNodeLabelKey = AKSLabelKeyPrefixValue + "storagetier"
// FIPS node label
fipsNodeLabelKey = AKSLabelKeyPrefixValue + "fips_enabled"
// OS SKU node label
osSkuLabelKey = AKSLabelKeyPrefixValue + "os-sku"
// Security node labels
securityTypeLabelKey = AKSLabelKeyPrefixValue + "security-type"
customCATrustEnabledLabelKey = AKSLabelKeyPrefixValue + "custom-ca-trust-enabled"
kataMshvVMIsolationLabelKey = AKSLabelKeyPrefixValue + "kata-mshv-vm-isolation"
// Cluster node label
clusterLabelKey = AKSLabelKeyPrefixValue + "cluster"
)
// buildNodeFromTemplate builds an in-memory Node object that models a node of
// the scale set described by template.
//
// The node name is nodeGroupName plus a random suffix. CPU, GPU/FPGA and memory
// capacity come from the SKU lookup: the dynamic SKU API is used when
// enableDynamicInstanceList is true, and the static SKU list is used when the
// dynamic lookup is disabled or fails. Labels are assembled from the VMSS tags,
// the generic labels, and either inputLabels (when non-empty) or the labels
// encoded in the VMSS tags. Taints come from inputTaints when non-empty,
// otherwise from the VMSS tags.
//
// inputLabels is never modified; the function works on a private copy.
//
// An error is returned when the template carries no SKU name or when the SKU
// cannot be resolved by the static lookup.
func buildNodeFromTemplate(nodeGroupName string, inputLabels map[string]string, inputTaints string,
	template compute.VirtualMachineScaleSet, manager *AzureManager, enableDynamicInstanceList bool) (*apiv1.Node, error) {
	// The SKU name is dereferenced throughout; fail with an error instead of a
	// nil-pointer panic when the template does not provide it.
	if template.Sku == nil || template.Sku.Name == nil {
		return nil, fmt.Errorf("scale set template for node group %q has no SKU name", nodeGroupName)
	}
	skuName := *template.Sku.Name

	node := apiv1.Node{}
	nodeName := fmt.Sprintf("%s-asg-%d", nodeGroupName, rand.Int63())

	node.ObjectMeta = metav1.ObjectMeta{
		Name:     nodeName,
		SelfLink: fmt.Sprintf("/api/v1/nodes/%s", nodeName),
		Labels:   map[string]string{},
	}

	node.Status = apiv1.NodeStatus{
		Capacity: apiv1.ResourceList{},
	}

	var vcpu, gpuCount, memoryMb int64

	// Fetching SKU information from SKU API if enableDynamicInstanceList is true.
	var dynamicErr error
	if enableDynamicInstanceList {
		var vmssTypeDynamic InstanceType
		klog.V(1).Infof("Fetching instance information for SKU: %s from SKU API", skuName)
		vmssTypeDynamic, dynamicErr = GetVMSSTypeDynamically(template, manager.azureCache)
		if dynamicErr == nil {
			vcpu = vmssTypeDynamic.VCPU
			gpuCount = vmssTypeDynamic.GPU
			memoryMb = vmssTypeDynamic.MemoryMb
		} else {
			klog.Errorf("Dynamically fetching of instance information from SKU api failed with error: %v", dynamicErr)
		}
	}
	if !enableDynamicInstanceList || dynamicErr != nil {
		klog.V(1).Infof("Falling back to static SKU list for SKU: %s", skuName)
		// Fall back on the static list of VMSS types if the dynamic workflow fails.
		vmssTypeStatic, staticErr := GetVMSSTypeStatically(template)
		if staticErr != nil {
			// Return an error if neither workflow yields VMSS data.
			klog.V(1).Infof("Instance type %q not supported, err: %v", skuName, staticErr)
			return nil, staticErr
		}
		vcpu = vmssTypeStatic.VCPU
		gpuCount = vmssTypeStatic.GPU
		memoryMb = vmssTypeStatic.MemoryMb
	}

	node.Status.Capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
	node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(vcpu, resource.DecimalSI)
	// The SKU API reports GPUs for NP-series SKUs, but they are actually FPGAs.
	if isNPSeries(skuName) {
		node.Status.Capacity[xilinxFpgaResourceName] = *resource.NewQuantity(gpuCount, resource.DecimalSI)
	} else {
		node.Status.Capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(gpuCount, resource.DecimalSI)
	}
	node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(memoryMb*1024*1024, resource.DecimalSI)

	// TODO: set real allocatable.
	// Allocatable shares the Capacity map, so entries written to Capacity below
	// are visible through Allocatable as well.
	node.Status.Allocatable = node.Status.Capacity

	// NodeLabels: every VMSS tag becomes a label; a nil tag value maps to "".
	for k, v := range template.Tags {
		if v != nil {
			node.Labels[k] = *v
		} else {
			node.Labels[k] = ""
		}
	}

	// GenericLabels
	node.Labels = cloudprovider.JoinStringMaps(node.Labels, buildGenericLabels(template, nodeName))

	// Prefer the explicit labels in spec coming from RP over the VMSS template.
	// The input map is copied because more keys are added to labels below and
	// the caller's map must not be modified.
	var labels map[string]string
	if len(inputLabels) > 0 {
		labels = make(map[string]string, len(inputLabels))
		for k, v := range inputLabels {
			labels[k] = v
		}
	} else {
		// Labels from the Scale Set's Tags
		labels = extractLabelsFromScaleSet(template.Tags)
	}

	// Add the agentpool label, its value should come from the VMSS poolName tag
	// NOTE: The plan is for agentpool label to be deprecated in favor of the aks-prefixed one
	// We will have to live with both labels for a while
	if node.Labels[legacyPoolNameTag] != "" {
		labels[legacyAgentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag]
		labels[agentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag]
	}
	// The new tag is checked last so that it wins when both tags are present.
	if node.Labels[poolNameTag] != "" {
		labels[legacyAgentPoolNodeLabelKey] = node.Labels[poolNameTag]
		labels[agentPoolNodeLabelKey] = node.Labels[poolNameTag]
	}

	// Add the storage profile and storage tier labels
	if template.VirtualMachineProfile != nil && template.VirtualMachineProfile.StorageProfile != nil && template.VirtualMachineProfile.StorageProfile.OsDisk != nil {
		osDisk := template.VirtualMachineProfile.StorageProfile.OsDisk
		// A local diff-disk setting marks the OS disk as ephemeral; anything else is managed.
		if osDisk.DiffDiskSettings != nil && osDisk.DiffDiskSettings.Option == compute.Local {
			labels[legacyStorageProfileNodeLabelKey] = "ephemeral"
			labels[storageProfileNodeLabelKey] = "ephemeral"
		} else {
			labels[legacyStorageProfileNodeLabelKey] = "managed"
			labels[storageProfileNodeLabelKey] = "managed"
		}
		if osDisk.ManagedDisk != nil {
			labels[legacyStorageTierNodeLabelKey] = string(osDisk.ManagedDisk.StorageAccountType)
			labels[storageTierNodeLabelKey] = string(osDisk.ManagedDisk.StorageAccountType)
		}
		// Add ephemeral-storage value
		if osDisk.DiskSizeGB != nil {
			// Widen to int64 before multiplying so the byte count cannot
			// overflow on platforms where int is 32 bits wide.
			diskBytes := int64(*osDisk.DiskSizeGB) * 1024 * 1024 * 1024
			node.Status.Capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(diskBytes, resource.DecimalSI)
			klog.V(4).Infof("OS Disk Size from template is: %d", *osDisk.DiskSizeGB)
			klog.V(4).Infof("Setting ephemeral storage to: %v", node.Status.Capacity[apiv1.ResourceEphemeralStorage])
		}
	}

	// If we are on GPU-enabled SKUs, append the accelerator
	// label so that CA makes better decision when scaling from zero for GPU pools
	if isNvidiaEnabledSKU(skuName) {
		labels[GPULabel] = "nvidia"
		labels[legacyGPULabel] = "nvidia"
	}

	// Extract allocatables from tags; these override the SKU-derived values.
	resourcesFromTags := extractAllocatableResourcesFromScaleSet(template.Tags)
	for resourceName, val := range resourcesFromTags {
		node.Status.Capacity[apiv1.ResourceName(resourceName)] = *val
	}

	node.Labels = cloudprovider.JoinStringMaps(node.Labels, labels)
	klog.V(4).Infof("Setting node %s labels to: %s", nodeName, node.Labels)

	var taints []apiv1.Taint
	// Prefer the explicit taints in spec over the VMSS template
	if inputTaints != "" {
		taints = extractTaintsFromSpecString(inputTaints)
	} else {
		// Taints from the Scale Set's Tags
		taints = extractTaintsFromScaleSet(template.Tags)
	}
	node.Spec.Taints = taints
	klog.V(4).Infof("Setting node %s taints to: %s", nodeName, node.Spec.Taints)

	node.Status.Conditions = cloudprovider.BuildReadyConditions()
	return &node, nil
}
// buildInstanceOS returns the OS label value for the scale set: "windows" when
// the VM profile's OS profile carries a Windows configuration, otherwise
// cloudprovider.DefaultOS.
func buildInstanceOS(template compute.VirtualMachineScaleSet) string {
	vmProfile := template.VirtualMachineProfile
	if vmProfile == nil || vmProfile.OsProfile == nil {
		return cloudprovider.DefaultOS
	}
	if vmProfile.OsProfile.WindowsConfiguration == nil {
		return cloudprovider.DefaultOS
	}
	return "windows"
}
// buildGenericLabels returns the architecture, OS, instance type, region, zone
// and hostname labels for a template node named nodeName.
//
// template.Sku.Name and template.Location are dereferenced unconditionally, so
// both must be non-nil.
func buildGenericLabels(template compute.VirtualMachineScaleSet, nodeName string) map[string]string {
	osName := buildInstanceOS(template)
	skuName := *template.Sku.Name
	region := strings.ToLower(*template.Location)

	// Defaults used when the scale set is not zonal.
	zone, diskZone := "0", ""
	if template.Zones != nil && len(*template.Zones) > 0 {
		// Picks a random zone for a multi-zone nodepool when scaling from zero.
		// This zone is not necessarily the zone the new VM will land in; the
		// template only needs some zone so that scaling from zero can start on
		// a multi-zone nodepool. Pod affinity that pins an exact zone will not
		// work with this logic, so customers are discouraged from using
		// podAffinity to pick availability zones for now.
		zones := *template.Zones
		zone = region + "-" + zones[rand.Intn(len(zones))]
		diskZone = zone
	}

	labels := make(map[string]string)

	labels[kubeletapis.LabelArch] = cloudprovider.DefaultArch
	labels[apiv1.LabelArchStable] = cloudprovider.DefaultArch

	labels[kubeletapis.LabelOS] = osName
	labels[apiv1.LabelOSStable] = osName

	labels[apiv1.LabelInstanceType] = skuName
	labels[apiv1.LabelInstanceTypeStable] = skuName

	labels[apiv1.LabelZoneRegion] = region
	labels[apiv1.LabelTopologyRegion] = region

	labels[apiv1.LabelZoneFailureDomain] = zone
	labels[apiv1.LabelTopologyZone] = zone
	labels[azureDiskTopologyKey] = diskZone

	labels[apiv1.LabelHostname] = nodeName
	return labels
}
// extractLabelsFromScaleSet decodes node labels from VMSS tags.
//
// A tag contributes a label when its name contains nodeLabelTagName; the label
// key is the text following the first occurrence of that marker, with "_"
// decoded to "/" and then "~2" decoded to "_". Tags whose decoded key is empty
// are ignored. A nil tag value yields an empty label value, which matches how
// buildNodeFromTemplate treats nil tag values.
func extractLabelsFromScaleSet(tags map[string]*string) map[string]string {
	result := make(map[string]string)

	for tagName, tagValue := range tags {
		splits := strings.Split(tagName, nodeLabelTagName)
		if len(splits) <= 1 {
			continue
		}

		label := strings.ReplaceAll(splits[1], "_", "/")
		label = strings.ReplaceAll(label, "~2", "_")
		if label == "" {
			continue
		}

		// Azure tag values may be nil; do not dereference them blindly.
		if tagValue == nil {
			result[label] = ""
			continue
		}
		result[label] = *tagValue
	}

	return result
}
// scaleSetTaintValuePattern recognises VMSS tag values of the form
// <value>:<effect>, where effect is NoSchedule, NoExecute or PreferNoSchedule.
// It is compiled once at package initialisation instead of once per tag.
var scaleSetTaintValuePattern = regexp.MustCompile("(.*):(?:NoSchedule|NoExecute|PreferNoSchedule)")

// extractTaintsFromScaleSet decodes node taints from VMSS tags.
//
// A tag contributes a taint when its value matches scaleSetTaintValuePattern
// and its name contains nodeTaintTagName. The taint key is the text following
// the first occurrence of that marker, with "_" decoded to "/" and then "~2"
// decoded to "_". The tag value is split at its first ":" into the taint value
// and the taint effect. Tags with a nil value are skipped because they cannot
// carry an effect. The returned slice is never nil.
func extractTaintsFromScaleSet(tags map[string]*string) []apiv1.Taint {
	taints := make([]apiv1.Taint, 0)

	for tagName, tagValue := range tags {
		// The tag value must be in the format <tag>:NoSchedule
		if tagValue == nil || !scaleSetTaintValuePattern.MatchString(*tagValue) {
			continue
		}

		splits := strings.Split(tagName, nodeTaintTagName)
		if len(splits) <= 1 {
			continue
		}

		// A matching value always contains ":", so SplitN yields two parts.
		values := strings.SplitN(*tagValue, ":", 2)
		if len(values) <= 1 {
			continue
		}

		taintKey := strings.ReplaceAll(splits[1], "_", "/")
		taintKey = strings.ReplaceAll(taintKey, "~2", "_")
		taints = append(taints, apiv1.Taint{
			Key:    taintKey,
			Value:  values[0],
			Effect: apiv1.TaintEffect(values[1]),
		})
	}

	return taints
}
// specTaintValuePattern recognises the "<value>:<effect>" part of a taint
// spec entry, where effect is NoSchedule, NoExecute or PreferNoSchedule.
// It is compiled once at package initialisation instead of once per entry.
var specTaintValuePattern = regexp.MustCompile("(.*):(?:NoSchedule|NoExecute|PreferNoSchedule)")

// extractTaintsFromSpecString parses a comma-separated taints string into
// taints. The format is the same as the argument to kubelet's
// `--register-with-taints`, for example:
//
//	"dedicated=foo:NoSchedule,group=bar:NoExecute,app=fizz:PreferNoSchedule"
//
// Entries that do not contain exactly one "=", or whose right-hand side does
// not match specTaintValuePattern, are skipped. The returned slice is never nil.
func extractTaintsFromSpecString(taintsString string) []apiv1.Taint {
	taints := make([]apiv1.Taint, 0)

	// First split the taints at the separator
	for _, split := range strings.Split(taintsString, ",") {
		taintSplit := strings.Split(split, "=")
		if len(taintSplit) != 2 {
			continue
		}

		taintKey := taintSplit[0]
		taintValue := taintSplit[1]
		if !specTaintValuePattern.MatchString(taintValue) {
			continue
		}

		// A matching value always contains ":", so both parts are present.
		values := strings.SplitN(taintValue, ":", 2)
		taints = append(taints, apiv1.Taint{
			Key:    taintKey,
			Value:  values[0],
			Effect: apiv1.TaintEffect(values[1]),
		})
	}

	return taints
}
// extractAutoscalingOptionsFromScaleSetTags collects autoscaling options from
// VMSS tags whose name starts with nodeOptionsTagName.
//
// The option name is the text between the prefix and the next occurrence of
// nodeOptionsTagName (or the end of the tag name); the option value is the
// lower-cased tag value. Tags with an empty option name or a nil value are
// ignored.
func extractAutoscalingOptionsFromScaleSetTags(tags map[string]*string) map[string]string {
	result := map[string]string{}

	for key, value := range tags {
		if value == nil || !strings.HasPrefix(key, nodeOptionsTagName) {
			continue
		}

		parts := strings.Split(key, nodeOptionsTagName)
		if len(parts) >= 2 && parts[1] != "" {
			result[parts[1]] = strings.ToLower(*value)
		}
	}

	return result
}
// getFloat64Option looks up the option called name (lower-cased for the
// lookup) and parses it as a float64. The boolean result is false when the
// option is absent or cannot be parsed; a parse failure is also logged as a
// warning that mentions vmssName.
func getFloat64Option(options map[string]string, vmssName, name string) (float64, bool) {
	text, found := options[strings.ToLower(name)]
	if !found {
		return 0, false
	}

	value, parseErr := strconv.ParseFloat(text, 64)
	if parseErr == nil {
		return value, true
	}

	klog.Warningf("failed to convert VMSS %q tag %s_%s value %q to float: %v",
		vmssName, nodeOptionsTagName, name, text, parseErr)
	return 0, false
}
// getDurationOption looks up the option called name (lower-cased for the
// lookup) and parses it with time.ParseDuration. The boolean result is false
// when the option is absent or cannot be parsed; a parse failure is also
// logged as a warning that mentions vmssName.
func getDurationOption(options map[string]string, vmssName, name string) (time.Duration, bool) {
	text, found := options[strings.ToLower(name)]
	if !found {
		return 0, false
	}

	value, parseErr := time.ParseDuration(text)
	if parseErr == nil {
		return value, true
	}

	klog.Warningf("failed to convert VMSS %q tag %s_%s value %q to duration: %v",
		vmssName, nodeOptionsTagName, name, text, parseErr)
	return 0, false
}
// extractAllocatableResourcesFromScaleSet decodes resource quantities from
// VMSS tags.
//
// A tag contributes a resource when its name contains nodeResourcesTagName
// followed by a non-empty resource name and its value parses as a
// resource.Quantity. The resource name is decoded the same way as label and
// taint keys: "_" becomes "/" and then "~2" becomes "_", which makes resource
// names containing underscores expressible. Tags with a nil or unparsable
// value are skipped.
func extractAllocatableResourcesFromScaleSet(tags map[string]*string) map[string]*resource.Quantity {
	resources := make(map[string]*resource.Quantity)

	for tagName, tagValue := range tags {
		resourceName := strings.Split(tagName, nodeResourcesTagName)
		if len(resourceName) < 2 || resourceName[1] == "" {
			continue
		}
		// Azure tag values may be nil; there is no quantity to parse then.
		if tagValue == nil {
			continue
		}

		normalizedResourceName := strings.ReplaceAll(resourceName[1], "_", "/")
		// "~2" is the escape for a literal underscore, as in the label and
		// taint decoders.
		normalizedResourceName = strings.ReplaceAll(normalizedResourceName, "~2", "_")

		quantity, err := resource.ParseQuantity(*tagValue)
		if err != nil {
			// Best effort: tags that do not hold a valid quantity are ignored.
			continue
		}
		resources[normalizedResourceName] = &quantity
	}

	return resources
}
// isNPSeries reports whether name denotes an NP-series SKU, i.e. whether its
// lower-cased form starts with "standard_np".
// The SKU API reports GPUs for NP-series SKUs, but they are actually FPGAs.
func isNPSeries(name string) bool {
	const npSeriesPrefix = "standard_np"

	lowered := strings.ToLower(name)
	return strings.HasPrefix(lowered, npSeriesPrefix)
}