autoscaler/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud_manager.go

632 lines
19 KiB
Go

/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tencentcloud
import (
"encoding/json"
"fmt"
"io"
"math/rand"
"os"
"sync"
"time"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
as "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud-sdk-go/tencentcloud/as/v20180419"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud-sdk-go/tencentcloud/common"
cvm "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud-sdk-go/tencentcloud/cvm/v20170312"
tke "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud-sdk-go/tencentcloud/tke/v20180525"
vpc "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/tencentcloud/tencentcloud-sdk-go/tencentcloud/vpc/v20170312"
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)
// Package-level settings for the Tencent Cloud manager.
const (
// retryCountStop and intervalTimeStop are not referenced in this part of the file;
// presumably a retry budget and the pause between retries — TODO confirm against callers.
retryCountStop = 5
intervalTimeStop = 5 * time.Second
// tokenExpiredTime is not referenced in this part of the file;
// presumably a token lifetime in seconds — TODO confirm.
tokenExpiredTime = 7200
// serviceName is not referenced in this part of the file.
serviceName = "cluster-autoscaler"
// refreshInterval is the minimum time that must pass after a forced refresh
// before Refresh triggers another one.
refreshInterval = 1 * time.Minute
// scaleToZeroSupported is handed to dynamic.SpecFromString when node group
// specs are parsed.
scaleToZeroSupported = true
)
// Names of the network extended resources that are added to a template
// node's capacity when the corresponding template value is positive.
const (
// TKERouteENIIP is the resource name for the route-ENI IP count.
TKERouteENIIP = "tke.cloud.tencent.com/eni-ip"
// TKEDirectENI is the resource name for the direct-ENI count.
TKEDirectENI = "tke.cloud.tencent.com/direct-eni"
)
// Names of the vcuda extended resources that are added to a template node's
// capacity when the instance template has at least one GPU.
const (
// VCudaCore is the resource name for vcuda cores.
VCudaCore = "tencent.com/vcuda-core"
// VCudaMemory is the resource name for vcuda memory.
VCudaMemory = "tencent.com/vcuda-memory"
)
// GPUMemoryMap maps an instance family name to the coefficient used to derive
// the vcuda-memory extended resource. GetAsgInstanceTemplate multiplies the
// GPU count by this coefficient and by 4, and uses 24 for families that are
// missing from the map.
// NOTE(review): the values look like per-GPU memory in GiB — confirm.
var (
GPUMemoryMap = map[string]int64{
"GN10X": 32,
"GN10S": 16,
"GN10": 16,
"GN8": 24,
"GN7": 16,
"GN6": 8,
"GN6S": 8,
"GN2": 24,
"GN7vw": 16,
"GC1": 11,
}
)
// TencentcloudManager handles Tencent Cloud communication and data caching.
type TencentcloudManager interface {
// Refresh triggers refresh of cached resources.
Refresh() error
// Cleanup cleans up open resources before the cloud provider is destroyed, i.e. go routines etc.
Cleanup() error
// RegisterAsg registers the given Asg with the manager.
RegisterAsg(asg Asg)
// GetAsgs returns list of registered Asgs.
GetAsgs() []Asg
// GetAsgNodes returns Asg nodes.
GetAsgNodes(Asg Asg) ([]cloudprovider.Instance, error)
// GetAsgForInstance returns Asg to which the given instance belongs.
GetAsgForInstance(instance TcRef) (Asg, error)
// GetAsgTemplateNode returns a template node for Asg.
GetAsgTemplateNode(Asg Asg) (*apiv1.Node, error)
// GetResourceLimiter returns resource limiter.
GetResourceLimiter() (*cloudprovider.ResourceLimiter, error)
// GetAsgSize gets Asg size.
GetAsgSize(Asg Asg) (int64, error)
// SetAsgSize sets Asg size.
SetAsgSize(Asg Asg, size int64) error
// DeleteInstances deletes the given instances. All instances must be controlled by the same Asg.
DeleteInstances(instances []TcRef) error
}
// tencentcloudManagerImpl is the manager implementation built by
// CreateTencentcloudManager.
type tencentcloudManagerImpl struct {
// mutex is held for the full duration of GetAsgInstanceTemplate.
mutex sync.Mutex
// lastRefresh is the time of the most recent forceRefresh; Refresh uses it
// to skip refreshes that come too soon.
lastRefresh time.Time
// cloudService performs the Tencent Cloud API calls.
cloudService CloudService
// cache stores registered Asgs, their target sizes and instance templates.
cache *TencentcloudCache
// regional is stored from the constructor argument; it is not read in this
// part of the file.
regional bool
// explicitlyConfigured marks the Asgs that were built from node group specs
// in fetchExplicitAsgs.
explicitlyConfigured map[TcRef]bool
// interrupt is the stop channel passed to the periodic instance-cache
// regeneration loop started by the constructor.
interrupt chan struct{}
}
// CloudConfig represents the Tencent Cloud configuration used by the manager.
//
// The tagged fields are decoded from the JSON cloud config. The untagged
// fields carry no explicit JSON name, so encoding/json matches them by their
// Go field name.
type CloudConfig struct {
	// Region is the region identifier; it is copied into template node labels.
	Region string `json:"region"`
	// RegionName is the region string handed to the SDK clients.
	RegionName string `json:"regionName"`
	// Zone is decoded from the config file.
	Zone string `json:"zone"`
	// DryRun selects the mock cloud service instead of the real one.
	// The tag value must be quoted; an unquoted tag is silently ignored by
	// reflection-based consumers.
	DryRun bool `json:"dryRun"`
	// SecretID and SecretKey are the API credentials.
	SecretID  string
	SecretKey string
	// ClusterID identifies the cluster whose node pools are queried.
	ClusterID string
	// IsTest marks a test environment.
	IsTest bool
}
// LabelAutoScalingGroupID is the label key under which an instance template
// records the ID of its auto scaling group.
const LabelAutoScalingGroupID = "cloud.tencent.com/auto-scaling-group-id"
// cloudConfig is the package-wide configuration filled in by readCloudConfig.
var cloudConfig CloudConfig
// readCloudConfig populates the package-level cloudConfig.
//
// It first decodes the JSON document from configReader, then applies the
// environment: TEST_ENV=true and DRY_RUN=true switch on IsTest and DryRun,
// while SECRET_ID, SECRET_KEY, REGION, REGION_NAME and CLUSTER_ID are
// mandatory and override whatever the file contained.
//
// It returns an error when configReader is nil, when the JSON cannot be
// decoded, or when any mandatory environment variable is empty.
func readCloudConfig(configReader io.Reader) error {
	if configReader == nil {
		return fmt.Errorf("tencentcloud cloud config is not exists")
	}

	if err := json.NewDecoder(configReader).Decode(&cloudConfig); err != nil {
		return err
	}

	testEnv := os.Getenv("TEST_ENV")
	if testEnv == "true" {
		cloudConfig.IsTest = true
	}

	dryRun := os.Getenv("DRY_RUN")
	if dryRun == "true" {
		cloudConfig.DryRun = true
	}

	secretID := os.Getenv("SECRET_ID")
	secretKey := os.Getenv("SECRET_KEY")
	region := os.Getenv("REGION")
	regionName := os.Getenv("REGION_NAME")
	clusterID := os.Getenv("CLUSTER_ID")

	if secretID == "" {
		return fmt.Errorf("please specify the environment variable: SECRET_ID")
	}
	if secretKey == "" {
		return fmt.Errorf("please specify the environment variable: SECRET_KEY")
	}
	if region == "" {
		return fmt.Errorf("please specify the environment variable: REGION")
	}
	if regionName == "" {
		return fmt.Errorf("please specify the environment variable: REGION_NAME")
	}
	if clusterID == "" {
		return fmt.Errorf("please specify the environment variable: CLUSTER_ID")
	}

	cloudConfig.SecretID = secretID
	cloudConfig.SecretKey = secretKey
	cloudConfig.Region = region
	cloudConfig.RegionName = regionName
	cloudConfig.ClusterID = clusterID

	// Log a copy with the credentials masked so the API secrets never reach
	// the log output, whatever the verbosity.
	redacted := cloudConfig
	redacted.SecretID = "***"
	redacted.SecretKey = "***"
	klog.V(4).Infof("tencentcloud config %+v", redacted)

	return nil
}
// CreateTencentcloudManager builds a ready-to-use TencentcloudManager.
//
// It loads the cloud configuration from configReader, creates the CVM, VPC, AS
// and TKE SDK clients, wires them into either the mock service (dry run) or
// the real one, registers the node groups listed in discoveryOpts, and starts
// an hourly background loop that regenerates the instances cache until the
// manager's interrupt channel is closed.
//
// Any failure while reading the configuration, creating a client or fetching
// the explicit ASGs is returned and no manager is created.
func CreateTencentcloudManager(configReader io.Reader, discoveryOpts cloudprovider.NodeGroupDiscoveryOptions, regional bool) (TencentcloudManager, error) {
	if err := readCloudConfig(configReader); err != nil {
		return nil, err
	}

	cred := common.NewCredential(cloudConfig.SecretID, cloudConfig.SecretKey)

	cvmCli, err := cvm.NewClient(cred, cloudConfig.RegionName, newCVMClientProfile())
	if err != nil {
		return nil, err
	}
	vpcCli, err := vpc.NewClient(cred, cloudConfig.RegionName, newVPCClientProfile())
	if err != nil {
		return nil, err
	}
	asCli, err := as.NewClient(cred, cloudConfig.RegionName, newASClientProfile())
	if err != nil {
		return nil, err
	}
	tkeCli, err := tke.NewClient(cred, cloudConfig.RegionName, newTKEClientProfile())
	if err != nil {
		return nil, err
	}

	// Dry-run mode swaps in the mock service; everything else is identical.
	var svc CloudService
	switch {
	case cloudConfig.DryRun:
		svc = NewCloudMockService(cvmCli, vpcCli, asCli, tkeCli)
	default:
		svc = NewCloudService(cvmCli, vpcCli, asCli, tkeCli)
	}

	mgr := &tencentcloudManagerImpl{
		cloudService:         svc,
		cache:                NewTencentcloudCache(svc),
		regional:             regional,
		explicitlyConfigured: make(map[TcRef]bool),
		interrupt:            make(chan struct{}),
	}

	if fetchErr := mgr.fetchExplicitAsgs(discoveryOpts.NodeGroupSpecs); fetchErr != nil {
		return nil, fmt.Errorf("failed to fetch ASGs: %v", fetchErr)
	}

	// Periodically rebuild the instances cache until interrupt is closed.
	regenerate := func() {
		if regenErr := mgr.cache.RegenerateInstancesCache(); regenErr != nil {
			klog.Errorf("Error while regenerating Mig cache: %v", regenErr)
		}
	}
	go wait.Until(regenerate, time.Hour, mgr.interrupt)

	return mgr, nil
}
// fetchExplicitAsgs registers the ASGs described by the given node group
// specs and marks each of them as explicitly configured. These ASGs should
// never be unregistered during refreshes, even if they no longer exist in
// Tencentcloud.
//
// When at least one registration reports a change, the auto scaling group
// cache and the instances cache are regenerated, and a template node is built
// for every cached ASG as a scale-from-zero sanity check. A failure of that
// check is only logged.
func (m *tencentcloudManagerImpl) fetchExplicitAsgs(specs []string) error {
	anyChanged := false
	for i := range specs {
		group, err := m.buildAsgFromFlag(specs[i])
		if err != nil {
			return err
		}
		// registerAsg is the left operand so it runs for every spec.
		anyChanged = m.registerAsg(group) || anyChanged
		m.explicitlyConfigured[group.TencentcloudRef()] = true
	}

	if !anyChanged {
		return nil
	}

	if err := m.cache.RegenerateAutoScalingGroupCache(); err != nil {
		return err
	}
	if err := m.cache.RegenerateInstancesCache(); err != nil {
		return err
	}

	for _, group := range m.cache.GetAsgs() {
		// Building the template validates that the group can be scaled up
		// from 0 nodes. It may never be needed, so a failure is not fatal.
		_, err := m.GetAsgTemplateNode(group)
		if err == nil {
			continue
		}
		klog.Errorf("Can't build node from template for %s, won't be able to scale from 0: %v", group.TencentcloudRef().String(), err)
	}
	return nil
}
// GetResourceLimiter returns the resource limiter held by the cache.
func (m *tencentcloudManagerImpl) GetResourceLimiter() (*cloudprovider.ResourceLimiter, error) {
return m.cache.GetResourceLimiter()
}
// registerAsg registers asg in the cache and passes the cache's result
// through. Returns true if the node group didn't exist before or its config
// has changed.
func (m *tencentcloudManagerImpl) registerAsg(asg Asg) bool {
return m.cache.RegisterAsg(asg)
}
// buildAsgFromFlag parses a node group spec string and turns it into an Asg.
// A spec that cannot be parsed yields an error and a nil Asg.
func (m *tencentcloudManagerImpl) buildAsgFromFlag(flag string) (Asg, error) {
	spec, parseErr := dynamic.SpecFromString(flag, scaleToZeroSupported)
	if parseErr == nil {
		return m.buildAsgFromSpec(spec)
	}
	return nil, fmt.Errorf("failed to parse node group spec: %v", parseErr)
}
// buildAsgFromSpec creates an Asg bound to this manager from a parsed node
// group spec: the spec name becomes the ASG ID and its min/max sizes are
// copied over. It never returns an error.
func (m *tencentcloudManagerImpl) buildAsgFromSpec(s *dynamic.NodeGroupSpec) (Asg, error) {
	ref := TcRef{ID: s.Name}
	group := &tcAsg{
		tencentcloudManager: m,
		tencentcloudRef:     ref,
		minSize:             s.MinSize,
		maxSize:             s.MaxSize,
	}
	return group, nil
}
// Refresh invalidates every cached ASG target size and then, unless the last
// forced refresh happened less than refreshInterval ago, performs a forced
// refresh.
func (m *tencentcloudManagerImpl) Refresh() error {
	m.cache.InvalidateAllAsgTargetSizes()

	nextAllowed := m.lastRefresh.Add(refreshInterval)
	if time.Now().Before(nextAllowed) {
		// Too soon since the previous refresh; nothing more to do.
		return nil
	}
	return m.forceRefresh()
}
// forceRefresh records the current time as the last refresh and logs when the
// next one becomes possible. It does not refresh any resource yet and always
// returns nil.
func (m *tencentcloudManagerImpl) forceRefresh() error {
	// TODO refresh
	now := time.Now()
	m.lastRefresh = now
	klog.V(2).Infof("Refreshed Tencentcloud resources, next refresh after %v", now.Add(refreshInterval))
	return nil
}
// GetAsgs returns the list of registered ASGs as reported by the cache.
func (m *tencentcloudManagerImpl) GetAsgs() []Asg {
return m.cache.GetAsgs()
}
// RegisterAsg registers asg in the Tencentcloud manager's cache.
// The cache's return value is discarded.
func (m *tencentcloudManagerImpl) RegisterAsg(asg Asg) {
m.cache.RegisterAsg(asg)
}
// GetAsgForInstance returns the ASG of the given instance, as resolved by the
// cache.
func (m *tencentcloudManagerImpl) GetAsgForInstance(instance TcRef) (Asg, error) {
return m.cache.FindForInstance(instance)
}
// GetAsgSize returns the target size of the ASG.
//
// A cached target size is returned directly. Otherwise the auto scaling group
// is fetched from the cloud service, its desired capacity is stored in the
// cache and returned. On a lookup failure or a missing desired capacity the
// size is -1 and an error is returned.
func (m *tencentcloudManagerImpl) GetAsgSize(asg Asg) (int64, error) {
	if cached, ok := m.cache.GetAsgTargetSize(asg.TencentcloudRef()); ok {
		return cached, nil
	}

	group, err := m.cloudService.GetAutoScalingGroup(asg.TencentcloudRef())
	if err != nil {
		return -1, err
	}

	desired := group.DesiredCapacity
	if desired == nil {
		return -1, fmt.Errorf("%s invalid desired capacity", asg.Id())
	}

	m.cache.SetAsgTargetSize(asg.TencentcloudRef(), *desired)
	return *desired, nil
}
// SetAsgSize resizes the ASG to size through the cloud service and, on
// success, records the new target size in the cache.
//
// A negative size is rejected with an error before any API call is made:
// converting it to uint64 would wrap around to an enormous capacity request.
// Errors from the cloud service are returned unchanged and leave the cache
// untouched.
func (m *tencentcloudManagerImpl) SetAsgSize(asg Asg, size int64) error {
	if size < 0 {
		return fmt.Errorf("invalid negative size %d for asg %s", size, asg.Id())
	}

	klog.V(0).Infof("Setting asg %s size to %d", asg.Id(), size)
	err := m.cloudService.ResizeAsg(asg.TencentcloudRef(), uint64(size))
	if err != nil {
		return err
	}
	m.cache.SetAsgTargetSize(asg.TencentcloudRef(), size)
	return nil
}
// Cleanup stops the background instance-cache regeneration loop by closing
// the manager's interrupt channel.
//
// It is safe to call more than once and on a manager whose interrupt channel
// was never created; in both cases it does nothing. It always returns nil.
func (m *tencentcloudManagerImpl) Cleanup() error {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	if m.interrupt == nil {
		return nil
	}
	select {
	case <-m.interrupt:
		// Already closed by an earlier Cleanup; closing again would panic.
	default:
		close(m.interrupt)
	}
	return nil
}
// DeleteInstances deletes the given instances. All instances must be
// controlled by the same ASG.
//
// An empty list is a no-op. Every instance is resolved to its ASG through the
// cache; a failed lookup, or an instance that belongs to a different ASG than
// the first one, aborts the call with an error before anything is deleted.
// The cached target size of the ASG is invalidated before the delete request
// is issued, and the outcome of that request is returned to the caller so a
// failed deletion is not reported as success.
func (m *tencentcloudManagerImpl) DeleteInstances(instances []TcRef) error {
	if len(instances) == 0 {
		return nil
	}
	commonAsg, err := m.cache.FindForInstance(instances[0])
	if err != nil {
		return err
	}

	toDeleteInstances := make([]string, 0, len(instances))
	for _, instance := range instances {
		asg, err := m.cache.FindForInstance(instance)
		if err != nil {
			return err
		}
		if asg != commonAsg {
			return fmt.Errorf("can not delete instances which don't belong to the same ASG")
		}
		toDeleteInstances = append(toDeleteInstances, instance.ID)
	}

	// The target size is about to change, so the cached value is stale
	// whether or not the delete call succeeds.
	m.cache.InvalidateAsgTargetSize(commonAsg.TencentcloudRef())
	return m.cache.cloudService.DeleteInstances(commonAsg, toDeleteInstances)
}
// GetAsgNodes lists the instances of the given ASG as cloudprovider
// instances.
//
// Entries that are nil, lack a lifecycle state or an instance ID, or are in
// the "Removing" state are skipped. Instances whose reference cannot be
// resolved are logged and skipped, as are instances with an empty zone.
// If the instance listing itself fails, an empty list and the error are
// returned.
func (m *tencentcloudManagerImpl) GetAsgNodes(asg Asg) ([]cloudprovider.Instance, error) {
	nodes := []cloudprovider.Instance{}

	asInstances, err := m.cloudService.GetAutoScalingInstances(asg.TencentcloudRef())
	if err != nil {
		return nodes, err
	}

	for _, inst := range asInstances {
		usable := inst != nil &&
			inst.LifeCycleState != nil &&
			inst.InstanceId != nil &&
			*inst.LifeCycleState != "Removing"
		if !usable {
			continue
		}

		ref, refErr := m.cloudService.GetTencentcloudInstanceRef(inst)
		switch {
		case refErr != nil:
			klog.Warning("failed to get tencentcloud instance ref:", refErr)
		case ref.Zone == "":
			klog.V(4).Infof("Skipping %s, that is scheduling by AS", *inst.InstanceId)
		default:
			nodes = append(nodes, cloudprovider.Instance{Id: ref.ToProviderID()})
		}
	}
	return nodes, nil
}
// InstanceTemplate represents the CVM template from which a template node for
// an ASG is built.
type InstanceTemplate struct {
InstanceType string
Region string
Zone string
Cpu int64
// Mem is multiplied by 1024*1024*1024 when it is turned into the node's
// memory capacity.
Mem int64
Gpu int64
// GPU virtualization resources
VCudaCore int64
VCudaMem int64
// ENI-IP resources for clusters that use VPC-CNI
TKERouteENIIP int64
TKEDirectENI int64
// Label holds the node labels to apply to the template node.
Label map[string]string
// Taints holds the node pool taints to apply to the template node.
Taints []*tke.Taint
}
// NetworkExtendedResources represents the network extended resources
// (route-ENI IPs and direct ENIs) available on an instance type.
type NetworkExtendedResources struct {
TKERouteENIIP int64
TKEDirectENI int64
}
// networkExtendedResourcesMap caches NetworkExtendedResources per instance
// type. It is read and written inside GetAsgInstanceTemplate.
var networkExtendedResourcesMap = make(map[string]*NetworkExtendedResources)
// GetAsgInstanceTemplate returns the instance template for the ASG with the
// given ref.
//
// The whole call runs under the manager mutex. A template already present in
// the cache is returned as is. Otherwise the template is assembled from the
// instance type information, the node pool labels and taints, the zone of the
// ASG's first subnet, and the VPC-CNI pod limits of the instance type, then
// stored in the cache.
//
// An error is returned when any cloud service call fails, when the ASG has no
// usable first subnet, when the pod limits are incomplete, or when the zone
// information carries no zone ID.
func (m *tencentcloudManagerImpl) GetAsgInstanceTemplate(asgRef TcRef) (*InstanceTemplate, error) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	instanceTemplate, found := m.cache.GetAsgInstanceTemplate(asgRef)
	if found {
		return instanceTemplate, nil
	}

	// getNetworkExtendedResources resolves the ENI limits of an instance type,
	// consulting the package-level cache before asking the cloud service.
	getNetworkExtendedResources := func(instanceType string) (*NetworkExtendedResources, error) {
		if resources, exist := networkExtendedResourcesMap[instanceType]; exist {
			return resources, nil
		}

		pli, err := m.cloudService.DescribeVpcCniPodLimits(instanceType)
		if err != nil {
			return nil, err
		}
		resources := &NetworkExtendedResources{}
		// A nil answer leaves both limits at zero; a non-nil answer must be
		// complete.
		if pli != nil {
			if pli.PodLimits == nil ||
				pli.PodLimits.TKERouteENINonStaticIP == nil ||
				pli.PodLimits.TKEDirectENI == nil {
				return nil, fmt.Errorf("get wrong eni limits(nil)")
			}
			resources.TKEDirectENI = *pli.PodLimits.TKEDirectENI
			resources.TKERouteENIIP = *pli.PodLimits.TKERouteENINonStaticIP
		}
		klog.Infof("%v", resources)
		networkExtendedResourcesMap[instanceType] = resources
		return resources, nil
	}

	instanceInfo, err := m.cloudService.GetInstanceInfoByType(m.cache.GetInstanceType(asgRef))
	if err != nil {
		return nil, err
	}

	npInfo, err := m.cloudService.GetNodePoolInfo(cloudConfig.ClusterID, asgRef.ID)
	if err != nil {
		return nil, err
	}

	labels := make(map[string]string)
	for _, label := range npInfo.Labels {
		if label.Name != nil && label.Value != nil {
			labels[*label.Name] = *label.Value
		}
	}
	labels[LabelAutoScalingGroupID] = asgRef.ID

	asg, err := m.cloudService.GetAutoScalingGroup(asgRef)
	if err != nil {
		return nil, err
	}
	// The zone is derived from the first subnet of the ASG.
	if len(asg.SubnetIdSet) < 1 || asg.SubnetIdSet[0] == nil {
		return nil, fmt.Errorf("Failed to get asg zone")
	}
	zone, err := m.cloudService.GetZoneBySubnetID(*asg.SubnetIdSet[0])
	if err != nil {
		return nil, err
	}
	zoneInfo, err := m.cloudService.GetZoneInfo(zone)
	if err != nil {
		return nil, err
	}
	// ZoneId is a pointer in the API response; guard it like the other
	// response pointers instead of panicking on a nil dereference.
	if zoneInfo.ZoneId == nil {
		return nil, fmt.Errorf("zone info of %v has no zone id", zone)
	}

	// eni
	networkExtendedResources, err := getNetworkExtendedResources(instanceInfo.InstanceType)
	if err != nil {
		return nil, err
	}

	// Families missing from GPUMemoryMap fall back to a coefficient of 24.
	gpuMult, ok := GPUMemoryMap[instanceInfo.InstanceFamily]
	if !ok {
		gpuMult = 24
	}
	vcudaMem := instanceInfo.GPU * gpuMult * 4

	instanceTemplate = &InstanceTemplate{
		InstanceType:  instanceInfo.InstanceType,
		Region:        cloudConfig.Region,
		Zone:          *zoneInfo.ZoneId,
		Cpu:           instanceInfo.CPU,
		Mem:           instanceInfo.Memory,
		Gpu:           instanceInfo.GPU,
		VCudaCore:     instanceInfo.GPU * 100,
		VCudaMem:      vcudaMem,
		TKEDirectENI:  networkExtendedResources.TKEDirectENI,
		TKERouteENIIP: networkExtendedResources.TKERouteENIIP,
		Label:         labels,
		Taints:        npInfo.Taints,
	}

	m.cache.SetAsgInstanceTemplate(asgRef, instanceTemplate)
	return instanceTemplate, nil
}
// GetAsgTemplateNode builds a template node for the given ASG from its
// instance template.
//
// The node gets a randomised name derived from the ASG ID, a capacity made of
// 110 pods plus the template's CPU and memory, optional ENI and GPU/vcuda
// resources when the template values are positive, the template labels
// overlaid with the generic labels, the template taints, and ready
// conditions. Allocatable is set to the very same resource list as capacity.
// An error from the template lookup is returned unchanged.
func (m *tencentcloudManagerImpl) GetAsgTemplateNode(asg Asg) (*apiv1.Node, error) {
	tmpl, err := m.GetAsgInstanceTemplate(asg.TencentcloudRef())
	if err != nil {
		return nil, err
	}

	name := fmt.Sprintf("%s-%d", asg.TencentcloudRef().ID, rand.Int63())

	capacity := apiv1.ResourceList{
		apiv1.ResourcePods:   *resource.NewQuantity(110, resource.DecimalSI),
		apiv1.ResourceCPU:    *resource.NewQuantity(tmpl.Cpu, resource.DecimalSI),
		apiv1.ResourceMemory: *resource.NewQuantity(tmpl.Mem*1024*1024*1024, resource.DecimalSI),
	}
	if tmpl.TKERouteENIIP > 0 {
		capacity[TKERouteENIIP] = *resource.NewQuantity(tmpl.TKERouteENIIP, resource.DecimalSI)
	}
	if tmpl.TKEDirectENI > 0 {
		capacity[TKEDirectENI] = *resource.NewQuantity(tmpl.TKEDirectENI, resource.DecimalSI)
	}
	if tmpl.Gpu > 0 {
		capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(tmpl.Gpu, resource.DecimalSI)
		capacity[VCudaCore] = *resource.NewQuantity(tmpl.VCudaCore, resource.DecimalSI)
		capacity[VCudaMemory] = *resource.NewQuantity(tmpl.VCudaMem, resource.DecimalSI)
		klog.Infof("Capacity resource set gpu %s(%d)", gpu.ResourceNvidiaGPU, tmpl.Gpu)
	}

	// Template labels first, generic labels joined afterwards.
	labels := cloudprovider.JoinStringMaps(map[string]string{}, tmpl.Label)
	labels = cloudprovider.JoinStringMaps(labels, buildGenericLabels(tmpl, name))

	taints := extractTaintsFromAsg(tmpl.Taints)
	conditions := cloudprovider.BuildReadyConditions()

	node := &apiv1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:     name,
			SelfLink: fmt.Sprintf("/api/v1/nodes/%s", name),
			Labels:   labels,
		},
		Spec: apiv1.NodeSpec{
			Taints: taints,
		},
		Status: apiv1.NodeStatus{
			Capacity: capacity,
			// TODO: use proper allocatable!!
			Allocatable: capacity,
			Conditions:  conditions,
		},
	}
	return node, nil
}
// buildGenericLabels returns the generic node labels for a template node:
// the default architecture and OS, the template's instance type, region and
// zone, and the node name as hostname.
func buildGenericLabels(template *InstanceTemplate, nodeName string) map[string]string {
	// TODO: extract it somehow
	return map[string]string{
		apiv1.LabelArchStable:        cloudprovider.DefaultArch,
		apiv1.LabelOSStable:          cloudprovider.DefaultOS,
		apiv1.LabelInstanceType:      template.InstanceType,
		apiv1.LabelZoneRegion:        template.Region,
		apiv1.LabelZoneFailureDomain: template.Zone,
		apiv1.LabelHostname:          nodeName,
	}
}
// extractTaintsFromAsg converts node pool taints into core/v1 taints.
// Entries that are nil or lack a key, value or effect are dropped. The result
// is never nil, even when no taint qualifies.
func extractTaintsFromAsg(npTaints []*tke.Taint) []apiv1.Taint {
	converted := []apiv1.Taint{}
	for i := range npTaints {
		src := npTaints[i]
		if src == nil || src.Key == nil || src.Value == nil || src.Effect == nil {
			continue
		}
		converted = append(converted, apiv1.Taint{
			Key:    *src.Key,
			Value:  *src.Value,
			Effect: apiv1.TaintEffect(*src.Effect),
		})
	}
	return converted
}