Compare commits


28 Commits

Author SHA1 Message Date
Kubernetes Prow Robot c7a51426e2
Merge pull request #8395 from laoj2/add-label-to-vpa-metric
Add update_mode label to VPA updater metrics
2025-08-02 02:57:37 -07:00
Kubernetes Prow Robot c1352dad7c
Merge pull request #8297 from pat-s/feat/imagesForArch-nodepool
Hetzner(feat): add option to set nodepool-specific image IDs
2025-07-31 23:47:15 -07:00
Luiz Oliveira 36804f199c Add update_mode label to VPA updater metrics 2025-07-31 15:16:45 -04:00
Kubernetes Prow Robot 172a22c195
Merge pull request #8376 from jackfrancis/ca-maintainer-elmiko
CA: add elmiko as maintainer
2025-07-31 11:01:15 -07:00
Kubernetes Prow Robot 4bd2e67a1d
Merge pull request #8331 from adrianmoisey/list
Switch VPA checkpoint to use a lister
2025-07-31 02:33:15 -07:00
Kubernetes Prow Robot 72bf359268
Merge pull request #8388 from jackfrancis/ca-validate-make-release
CA: add release automation validation
2025-07-30 18:18:27 -07:00
Jack Francis 26d6b38699
CA: add release automation validation 2025-07-30 17:54:44 -07:00
Kubernetes Prow Robot ff6e93bfc3
Merge pull request #8346 from jklaw90/readiness-prob
fix: VPA add readiness and liveness for vpa admission
2025-07-30 08:44:28 -07:00
Julian Lawrence 1938d3971a updated all deployments with health check 2025-07-29 09:20:09 -07:00
Rada Dimitrova 3d748040d9
Improve error message for unknown error in `validateTargetRef` (#8299)
* Improve error message for unknown error in

* Format Unknown error better

* Update vertical-pod-autoscaler/pkg/target/controller_fetcher/controller_fetcher.go

Co-authored-by: Stoyan Vitanov <stoyan.a.vitanov@gmail.com>

* Address PR review feedback

* Remove unnecessary Stringer method

* Improve third case err message

* Adapt third case err message test

---------

Co-authored-by: Stoyan Vitanov <stoyan.a.vitanov@gmail.com>
2025-07-29 01:38:30 -07:00
Jack Francis 2eb5adda2c
just approver 2025-07-28 09:31:05 -07:00
Jack Francis 0d36de3aa2
CA: add elmiko as maintainer 2025-07-28 07:55:24 -07:00
Kubernetes Prow Robot 1d5f0471bc
Merge pull request #8315 from vbhargav875/oci-oke-handle-ooc
Handle Out of host capacity scenario in OCI nodepools
2025-07-24 23:34:27 -07:00
Vijay Bhargav Eshappa 1fbc7a9d48 Handle Out of host capacity scenario in OCI nodepools 2025-07-24 16:04:49 +05:30
Julian Lawrence e8941e59a0 add readiness and liveness for vpa admission 2025-07-23 14:33:14 -07:00
Kubernetes Prow Robot 9a256e5c83
Merge pull request #8138 from aleskandro/patch-1
Fix typo in expander/grpcplugin/README.md
2025-07-21 12:34:27 -07:00
Maximiliano Uribe f9b93ec395
adding env variable EnableLabelPrediction (#8324)
* adding env variable EnableLabelPrediction

* addressing comments

* adding ut test and nil scenario

* adding ephemeral storage ut

* changing default value to true
2025-07-21 10:08:27 -07:00
Kubernetes Prow Robot 0d14eca879
Merge pull request #7993 from pierreozoux/pierreozoux-patch-2
docs(autoscaler): add details about flags
2025-07-21 07:50:28 -07:00
Adrian Moisey aae2a010f1
Switch VPA checkpoint to use a lister 2025-07-18 15:41:46 +02:00
Omer Aplatony c187e7f147
AEP-8026: per-vpa-component-configuration (#8026)
* AEP: per-vpa-component-configuration

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* Set proper AEP number

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* update AEP

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* remove duplicated title

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* fixed camel-case

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* Add e2e testing

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* add parameter descriptions

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* Add feature flag

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* Updated on/off feature flag

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* evictAfterOomThreshold->evictAfterOOMThreshold

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* Add more info

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* oomBumpUpRatio to int

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* fixed typo

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* Move oomBumpUpRatio to string

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* Add memoryAggregationIntervalCount

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

* Used quantity instead of string

Signed-off-by: Omer Aplatony <omerap12@gmail.com>

---------

Signed-off-by: Omer Aplatony <omerap12@gmail.com>
2025-07-17 23:34:25 -07:00
Kubernetes Prow Robot 9bc422016f
Merge pull request #8027 from HadrienPatte/contributing
Update go wiki URL in contributing docs
2025-07-13 13:14:22 -07:00
pat-s 008a3b916e
typo 2025-07-05 15:14:24 +02:00
pat-s 77cb4c8bf8
add files 2025-07-05 15:03:29 +02:00
Alessandro Di Stefano 96b13193e3
Fix typo in expander/grpcplugin/README.md
Signed-off-by: aleskandro <aleskandro@redhat.com>
2025-05-27 15:05:30 +01:00
Pierre Ozoux e51dcfb60b
Update cluster-autoscaler/cloudprovider/clusterapi/README.md 2025-04-17 09:18:28 +02:00
Pierre Ozoux 6eebb82f0d
Update cluster-autoscaler/cloudprovider/clusterapi/README.md 2025-04-17 09:17:44 +02:00
Hadrien Patte 9cc45e2a24
Update go wiki URL in contributing docs 2025-04-14 11:54:51 +02:00
Pierre Ozoux 8a954bc021
docs(autoscaler): add details about flags
It is currently slightly confusing if you skim through the documentation.

For instance, see the discussion here:
https://github.com/kubernetes/autoscaler/pull/7974

I hope that by adding these 2 Important sections the reader will be warned about the key difference between, and the need for, these 2 options.
2025-03-28 15:47:14 +01:00
32 changed files with 1173 additions and 147 deletions

View File

@ -23,7 +23,7 @@ We'd love to accept your patches! Before we can take them, we have to jump a cou
All changes must be code reviewed. Coding conventions and standards are explained in the official
[developer docs](https://github.com/kubernetes/community/tree/master/contributors/devel). Expect
reviewers to request that you avoid common [go style mistakes](https://github.com/golang/go/wiki/CodeReviewComments)
reviewers to request that you avoid common [go style mistakes](https://go.dev/wiki/CodeReviewComments)
in your PRs.
### Merge Approval

View File

@ -107,7 +107,20 @@ build-in-docker-arch-%: clean-arch-% docker-builder
docker run ${RM_FLAG} -v `pwd`:/gopath/src/k8s.io/autoscaler/cluster-autoscaler/:Z autoscaling-builder:latest \
bash -c 'cd /gopath/src/k8s.io/autoscaler/cluster-autoscaler && BUILD_TAGS=${BUILD_TAGS} LDFLAGS="${LDFLAGS}" make build-arch-$*'
release: $(addprefix build-in-docker-arch-,$(ALL_ARCH)) execute-release
release-extract-version = $(shell cat version/version.go | grep "Version =" | cut -d '"' -f 2)
release-validate:
@if [ -z $(shell git tag --points-at HEAD | grep -e ^cluster-autoscaler-1.[1-9][0-9]*.[0-9][0-9]*$) ]; then \
echo "Can't release from this commit, there is no compatible git tag"; \
exit 1; \
fi
@if [ -z $(shell git tag --points-at HEAD | grep -e $(call release-extract-version)) ]; then \
echo "Can't release from this commit, git tag does not match version/version.go"; \
exit 1; \
fi
release: TAG=v$(call release-extract-version)
release: release-validate $(addprefix build-in-docker-arch-,$(ALL_ARCH)) execute-release
@echo "Full in-docker release ${TAG}${FOR_PROVIDER} completed"
container: container-arch-$(GOARCH)

View File

@ -10,5 +10,6 @@ reviewers:
- feiskyer
- vadasambar
- x13n
- elmiko
labels:
- area/cluster-autoscaler

View File

@ -103,6 +103,9 @@ type Config struct {
// EnableFastDeleteOnFailedProvisioning defines whether to delete the experimental faster VMSS instance deletion on failed provisioning
EnableFastDeleteOnFailedProvisioning bool `json:"enableFastDeleteOnFailedProvisioning,omitempty" yaml:"enableFastDeleteOnFailedProvisioning,omitempty"`
// EnableLabelPredictionsOnTemplate defines whether to enable label predictions on the template when scaling from zero
EnableLabelPredictionsOnTemplate bool `json:"enableLabelPredictionsOnTemplate,omitempty" yaml:"enableLabelPredictionsOnTemplate,omitempty"`
}
// These are only here for backward compabitility. Their equivalent exists in providerazure.Config with a different name.
@ -133,6 +136,7 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
cfg.VMType = providerazureconsts.VMTypeVMSS
cfg.MaxDeploymentsCount = int64(defaultMaxDeploymentsCount)
cfg.StrictCacheUpdates = false
cfg.EnableLabelPredictionsOnTemplate = true
// Config file overrides defaults
if configReader != nil {
@ -308,6 +312,9 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
if _, err = assignBoolFromEnvIfExists(&cfg.EnableFastDeleteOnFailedProvisioning, "AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING"); err != nil {
return nil, err
}
if _, err = assignBoolFromEnvIfExists(&cfg.EnableLabelPredictionsOnTemplate, "AZURE_ENABLE_LABEL_PREDICTIONS_ON_TEMPLATE"); err != nil {
return nil, err
}
// Nonstatic defaults
cfg.VMType = strings.ToLower(cfg.VMType)

View File

@ -89,6 +89,8 @@ type ScaleSet struct {
dedicatedHost bool
enableFastDeleteOnFailedProvisioning bool
enableLabelPredictionsOnTemplate bool
}
// NewScaleSet creates a new NewScaleSet.
@ -108,10 +110,11 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64, d
instancesRefreshJitter: az.config.VmssVmsCacheJitter,
},
enableForceDelete: az.config.EnableForceDelete,
enableDynamicInstanceList: az.config.EnableDynamicInstanceList,
enableDetailedCSEMessage: az.config.EnableDetailedCSEMessage,
dedicatedHost: dedicatedHost,
enableForceDelete: az.config.EnableForceDelete,
enableDynamicInstanceList: az.config.EnableDynamicInstanceList,
enableDetailedCSEMessage: az.config.EnableDetailedCSEMessage,
enableLabelPredictionsOnTemplate: az.config.EnableLabelPredictionsOnTemplate,
dedicatedHost: dedicatedHost,
}
if az.config.VmssVirtualMachinesCacheTTLInSeconds != 0 {
@ -662,7 +665,7 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*framework.NodeInfo, error) {
if err != nil {
return nil, err
}
node, err := buildNodeFromTemplate(scaleSet.Name, template, scaleSet.manager, scaleSet.enableDynamicInstanceList)
node, err := buildNodeFromTemplate(scaleSet.Name, template, scaleSet.manager, scaleSet.enableDynamicInstanceList, scaleSet.enableLabelPredictionsOnTemplate)
if err != nil {
return nil, err
}

View File

@ -211,7 +211,7 @@ func buildNodeTemplateFromVMPool(vmsPool armcontainerservice.AgentPool, location
}, nil
}
func buildNodeFromTemplate(nodeGroupName string, template NodeTemplate, manager *AzureManager, enableDynamicInstanceList bool) (*apiv1.Node, error) {
func buildNodeFromTemplate(nodeGroupName string, template NodeTemplate, manager *AzureManager, enableDynamicInstanceList bool, enableLabelPrediction bool) (*apiv1.Node, error) {
node := apiv1.Node{}
nodeName := fmt.Sprintf("%s-asg-%d", nodeGroupName, rand.Int63())
@ -272,7 +272,7 @@ func buildNodeFromTemplate(nodeGroupName string, template NodeTemplate, manager
node.Status.Allocatable = node.Status.Capacity
if template.VMSSNodeTemplate != nil {
node = processVMSSTemplate(template, nodeName, node)
node = processVMSSTemplate(template, nodeName, node, enableLabelPrediction)
} else if template.VMPoolNodeTemplate != nil {
node = processVMPoolTemplate(template, nodeName, node)
} else {
@ -298,7 +298,7 @@ func processVMPoolTemplate(template NodeTemplate, nodeName string, node apiv1.No
return node
}
func processVMSSTemplate(template NodeTemplate, nodeName string, node apiv1.Node) apiv1.Node {
func processVMSSTemplate(template NodeTemplate, nodeName string, node apiv1.Node, enableLabelPrediction bool) apiv1.Node {
// NodeLabels
if template.VMSSNodeTemplate.Tags != nil {
for k, v := range template.VMSSNodeTemplate.Tags {
@ -324,45 +324,50 @@ func processVMSSTemplate(template NodeTemplate, nodeName string, node apiv1.Node
labels = extractLabelsFromTags(template.VMSSNodeTemplate.Tags)
}
// Add the agentpool label, its value should come from the VMSS poolName tag
// NOTE: The plan is for agentpool label to be deprecated in favor of the aks-prefixed one
// We will have to live with both labels for a while
if node.Labels[legacyPoolNameTag] != "" {
labels[legacyAgentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag]
labels[agentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag]
}
if node.Labels[poolNameTag] != "" {
labels[legacyAgentPoolNodeLabelKey] = node.Labels[poolNameTag]
labels[agentPoolNodeLabelKey] = node.Labels[poolNameTag]
}
// This is the best-effort to match AKS system labels,
// this prediction needs to be constantly worked on and maintained to keep up with the changes in AKS
if enableLabelPrediction {
// Add the agentpool label, its value should come from the VMSS poolName tag
// NOTE: The plan is for agentpool label to be deprecated in favor of the aks-prefixed one
// We will have to live with both labels for a while
if node.Labels[legacyPoolNameTag] != "" {
labels[legacyAgentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag]
labels[agentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag]
}
if node.Labels[poolNameTag] != "" {
labels[legacyAgentPoolNodeLabelKey] = node.Labels[poolNameTag]
labels[agentPoolNodeLabelKey] = node.Labels[poolNameTag]
}
// Add the storage profile and storage tier labels for vmss node
if template.VMSSNodeTemplate.OSDisk != nil {
// ephemeral
if template.VMSSNodeTemplate.OSDisk.DiffDiskSettings != nil && template.VMSSNodeTemplate.OSDisk.DiffDiskSettings.Option == compute.Local {
labels[legacyStorageProfileNodeLabelKey] = "ephemeral"
labels[storageProfileNodeLabelKey] = "ephemeral"
} else {
labels[legacyStorageProfileNodeLabelKey] = "managed"
labels[storageProfileNodeLabelKey] = "managed"
// Add the storage profile and storage tier labels for vmss node
if template.VMSSNodeTemplate.OSDisk != nil {
// ephemeral
if template.VMSSNodeTemplate.OSDisk.DiffDiskSettings != nil && template.VMSSNodeTemplate.OSDisk.DiffDiskSettings.Option == compute.Local {
labels[legacyStorageProfileNodeLabelKey] = "ephemeral"
labels[storageProfileNodeLabelKey] = "ephemeral"
} else {
labels[legacyStorageProfileNodeLabelKey] = "managed"
labels[storageProfileNodeLabelKey] = "managed"
}
if template.VMSSNodeTemplate.OSDisk.ManagedDisk != nil {
labels[legacyStorageTierNodeLabelKey] = string(template.VMSSNodeTemplate.OSDisk.ManagedDisk.StorageAccountType)
labels[storageTierNodeLabelKey] = string(template.VMSSNodeTemplate.OSDisk.ManagedDisk.StorageAccountType)
}
}
if template.VMSSNodeTemplate.OSDisk.ManagedDisk != nil {
labels[legacyStorageTierNodeLabelKey] = string(template.VMSSNodeTemplate.OSDisk.ManagedDisk.StorageAccountType)
labels[storageTierNodeLabelKey] = string(template.VMSSNodeTemplate.OSDisk.ManagedDisk.StorageAccountType)
}
// Add ephemeral-storage value
if template.VMSSNodeTemplate.OSDisk.DiskSizeGB != nil {
node.Status.Capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(int(*template.VMSSNodeTemplate.OSDisk.DiskSizeGB)*1024*1024*1024), resource.DecimalSI)
klog.V(4).Infof("OS Disk Size from template is: %d", *template.VMSSNodeTemplate.OSDisk.DiskSizeGB)
klog.V(4).Infof("Setting ephemeral storage to: %v", node.Status.Capacity[apiv1.ResourceEphemeralStorage])
// If we are on GPU-enabled SKUs, append the accelerator
// label so that CA makes better decision when scaling from zero for GPU pools
if isNvidiaEnabledSKU(template.SkuName) {
labels[GPULabel] = "nvidia"
labels[legacyGPULabel] = "nvidia"
}
}
// If we are on GPU-enabled SKUs, append the accelerator
// label so that CA makes better decision when scaling from zero for GPU pools
if isNvidiaEnabledSKU(template.SkuName) {
labels[GPULabel] = "nvidia"
labels[legacyGPULabel] = "nvidia"
// Add ephemeral-storage value
if template.VMSSNodeTemplate.OSDisk != nil && template.VMSSNodeTemplate.OSDisk.DiskSizeGB != nil {
node.Status.Capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(int(*template.VMSSNodeTemplate.OSDisk.DiskSizeGB)*1024*1024*1024), resource.DecimalSI)
klog.V(4).Infof("OS Disk Size from template is: %d", *template.VMSSNodeTemplate.OSDisk.DiskSizeGB)
klog.V(4).Infof("Setting ephemeral storage to: %v", node.Status.Capacity[apiv1.ResourceEphemeralStorage])
}
// Extract allocatables from tags

View File

@ -291,3 +291,91 @@ func makeTaintSet(taints []apiv1.Taint) map[apiv1.Taint]bool {
}
return set
}
func TestBuildNodeFromTemplateWithLabelPrediction(t *testing.T) {
poolName := "testpool"
testSkuName := "Standard_DS2_v2"
testNodeName := "test-node"
vmss := compute.VirtualMachineScaleSet{
Response: autorest.Response{},
Sku: &compute.Sku{Name: &testSkuName},
Plan: nil,
VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{
VirtualMachineProfile: &compute.VirtualMachineScaleSetVMProfile{
StorageProfile: &compute.VirtualMachineScaleSetStorageProfile{
OsDisk: &compute.VirtualMachineScaleSetOSDisk{
DiffDiskSettings: nil, // This makes it managed
ManagedDisk: &compute.VirtualMachineScaleSetManagedDiskParameters{
StorageAccountType: compute.StorageAccountTypesPremiumLRS,
},
},
},
},
},
Tags: map[string]*string{
"poolName": &poolName,
},
Zones: &[]string{"1", "2"},
Location: to.StringPtr("westus"),
}
template, err := buildNodeTemplateFromVMSS(vmss, map[string]string{}, "")
assert.NoError(t, err)
manager := &AzureManager{}
node, err := buildNodeFromTemplate(testNodeName, template, manager, false, true)
assert.NoError(t, err)
assert.NotNil(t, node)
// Verify label prediction labels are added
assert.Equal(t, poolName, node.Labels["agentpool"])
assert.Equal(t, poolName, node.Labels["kubernetes.azure.com/agentpool"])
assert.Equal(t, "managed", node.Labels["storageprofile"])
assert.Equal(t, "managed", node.Labels["kubernetes.azure.com/storageprofile"])
}
func TestBuildNodeFromTemplateWithEphemeralStorage(t *testing.T) {
poolName := "testpool"
testSkuName := "Standard_DS2_v2"
testNodeName := "test-node"
diskSizeGB := int32(128)
vmss := compute.VirtualMachineScaleSet{
Response: autorest.Response{},
Sku: &compute.Sku{Name: &testSkuName},
Plan: nil,
VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{
VirtualMachineProfile: &compute.VirtualMachineScaleSetVMProfile{
StorageProfile: &compute.VirtualMachineScaleSetStorageProfile{
OsDisk: &compute.VirtualMachineScaleSetOSDisk{
DiskSizeGB: &diskSizeGB,
DiffDiskSettings: nil, // This makes it managed
ManagedDisk: &compute.VirtualMachineScaleSetManagedDiskParameters{
StorageAccountType: compute.StorageAccountTypesPremiumLRS,
},
},
},
},
},
Tags: map[string]*string{
"poolName": &poolName,
},
Zones: &[]string{"1", "2"},
Location: to.StringPtr("westus"),
}
template, err := buildNodeTemplateFromVMSS(vmss, map[string]string{}, "")
assert.NoError(t, err)
manager := &AzureManager{}
node, err := buildNodeFromTemplate(testNodeName, template, manager, false, false)
assert.NoError(t, err)
assert.NotNil(t, node)
// Verify ephemeral storage is set correctly
expectedEphemeralStorage := resource.NewQuantity(int64(diskSizeGB)*1024*1024*1024, resource.DecimalSI)
ephemeralStorage, exists := node.Status.Capacity[apiv1.ResourceEphemeralStorage]
assert.True(t, exists)
assert.Equal(t, expectedEphemeralStorage.String(), ephemeralStorage.String())
}

View File

@ -469,7 +469,7 @@ func (vmPool *VMPool) TemplateNodeInfo() (*framework.NodeInfo, error) {
if err != nil {
return nil, err
}
node, err := buildNodeFromTemplate(vmPool.agentPoolName, template, vmPool.manager, vmPool.manager.config.EnableDynamicInstanceList)
node, err := buildNodeFromTemplate(vmPool.agentPoolName, template, vmPool.manager, vmPool.manager.config.EnableDynamicInstanceList, false)
if err != nil {
return nil, err
}

View File

@ -79,6 +79,12 @@ in the staging namespace, belonging to the purple cluster, with the label owner=
## Connecting cluster-autoscaler to Cluster API management and workload Clusters
> [!IMPORTANT]
> `--cloud-config` is the flag for specifying a mounted volume path to the kubernetes configuration (i.e. KUBECONFIG) used by the cluster-autoscaler to communicate with the cluster-api management cluster for the purpose of scaling machines.
> [!IMPORTANT]
> `--kubeconfig` is the flag for specifying a mounted volume path to the kubernetes configuration (i.e. KUBECONFIG) used by the cluster-autoscaler to communicate with the cluster-api workload cluster for the purpose of watching Nodes and Pods. This flag can be affected by the desired topology for deploying the cluster-autoscaler; please see the diagrams below for more information.
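For illustration only, the two flags might be passed to the cluster-autoscaler roughly as follows; the mount paths are hypothetical and depend on how the kubeconfig Secrets are mounted into the pod:

```bash
# sketch: paths below are placeholders for wherever the kubeconfigs are mounted
./cluster-autoscaler \
  --cloud-provider=clusterapi \
  --cloud-config=/mnt/management/kubeconfig \
  --kubeconfig=/mnt/workload/kubeconfig
```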
You will also need to provide the path to the kubeconfig(s) for the management
and workload cluster you wish cluster-autoscaler to run against. To specify the
kubeconfig path for the workload cluster to monitor, use the `--kubeconfig`

View File

@ -10,16 +10,16 @@ The cluster autoscaler for Hetzner Cloud scales worker nodes.
`HCLOUD_IMAGE` Defaults to `ubuntu-20.04`, @see https://docs.hetzner.cloud/#images. You can also use an image ID here (e.g. `15512617`), or a label selector associated with a custom snapshot (e.g. `customized_ubuntu=true`). The most recent snapshot will be used in the latter case.
`HCLOUD_CLUSTER_CONFIG` This is the new format replacing
* `HCLOUD_CLOUD_INIT`
* `HCLOUD_IMAGE`
`HCLOUD_CLUSTER_CONFIG` This is the new format replacing
* `HCLOUD_CLOUD_INIT`
* `HCLOUD_IMAGE`
Base64 encoded JSON according to the following structure
```json
{
"imagesForArch": { // These should be the same format as HCLOUD_IMAGE
"arm64": "",
"arm64": "",
"amd64": ""
},
"nodeConfigs": {
@ -28,7 +28,7 @@ The cluster autoscaler for Hetzner Cloud scales worker nodes.
"labels": {
"node.kubernetes.io/role": "autoscaler-node"
},
"taints":
"taints":
[
{
"key": "node.kubernetes.io/role",
@ -47,6 +47,13 @@ Can be useful when you have many different node pools and run into issues of the
**NOTE**: In contrast to `HCLOUD_CLUSTER_CONFIG`, this file is not base64 encoded.
The global `imagesForArch` configuration can be overridden on a per-nodepool basis by adding an `imagesForArch` field to individual nodepool configurations.
The image selection logic works as follows:
1. If a nodepool has its own `imagesForArch` configuration, it will be used for that specific nodepool
1. If a nodepool doesn't have `imagesForArch` configured, the global `imagesForArch` configuration will be used as a fallback
1. If neither is configured, the legacy `HCLOUD_IMAGE` environment variable will be used
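For illustration, a minimal sketch of an `HCLOUD_CLUSTER_CONFIG` (shown before base64 encoding) in which one nodepool overrides the global images; the image names and pool names are placeholders:

```json
{
  "imagesForArch": {
    "arm64": "global-arm64-snapshot",
    "amd64": "global-amd64-snapshot"
  },
  "nodeConfigs": {
    "pool1": {
      "imagesForArch": {
        "arm64": "pool1-arm64-snapshot",
        "amd64": "pool1-amd64-snapshot"
      }
    },
    "pool2": {}
  }
}
```

With this configuration, `pool1` uses its own images, `pool2` falls back to the global `imagesForArch`, and if neither were set the legacy `HCLOUD_IMAGE` environment variable would be used.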
`HCLOUD_NETWORK` Default empty , The id or name of the network that is used in the cluster , @see https://docs.hetzner.cloud/#networks
@ -105,5 +112,5 @@ git add hcloud-go/
## Debugging
To enable debug logging, set the log level of the autoscaler to at least level 5 via cli flag: `--v=5`
The logs will include all requests and responses made towards the Hetzner API including headers and body.
To enable debug logging, set the log level of the autoscaler to at least level 5 via cli flag: `--v=5`
The logs will include all requests and responses made towards the Hetzner API including headers and body.

View File

@ -77,6 +77,7 @@ type NodeConfig struct {
PlacementGroup string
Taints []apiv1.Taint
Labels map[string]string
ImagesForArch *ImageList
}
// LegacyConfig holds the configuration in the legacy format

View File

@ -528,12 +528,20 @@ func findImage(n *hetznerNodeGroup, serverType *hcloud.ServerType) (*hcloud.Imag
// Select correct image based on server type architecture
imageName := n.manager.clusterConfig.LegacyConfig.ImageName
if n.manager.clusterConfig.IsUsingNewFormat {
// Check for nodepool-specific images first, then fall back to global images
var imagesForArch *ImageList
if nodeConfig, exists := n.manager.clusterConfig.NodeConfigs[n.id]; exists && nodeConfig.ImagesForArch != nil {
imagesForArch = nodeConfig.ImagesForArch
} else {
imagesForArch = &n.manager.clusterConfig.ImagesForArch
}
if serverType.Architecture == hcloud.ArchitectureARM {
imageName = n.manager.clusterConfig.ImagesForArch.Arm64
imageName = imagesForArch.Arm64
}
if serverType.Architecture == hcloud.ArchitectureX86 {
imageName = n.manager.clusterConfig.ImagesForArch.Amd64
imageName = imagesForArch.Amd64
}
}

View File

@ -0,0 +1,174 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package hetzner
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestFindImageWithPerNodepoolConfig(t *testing.T) {
// Test case 1: Nodepool with specific imagesForArch should use those images
t.Run("nodepool with specific imagesForArch", func(t *testing.T) {
manager := &hetznerManager{
clusterConfig: &ClusterConfig{
IsUsingNewFormat: true,
ImagesForArch: ImageList{
Arm64: "global-arm64-image",
Amd64: "global-amd64-image",
},
NodeConfigs: map[string]*NodeConfig{
"pool1": {
ImagesForArch: &ImageList{
Arm64: "pool1-arm64-image",
Amd64: "pool1-amd64-image",
},
},
},
},
}
nodeGroup := &hetznerNodeGroup{
id: "pool1",
manager: manager,
}
// This would normally call the actual API, but we're just testing the logic
// The actual image selection logic is in findImage function
// For this test, we'll verify the configuration is set up correctly
nodeConfig, exists := manager.clusterConfig.NodeConfigs[nodeGroup.id]
require.True(t, exists)
require.NotNil(t, nodeConfig.ImagesForArch)
assert.Equal(t, "pool1-arm64-image", nodeConfig.ImagesForArch.Arm64)
assert.Equal(t, "pool1-amd64-image", nodeConfig.ImagesForArch.Amd64)
})
// Test case 2: Nodepool without specific imagesForArch should fall back to global
t.Run("nodepool without specific imagesForArch", func(t *testing.T) {
manager := &hetznerManager{
clusterConfig: &ClusterConfig{
IsUsingNewFormat: true,
ImagesForArch: ImageList{
Arm64: "global-arm64-image",
Amd64: "global-amd64-image",
},
NodeConfigs: map[string]*NodeConfig{
"pool2": {
// No ImagesForArch specified
},
},
},
}
nodeGroup := &hetznerNodeGroup{
id: "pool2",
manager: manager,
}
nodeConfig, exists := manager.clusterConfig.NodeConfigs[nodeGroup.id]
require.True(t, exists)
assert.Nil(t, nodeConfig.ImagesForArch)
assert.Equal(t, "global-arm64-image", manager.clusterConfig.ImagesForArch.Arm64)
assert.Equal(t, "global-amd64-image", manager.clusterConfig.ImagesForArch.Amd64)
})
// Test case 3: Nodepool with nil ImagesForArch should fall back to global
t.Run("nodepool with nil imagesForArch", func(t *testing.T) {
manager := &hetznerManager{
clusterConfig: &ClusterConfig{
IsUsingNewFormat: true,
ImagesForArch: ImageList{
Arm64: "global-arm64-image",
Amd64: "global-amd64-image",
},
NodeConfigs: map[string]*NodeConfig{
"pool3": {
ImagesForArch: nil, // Explicitly nil
},
},
},
}
nodeGroup := &hetznerNodeGroup{
id: "pool3",
manager: manager,
}
nodeConfig, exists := manager.clusterConfig.NodeConfigs[nodeGroup.id]
require.True(t, exists)
assert.Nil(t, nodeConfig.ImagesForArch)
assert.Equal(t, "global-arm64-image", manager.clusterConfig.ImagesForArch.Arm64)
assert.Equal(t, "global-amd64-image", manager.clusterConfig.ImagesForArch.Amd64)
})
}
func TestImageSelectionLogic(t *testing.T) {
// Test the image selection logic that would be used in findImage function
t.Run("image selection logic", func(t *testing.T) {
manager := &hetznerManager{
clusterConfig: &ClusterConfig{
IsUsingNewFormat: true,
ImagesForArch: ImageList{
Arm64: "global-arm64-image",
Amd64: "global-amd64-image",
},
NodeConfigs: map[string]*NodeConfig{
"pool1": {
ImagesForArch: &ImageList{
Arm64: "pool1-arm64-image",
Amd64: "pool1-amd64-image",
},
},
"pool2": {
// No ImagesForArch specified
},
},
},
}
// Test pool1 (has specific imagesForArch)
nodeConfig, exists := manager.clusterConfig.NodeConfigs["pool1"]
require.True(t, exists)
require.NotNil(t, nodeConfig.ImagesForArch)
var imagesForArch *ImageList
if nodeConfig.ImagesForArch != nil {
imagesForArch = nodeConfig.ImagesForArch
} else {
imagesForArch = &manager.clusterConfig.ImagesForArch
}
assert.Equal(t, "pool1-arm64-image", imagesForArch.Arm64)
assert.Equal(t, "pool1-amd64-image", imagesForArch.Amd64)
// Test pool2 (no specific imagesForArch, should use global)
nodeConfig, exists = manager.clusterConfig.NodeConfigs["pool2"]
require.True(t, exists)
assert.Nil(t, nodeConfig.ImagesForArch)
if nodeConfig.ImagesForArch != nil {
imagesForArch = nodeConfig.ImagesForArch
} else {
imagesForArch = &manager.clusterConfig.ImagesForArch
}
assert.Equal(t, "global-arm64-image", imagesForArch.Arm64)
assert.Equal(t, "global-amd64-image", imagesForArch.Amd64)
})
}

View File

@ -461,12 +461,6 @@ func (m *ociManagerImpl) GetExistingNodePoolSizeViaCompute(np NodePool) (int, er
if !strings.HasPrefix(*item.DisplayName, displayNamePrefix) {
continue
}
// A node pool can fail to scale up if there's no capacity in the region. In that case, the node pool will be
// returned by the API, but it will not actually exist or have an ID, so we don't want to tell the autoscaler about it.
if *item.Id == "" {
klog.V(4).Infof("skipping node as it doesn't have a scaled-up instance")
continue
}
switch item.LifecycleState {
case core.InstanceLifecycleStateStopped, core.InstanceLifecycleStateTerminated:
klog.V(4).Infof("skipping instance is in stopped/terminated state: %q", *item.Id)
@ -525,25 +519,23 @@ func (m *ociManagerImpl) GetNodePoolNodes(np NodePool) ([]cloudprovider.Instance
nodePool, err := m.nodePoolCache.get(np.Id())
if err != nil {
klog.Error(err, "error while performing GetNodePoolNodes call")
return nil, err
}
var instances []cloudprovider.Instance
for _, node := range nodePool.Nodes {
// A node pool can fail to scale up if there's no capacity in the region. In that case, the node pool will be
// returned by the API, but it will not actually exist or have an ID, so we don't want to tell the autoscaler about it.
if *node.Id == "" {
klog.V(4).Infof("skipping node as it doesn't have a scaled-up instance")
continue
}
if node.NodeError != nil {
// We should move away from the approach of determining a node error as a Out of host capacity
// through string comparison. An error code specifically for Out of host capacity must be set
// and returned in the API response.
errorClass := cloudprovider.OtherErrorClass
if *node.NodeError.Code == "LimitExceeded" ||
(*node.NodeError.Code == "InternalServerError" &&
strings.Contains(*node.NodeError.Message, "quota")) {
*node.NodeError.Code == "QuotaExceeded" ||
(*node.NodeError.Code == "InternalError" &&
strings.Contains(*node.NodeError.Message, "Out of host capacity")) {
errorClass = cloudprovider.OutOfResourcesErrorClass
}

View File

@ -6,12 +6,11 @@ package nodepools
import (
"context"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts"
"net/http"
"reflect"
"testing"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts"
apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/vendor-internal/github.com/oracle/oci-go-sdk/v65/common"
@ -120,16 +119,10 @@ func TestGetNodePoolNodes(t *testing.T) {
{
Id: common.String("node8"),
NodeError: &oke.NodeError{
Code: common.String("InternalServerError"),
Message: common.String("blah blah quota exceeded blah blah"),
Code: common.String("InternalError"),
Message: common.String("blah blah Out of host capacity blah blah"),
},
},
{
// This case happens if a node fails to scale up due to lack of capacity in the region.
// It's not a real node, so we shouldn't return it in the list of nodes.
Id: common.String(""),
LifecycleState: oke.NodeLifecycleStateCreating,
},
},
}
@ -186,8 +179,8 @@ func TestGetNodePoolNodes(t *testing.T) {
State: cloudprovider.InstanceCreating,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "InternalServerError",
ErrorMessage: "blah blah quota exceeded blah blah",
ErrorCode: "InternalError",
ErrorMessage: "blah blah Out of host capacity blah blah",
},
},
},

View File

@ -214,6 +214,27 @@ func (np *nodePool) DecreaseTargetSize(delta int) error {
}
}
klog.V(4).Infof("DECREASE_TARGET_CHECK_VIA_COMPUTE: %v", decreaseTargetCheckViaComputeBool)
np.manager.InvalidateAndRefreshCache()
nodes, err := np.manager.GetNodePoolNodes(np)
if err != nil {
klog.V(4).Error(err, "error while performing GetNodePoolNodes call")
return err
}
// We do not have an OCI API that allows us to delete a node without a compute instance. So we rely on
// the approach below to determine the number of running instances in a nodepool from the compute API and
// update the size of the nodepool accordingly. We should move away from this approach once we have an API
// to delete a specific node without a compute instance.
if !decreaseTargetCheckViaComputeBool {
for _, node := range nodes {
if node.Status != nil && node.Status.ErrorInfo != nil {
if node.Status.ErrorInfo.ErrorClass == cloudprovider.OutOfResourcesErrorClass {
klog.Infof("Using Compute to calculate nodepool size as nodepool may contain nodes without a compute instance.")
decreaseTargetCheckViaComputeBool = true
break
}
}
}
}
var nodesLen int
if decreaseTargetCheckViaComputeBool {
nodesLen, err = np.manager.GetExistingNodePoolSizeViaCompute(np)
@ -222,12 +243,6 @@ func (np *nodePool) DecreaseTargetSize(delta int) error {
return err
}
} else {
np.manager.InvalidateAndRefreshCache()
nodes, err := np.manager.GetNodePoolNodes(np)
if err != nil {
klog.V(4).Error(err, "error while performing GetNodePoolNodes call")
return err
}
nodesLen = len(nodes)
}

View File

@ -16,12 +16,12 @@ There are a wide variety of use cases here. Some examples are as follows:
## Configuration options
As using this expander requires communication with another service, users must specify a few options as CLI arguments.
```yaml
--grpcExpanderUrl
```bash
--grpc-expander-url
```
URL of the gRPC Expander server, for CA to communicate with.
```yaml
--grpcExpanderCert
```bash
--grpc-expander-cert
```
Location of the volume mounted certificate of the gRPC server if it is configured to communicate over TLS
@ -32,7 +32,7 @@ service. Note that the `protos/expander.pb.go` generated protobuf code will also
Communication between Cluster Autoscaler and the gRPC Server will occur over native kube-proxy. To use this, note the Service and Namespace the gRPC server is deployed in.
Deploy the gRPC Expander Server as a separate app, listening on a specific port number.
Start Cluster Autoscaler with the `--grpcExapnderURl=SERVICE_NAME.NAMESPACE_NAME.svc.cluster.local:PORT_NUMBER` flag, as well as `--grpcExpanderCert` pointed at the location of the volume mounted certificate of the gRPC server.
Start Cluster Autoscaler with the `--grpc-expander-url=SERVICE_NAME.NAMESPACE_NAME.svc.cluster.local:PORT_NUMBER` flag, as well as `--grpc-expander-cert` pointed at the location of the volume mounted certificate of the gRPC server.
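As a concrete sketch (the service name, namespace, port, and certificate path below are placeholders for your own deployment):

```bash
# sketch: select the gRPC expander and point it at the in-cluster server
./cluster-autoscaler \
  --expander=grpc \
  --grpc-expander-url=grpc-expander.kube-system.svc.cluster.local:7000 \
  --grpc-expander-cert=/etc/certs/grpc-expander-ca.crt
```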
## Details

View File

@ -43,6 +43,21 @@ spec:
- containerPort: 8000
- name: prometheus
containerPort: 8944
livenessProbe:
httpGet:
path: /health-check
port: prometheus
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
readinessProbe:
httpGet:
path: /health-check
port: prometheus
scheme: HTTP
periodSeconds: 10
failureThreshold: 3
volumes:
- name: tls-certs
secret:

View File

@ -41,3 +41,18 @@ spec:
ports:
- name: prometheus
containerPort: 8942
livenessProbe:
httpGet:
path: /health-check
port: prometheus
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
readinessProbe:
httpGet:
path: /health-check
port: prometheus
scheme: HTTP
periodSeconds: 10
failureThreshold: 3

View File

@ -41,3 +41,18 @@ spec:
ports:
- name: prometheus
containerPort: 8942
livenessProbe:
httpGet:
path: /health-check
port: prometheus
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
readinessProbe:
httpGet:
path: /health-check
port: prometheus
scheme: HTTP
periodSeconds: 10
failureThreshold: 3

View File

@ -32,3 +32,18 @@ spec:
ports:
- name: prometheus
containerPort: 8942
livenessProbe:
httpGet:
path: /health-check
port: prometheus
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
readinessProbe:
httpGet:
path: /health-check
port: prometheus
scheme: HTTP
periodSeconds: 10
failureThreshold: 3

View File

@ -37,3 +37,18 @@ spec:
ports:
- name: prometheus
containerPort: 8943
livenessProbe:
httpGet:
path: /health-check
port: prometheus
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
readinessProbe:
httpGet:
path: /health-check
port: prometheus
scheme: HTTP
periodSeconds: 10
failureThreshold: 3

View File

@ -0,0 +1,229 @@
# AEP-8026: Allow per-VPA component configuration parameters
<!-- toc -->
- [Summary](#summary)
- [Motivation](#motivation)
- [Goals](#goals)
- [Non-Goals](#non-goals)
- [Proposal](#proposal)
- [Parameter Descriptions](#parameter-descriptions)
- [Container Policy Parameters](#container-policy-parameters)
- [Update Policy Parameters](#update-policy-parameters)
- [Design Details](#design-details)
- [API Changes](#api-changes)
- [Phase 1 (Current Proposal)](#phase-1-current-proposal)
- [Future Extensions](#future-extensions)
- [Feature Enablement and Rollback](#feature-enablement-and-rollback)
- [How can this feature be enabled / disabled in a live cluster?](#how-can-this-feature-be-enabled--disabled-in-a-live-cluster)
- [Kubernetes version compatibility](#kubernetes-version-compatibility)
- [Validation via CEL and Testing](#validation-via-cel-and-testing)
- [Test Plan](#test-plan)
- [Implementation History](#implementation-history)
- [Future Work](#future-work)
- [Alternatives](#alternatives)
- [Multiple VPA Deployments](#multiple-vpa-deployments)
- [Environment-Specific Configuration](#environment-specific-configuration)
<!-- /toc -->
## Summary
Currently, VPA components (recommender, updater, admission controller) are configured through global flags. This makes it challenging to support different workloads with varying resource optimization needs within the same cluster. This proposal introduces the ability to specify configuration parameters at the individual VPA object level, allowing for workload-specific optimization strategies.
## Motivation
Different types of workloads in a Kubernetes cluster often have different resource optimization requirements. For example:
- Batch processing jobs might benefit from aggressive OOM handling and frequent adjustments
- User-facing services might need more conservative growth patterns for stability
- Development environments might need different settings than production
Currently, supporting these different needs requires running multiple VPA component instances with different configurations, which increases operational complexity and resource usage.
### Goals
- Allow specification of component-specific parameters in individual VPA objects
- Support different optimization strategies for different workloads in the same cluster
- Maintain backward compatibility with existing global configuration
- Initially support the following parameters:
- oomBumpUpRatio
- oomMinBumpUp
- memoryAggregationInterval
- memoryAggregationIntervalCount
- evictAfterOOMThreshold
### Non-Goals
- Converting all existing VPA flags to per-object configuration
- Changing the core VPA algorithm or its decision-making process
- Adding new optimization strategies
## Proposal
The configuration will be split into two sections: container-specific recommendations under `containerPolicies` and updater configuration under `updatePolicy`. This structure is designed to be extensible, allowing for additional parameters to be added in future iterations of this enhancement.
```yaml
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
name: oom-test-vpa
spec:
targetRef:
apiVersion: apps/v1
kind: Deployment
name: oom-test
updatePolicy:
updateMode: Auto
evictAfterOOMThreshold: "5m"
resourcePolicy:
containerPolicies:
- containerName: "*"
oomBumpUpRatio: "1.5"
oomMinBumpUp: 104857600
memoryAggregationInterval: "12h"
memoryAggregationIntervalCount: 5
```
### Parameter Descriptions
#### Container Policy Parameters
* `oomBumpUpRatio` (Quantity):
- Multiplier applied to memory recommendations after OOM events
- Represented as a Quantity (e.g., "1.5")
- Must be greater than 1
- Controls how aggressively memory is increased after container crashes
* `oomMinBumpUp` (bytes):
- Minimum absolute memory increase after OOM events
- Ensures meaningful increases even for small containers
* `memoryAggregationInterval` (duration):
- Time window for aggregating memory usage data
- Affects how quickly VPA responds to memory usage changes
* `memoryAggregationIntervalCount` (integer):
- Number of consecutive memory aggregation intervals
- Used to calculate the total memory aggregation window length
- Total window length = memoryAggregationInterval * memoryAggregationIntervalCount
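For example, with the values from the sample manifest above (`memoryAggregationInterval: "12h"` and `memoryAggregationIntervalCount: 5`), the total memory aggregation window is 60 hours.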
#### Update Policy Parameters
* `evictAfterOOMThreshold` (duration):
- Time to wait after OOM before considering pod eviction
- Helps prevent rapid eviction cycles while maintaining stability
Each parameter can be configured independently, falling back to global defaults if not specified. Values should be chosen based on workload characteristics and stability requirements.
## Design Details
### API Changes
#### Phase 1 (Current Proposal)
Extend `ContainerResourcePolicy` with:
* `oomBumpUpRatio`
* `oomMinBumpUp`
* `memoryAggregationInterval`
* `memoryAggregationIntervalCount`
Extend `PodUpdatePolicy` with:
* `evictAfterOOMThreshold`
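For illustration, the Phase 1 additions could look roughly like the Go sketch below; the field names and types are inferred from the parameter descriptions above and are not authoritative:

```go
package v1

import (
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// ContainerResourcePolicySketch shows the proposed container-level fields.
// All fields are optional and fall back to the global flags when unset.
type ContainerResourcePolicySketch struct {
	// Multiplier applied to memory recommendations after an OOM event; must be > 1.
	OOMBumpUpRatio *resource.Quantity `json:"oomBumpUpRatio,omitempty"`
	// Minimum absolute memory increase (bytes) after an OOM event.
	OOMMinBumpUp *int64 `json:"oomMinBumpUp,omitempty"`
	// Time window for aggregating memory usage samples.
	MemoryAggregationInterval *metav1.Duration `json:"memoryAggregationInterval,omitempty"`
	// Number of consecutive aggregation intervals forming the total window.
	MemoryAggregationIntervalCount *int64 `json:"memoryAggregationIntervalCount,omitempty"`
}

// PodUpdatePolicySketch shows the proposed updater-level field.
type PodUpdatePolicySketch struct {
	// Time to wait after an OOM before the updater considers eviction.
	EvictAfterOOMThreshold *metav1.Duration `json:"evictAfterOOMThreshold,omitempty"`
}
```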
#### Future Extensions
This AEP will be updated as additional parameters are identified for per-object configuration. Potential candidates include:
* `confidenceIntervalCPU`
* `confidenceIntervalMemory`
* `recommendationMarginFraction`
* Other parameters that benefit from workload-specific tuning
### Feature Enablement and Rollback
#### How can this feature be enabled / disabled in a live cluster?
- Feature gate name: `PerVPAConfig`
- Default: Off (Alpha)
- Components depending on the feature gate:
- admission-controller
- recommender
- updater
The feature gate will remain in alpha (default off) until:
- All planned configuration parameters have been implemented and tested
- Performance impact has been thoroughly evaluated
- Documentation is complete for all parameters
Disabling of feature gate `PerVPAConfig` will cause the following to happen:
- Any per-VPA configuration parameters specified in VPA objects will be ignored
- Components will fall back to using their global configuration values
Enabling of feature gate `PerVPAConfig` will cause the following to happen:
- VPA components will honor the per-VPA configuration parameters specified in VPA objects
- Validation will be performed on the configuration parameters
- Configuration parameters will override global defaults for the specific VPA object
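For illustration, and assuming the three components accept a `--feature-gates` flag as with other VPA feature gates, enabling the feature might look like:

```bash
# hypothetical invocations; binary names shown for illustration only
./recommender           --feature-gates=PerVPAConfig=true
./updater               --feature-gates=PerVPAConfig=true
./admission-controller  --feature-gates=PerVPAConfig=true
```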
### Kubernetes version compatibility
The `PerVPAConfig` feature requires VPA version 1.5.0 or higher. The feature is being introduced as alpha and will follow the standard Kubernetes feature gate graduation process:
- Alpha: v1.5.0 (default off)
- Beta: TBD (default on)
- GA: TBD (default on)
### Validation via CEL and Testing
Initial validation rules (CEL):
* `oomMinBumpUp` > 0
* `memoryAggregationInterval` > 0
* `evictAfterOOMThreshold` > 0
* `memoryAggregationIntervalCount` > 0
Validation via Admission Controller:
Some parameters cannot be validated using Common Expression Language (CEL); that validation is performed within the admission controller instead.
* `oomBumpUpRatio`: validated using the Kubernetes Quantity type; the value must be greater than 1.
Additional validation rules will be added as new parameters are introduced.
E2E tests will be included to verify:
* Different configurations are properly applied and respected by VPA components
* VPA behavior matches expected outcomes for different parameter combinations
* Proper fallback to global configuration when parameters are not specified
### Test Plan
- Unit tests for new API fields and validation
- Integration tests verifying different configurations are properly applied
- E2E tests comparing behavior with different configurations
- Upgrade tests ensuring backward compatibility
## Implementation History
- 2025-04-12: Initial proposal
- Future: Additional parameters will be added based on user feedback and requirements
## Future Work
This enhancement is designed to be extensible. As the VPA evolves and users provide feedback, additional parameters may be added to the per-object configuration. Each new parameter will:
1. Be documented in this AEP
2. Include appropriate validation rules
3. Maintain backward compatibility
4. Follow the same pattern of falling back to global configuration when not specified
The decision to add new parameters will be based on:
- User feedback and requirements
- Performance impact analysis
- Implementation complexity
- Maintenance considerations
## Alternatives
### Multiple VPA Deployments
Continue with current approach of running multiple VPA deployments with different configurations:
- Pros: No API changes needed
- Cons: Higher resource usage, operational complexity
### Environment-Specific Configuration
Use different VPA deployments per environment (dev/staging/prod):
- Pros: Simpler than per-workload configuration
- Cons: Less flexible, doesn't address varying needs within same environment

View File

@ -41,6 +41,7 @@ require (
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect

View File

@ -81,6 +81,7 @@ type ClusterStateFeederFactory struct {
KubeClient kube_client.Interface
MetricsClient metrics.MetricsClient
VpaCheckpointClient vpa_api.VerticalPodAutoscalerCheckpointsGetter
VpaCheckpointLister vpa_lister.VerticalPodAutoscalerCheckpointLister
VpaLister vpa_lister.VerticalPodAutoscalerLister
PodLister v1lister.PodLister
OOMObserver oom.Observer
@ -99,6 +100,7 @@ func (m ClusterStateFeederFactory) Make() *clusterStateFeeder {
metricsClient: m.MetricsClient,
oomChan: m.OOMObserver.GetObservedOomsChannel(),
vpaCheckpointClient: m.VpaCheckpointClient,
vpaCheckpointLister: m.VpaCheckpointLister,
vpaLister: m.VpaLister,
clusterState: m.ClusterState,
specClient: spec.NewSpecClient(m.PodLister),
@ -206,6 +208,7 @@ type clusterStateFeeder struct {
metricsClient metrics.MetricsClient
oomChan <-chan oom.OomInfo
vpaCheckpointClient vpa_api.VerticalPodAutoscalerCheckpointsGetter
vpaCheckpointLister vpa_lister.VerticalPodAutoscalerCheckpointLister
vpaLister vpa_lister.VerticalPodAutoscalerLister
clusterState model.ClusterState
selectorFetcher target.VpaTargetSelectorFetcher
@ -267,25 +270,29 @@ func (feeder *clusterStateFeeder) InitFromCheckpoints(ctx context.Context) {
klog.V(3).InfoS("Initializing VPA from checkpoints")
feeder.LoadVPAs(ctx)
klog.V(3).InfoS("Fetching VPA checkpoints")
checkpointList, err := feeder.vpaCheckpointLister.List(labels.Everything())
if err != nil {
klog.ErrorS(err, "Cannot list VPA checkpoints")
}
namespaces := make(map[string]bool)
for _, v := range feeder.clusterState.VPAs() {
namespaces[v.ID.Namespace] = true
}
for namespace := range namespaces {
klog.V(3).InfoS("Fetching checkpoints", "namespace", namespace)
checkpointList, err := feeder.vpaCheckpointClient.VerticalPodAutoscalerCheckpoints(namespace).List(ctx, metav1.ListOptions{})
if err != nil {
klog.ErrorS(err, "Cannot list VPA checkpoints", "namespace", namespace)
if feeder.shouldIgnoreNamespace(namespace) {
klog.V(3).InfoS("Skipping loading VPA Checkpoints from namespace.", "namespace", namespace, "vpaObjectNamespace", feeder.vpaObjectNamespace, "ignoredNamespaces", feeder.ignoredNamespaces)
continue
}
for _, checkpoint := range checkpointList.Items {
for _, checkpoint := range checkpointList {
klog.V(3).InfoS("Loading checkpoint for VPA", "checkpoint", klog.KRef(checkpoint.Namespace, checkpoint.Spec.VPAObjectName), "container", checkpoint.Spec.ContainerName)
err = feeder.setVpaCheckpoint(&checkpoint)
err = feeder.setVpaCheckpoint(checkpoint)
if err != nil {
klog.ErrorS(err, "Error while loading checkpoint")
}
}
}
}
@ -345,11 +352,12 @@ func (feeder *clusterStateFeeder) shouldIgnoreNamespace(namespace string) bool {
func (feeder *clusterStateFeeder) cleanupCheckpointsForNamespace(ctx context.Context, namespace string, allVPAKeys map[model.VpaID]bool) error {
var err error
checkpointList, err := feeder.vpaCheckpointClient.VerticalPodAutoscalerCheckpoints(namespace).List(ctx, metav1.ListOptions{})
checkpointList, err := feeder.vpaCheckpointLister.VerticalPodAutoscalerCheckpoints(namespace).List(labels.Everything())
if err != nil {
return err
}
for _, checkpoint := range checkpointList.Items {
for _, checkpoint := range checkpointList {
vpaID := model.VpaID{Namespace: checkpoint.Namespace, VpaName: checkpoint.Spec.VPAObjectName}
if !allVPAKeys[vpaID] {
if errFeeder := feeder.vpaCheckpointClient.VerticalPodAutoscalerCheckpoints(namespace).Delete(ctx, checkpoint.Name, metav1.DeleteOptions{}); errFeeder != nil {
@ -573,6 +581,9 @@ func (feeder *clusterStateFeeder) validateTargetRef(ctx context.Context, vpa *vp
if vpa.Spec.TargetRef == nil {
return false, condition{}
}
target := fmt.Sprintf("%s.%s/%s", vpa.Spec.TargetRef.APIVersion, vpa.Spec.TargetRef.Kind, vpa.Spec.TargetRef.Name)
k := controllerfetcher.ControllerKeyWithAPIVersion{
ControllerKey: controllerfetcher.ControllerKey{
Namespace: vpa.Namespace,
@ -583,13 +594,13 @@ func (feeder *clusterStateFeeder) validateTargetRef(ctx context.Context, vpa *vp
}
top, err := feeder.controllerFetcher.FindTopMostWellKnownOrScalable(ctx, &k)
if err != nil {
return false, condition{conditionType: vpa_types.ConfigUnsupported, delete: false, message: fmt.Sprintf("Error checking if target is a topmost well-known or scalable controller: %s", err)}
return false, condition{conditionType: vpa_types.ConfigUnsupported, delete: false, message: fmt.Sprintf("Error checking if target %s is a topmost well-known or scalable controller: %s", target, err)}
}
if top == nil {
return false, condition{conditionType: vpa_types.ConfigUnsupported, delete: false, message: fmt.Sprintf("Unknown error during checking if target is a topmost well-known or scalable controller: %s", err)}
return false, condition{conditionType: vpa_types.ConfigUnsupported, delete: false, message: fmt.Sprintf("Unknown error during checking if target %s is a topmost well-known or scalable controller", target)}
}
if *top != k {
return false, condition{conditionType: vpa_types.ConfigUnsupported, delete: false, message: "The targetRef controller has a parent but it should point to a topmost well-known or scalable controller"}
return false, condition{conditionType: vpa_types.ConfigUnsupported, delete: false, message: fmt.Sprintf("The target %s has a parent controller but it should point to a topmost well-known or scalable controller", target)}
}
return true, condition{}
}

View File

@ -65,8 +65,8 @@ var (
unsupportedConditionTextFromFetcher = "Cannot read targetRef. Reason: targetRef not defined"
unsupportedConditionNoExtraText = "Cannot read targetRef"
unsupportedConditionNoTargetRef = "Cannot read targetRef"
unsupportedConditionMudaMudaMuda = "Error checking if target is a topmost well-known or scalable controller: muda muda muda"
unsupportedTargetRefHasParent = "The targetRef controller has a parent but it should point to a topmost well-known or scalable controller"
unsupportedConditionMudaMudaMuda = "Error checking if target taxonomy.doestar/doseph-doestar is a topmost well-known or scalable controller: muda muda muda"
unsupportedTargetRefHasParent = "The target stardust.dodokind/dotaro has a parent controller but it should point to a topmost well-known or scalable controller"
)
const (
@ -859,11 +859,12 @@ func TestFilterVPAsIgnoreNamespaces(t *testing.T) {
func TestCanCleanupCheckpoints(t *testing.T) {
_, tctx := ktesting.NewTestContext(t)
client := fake.NewSimpleClientset()
namespace := "testNamespace"
_, err := client.CoreV1().Namespaces().Create(tctx, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "testNamespace"}}, metav1.CreateOptions{})
_, err := client.CoreV1().Namespaces().Create(tctx, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}}, metav1.CreateOptions{})
assert.NoError(t, err)
vpaBuilder := test.VerticalPodAutoscaler().WithContainer("container").WithNamespace("testNamespace").WithTargetRef(&autoscalingv1.CrossVersionObjectReference{
vpaBuilder := test.VerticalPodAutoscaler().WithContainer("container").WithNamespace(namespace).WithTargetRef(&autoscalingv1.CrossVersionObjectReference{
Kind: kind,
Name: name1,
APIVersion: apiVersion,
@ -878,22 +879,19 @@ func TestCanCleanupCheckpoints(t *testing.T) {
vpaLister := &test.VerticalPodAutoscalerListerMock{}
vpaLister.On("List").Return(vpas, nil)
checkpoints := &vpa_types.VerticalPodAutoscalerCheckpointList{
Items: []vpa_types.VerticalPodAutoscalerCheckpoint{
{
ObjectMeta: metav1.ObjectMeta{
Namespace: "testNamespace",
Name: "nonExistentVPA",
},
Spec: vpa_types.VerticalPodAutoscalerCheckpointSpec{
VPAObjectName: "nonExistentVPA",
},
},
vpaCheckPoint := &vpa_types.VerticalPodAutoscalerCheckpoint{
ObjectMeta: metav1.ObjectMeta{
Namespace: namespace,
Name: "nonExistentVPA",
},
Spec: vpa_types.VerticalPodAutoscalerCheckpointSpec{
VPAObjectName: "nonExistentVPA",
},
}
vpacheckpoints := []*vpa_types.VerticalPodAutoscalerCheckpoint{vpaCheckPoint}
for _, vpa := range vpas {
checkpoints.Items = append(checkpoints.Items, vpa_types.VerticalPodAutoscalerCheckpoint{
vpacheckpoints = append(vpacheckpoints, &vpa_types.VerticalPodAutoscalerCheckpoint{
ObjectMeta: metav1.ObjectMeta{
Namespace: vpa.Namespace,
Name: vpa.Name,
@ -904,23 +902,29 @@ func TestCanCleanupCheckpoints(t *testing.T) {
})
}
// Create a mock checkpoint client to track deletions
checkpointClient := &fakeautoscalingv1.FakeAutoscalingV1{Fake: &core.Fake{}}
checkpointClient.AddReactor("list", "verticalpodautoscalercheckpoints", func(action core.Action) (bool, runtime.Object, error) {
return true, checkpoints, nil
})
// Track deleted checkpoints
deletedCheckpoints := []string{}
checkpointClient.AddReactor("delete", "verticalpodautoscalercheckpoints", func(action core.Action) (bool, runtime.Object, error) {
deleteAction := action.(core.DeleteAction)
deletedCheckpoints = append(deletedCheckpoints, deleteAction.GetName())
return true, nil, nil
})
// Create namespace lister mock that will return the checkpoint list
checkpointNamespaceLister := &test.VerticalPodAutoscalerCheckPointListerMock{}
checkpointNamespaceLister.On("List").Return(vpacheckpoints, nil)
// Create main checkpoint mock that will return the namespace lister
checkpointLister := &test.VerticalPodAutoscalerCheckPointListerMock{}
checkpointLister.On("VerticalPodAutoscalerCheckpoints", namespace).Return(checkpointNamespaceLister)
feeder := clusterStateFeeder{
coreClient: client.CoreV1(),
vpaLister: vpaLister,
vpaCheckpointClient: checkpointClient,
vpaCheckpointLister: checkpointLister,
clusterState: model.NewClusterState(testGcPeriod),
recommenderName: "default",
}

View File

@ -297,6 +297,7 @@ func run(ctx context.Context, healthCheck *metrics.HealthCheck, commonFlag *comm
MetricsClient: input_metrics.NewMetricsClient(source, commonFlag.VpaObjectNamespace, "default-metrics-client"),
VpaCheckpointClient: vpa_clientset.NewForConfigOrDie(config).AutoscalingV1(),
VpaLister: vpa_api_util.NewVpasLister(vpa_clientset.NewForConfigOrDie(config), make(chan struct{}), commonFlag.VpaObjectNamespace),
VpaCheckpointLister: vpa_api_util.NewVpaCheckpointLister(vpa_clientset.NewForConfigOrDie(config), make(chan struct{}), commonFlag.VpaObjectNamespace),
ClusterState: clusterState,
SelectorFetcher: target.NewVpaTargetSelectorFetcher(config, kubeClient, factory),
MemorySaveMode: *memorySaver,

View File

@ -230,7 +230,8 @@ func (u *updater) RunOnce(ctx context.Context) {
// to contain only Pods controlled by a VPA in auto, recreate, or inPlaceOrRecreate mode
for vpa, livePods := range controlledPods {
vpaSize := len(livePods)
controlledPodsCounter.Add(vpaSize, vpaSize)
updateMode := vpa_api_util.GetUpdateMode(vpa)
controlledPodsCounter.Add(vpaSize, updateMode, vpaSize)
creatorToSingleGroupStatsMap, podToReplicaCreatorMap, err := u.restrictionFactory.GetCreatorMaps(livePods, vpa)
if err != nil {
klog.ErrorS(err, "Failed to get creator maps")
@ -242,7 +243,6 @@ func (u *updater) RunOnce(ctx context.Context) {
podsForInPlace := make([]*apiv1.Pod, 0)
podsForEviction := make([]*apiv1.Pod, 0)
updateMode := vpa_api_util.GetUpdateMode(vpa)
if updateMode == vpa_types.UpdateModeInPlaceOrRecreate && features.Enabled(features.InPlaceOrRecreate) {
podsForInPlace = u.getPodsUpdateOrder(filterNonInPlaceUpdatablePods(livePods, inPlaceLimiter), vpa)
@ -253,7 +253,7 @@ func (u *updater) RunOnce(ctx context.Context) {
klog.InfoS("Warning: feature gate is not enabled for this updateMode", "featuregate", features.InPlaceOrRecreate, "updateMode", vpa_types.UpdateModeInPlaceOrRecreate)
}
podsForEviction = u.getPodsUpdateOrder(filterNonEvictablePods(livePods, evictionLimiter), vpa)
evictablePodsCounter.Add(vpaSize, len(podsForEviction))
evictablePodsCounter.Add(vpaSize, updateMode, len(podsForEviction))
}
withInPlaceUpdatable := false
@ -304,7 +304,7 @@ func (u *updater) RunOnce(ctx context.Context) {
klog.V(0).InfoS("Eviction failed", "error", evictErr, "pod", klog.KObj(pod))
} else {
withEvicted = true
metrics_updater.AddEvictedPod(vpaSize)
metrics_updater.AddEvictedPod(vpaSize, updateMode)
}
}
@ -315,10 +315,10 @@ func (u *updater) RunOnce(ctx context.Context) {
vpasWithInPlaceUpdatedPodsCounter.Add(vpaSize, 1)
}
if withEvictable {
vpasWithEvictablePodsCounter.Add(vpaSize, 1)
vpasWithEvictablePodsCounter.Add(vpaSize, updateMode, 1)
}
if withEvicted {
vpasWithEvictedPodsCounter.Add(vpaSize, 1)
vpasWithEvictedPodsCounter.Add(vpaSize, updateMode, 1)
}
}
timer.ObserveStep("EvictPods")


@ -22,6 +22,7 @@ import (
"github.com/prometheus/client_golang/prometheus"
vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/metrics"
)
@ -35,13 +36,20 @@ type SizeBasedGauge struct {
gauge *prometheus.GaugeVec
}
// UpdateModeAndSizeBasedGauge is a wrapper for incrementally recording values
// indexed by log2(VPA size) and update mode
type UpdateModeAndSizeBasedGauge struct {
values [metrics.MaxVpaSizeLog]map[vpa_types.UpdateMode]int
gauge *prometheus.GaugeVec
}
var (
controlledCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricsNamespace,
Name: "controlled_pods_total",
Help: "Number of Pods controlled by VPA updater.",
}, []string{"vpa_size_log2"},
}, []string{"vpa_size_log2", "update_mode"},
)
evictableCount = prometheus.NewGaugeVec(
@ -49,7 +57,7 @@ var (
Namespace: metricsNamespace,
Name: "evictable_pods_total",
Help: "Number of Pods matching evicition criteria.",
}, []string{"vpa_size_log2"},
}, []string{"vpa_size_log2", "update_mode"},
)
evictedCount = prometheus.NewCounterVec(
@ -57,7 +65,7 @@ var (
Namespace: metricsNamespace,
Name: "evicted_pods_total",
Help: "Number of Pods evicted by Updater to apply a new recommendation.",
}, []string{"vpa_size_log2"},
}, []string{"vpa_size_log2", "update_mode"},
)
vpasWithEvictablePodsCount = prometheus.NewGaugeVec(
@ -65,7 +73,7 @@ var (
Namespace: metricsNamespace,
Name: "vpas_with_evictable_pods_total",
Help: "Number of VPA objects with at least one Pod matching evicition criteria.",
}, []string{"vpa_size_log2"},
}, []string{"vpa_size_log2", "update_mode"},
)
vpasWithEvictedPodsCount = prometheus.NewGaugeVec(
@ -73,7 +81,7 @@ var (
Namespace: metricsNamespace,
Name: "vpas_with_evicted_pods_total",
Help: "Number of VPA objects with at least one evicted Pod.",
}, []string{"vpa_size_log2"},
}, []string{"vpa_size_log2", "update_mode"},
)
inPlaceUpdatableCount = prometheus.NewGaugeVec(
@ -138,30 +146,41 @@ func newSizeBasedGauge(gauge *prometheus.GaugeVec) *SizeBasedGauge {
}
}
// newModeAndSizeBasedGauge provides a wrapper for counting items in a loop
func newModeAndSizeBasedGauge(gauge *prometheus.GaugeVec) *UpdateModeAndSizeBasedGauge {
g := &UpdateModeAndSizeBasedGauge{
gauge: gauge,
}
for i := range g.values {
g.values[i] = make(map[vpa_types.UpdateMode]int)
}
return g
}
// NewControlledPodsCounter returns a wrapper for counting Pods controlled by Updater
func NewControlledPodsCounter() *SizeBasedGauge {
return newSizeBasedGauge(controlledCount)
func NewControlledPodsCounter() *UpdateModeAndSizeBasedGauge {
return newModeAndSizeBasedGauge(controlledCount)
}
// NewEvictablePodsCounter returns a wrapper for counting Pods which are matching eviction criteria
func NewEvictablePodsCounter() *SizeBasedGauge {
return newSizeBasedGauge(evictableCount)
func NewEvictablePodsCounter() *UpdateModeAndSizeBasedGauge {
return newModeAndSizeBasedGauge(evictableCount)
}
// NewVpasWithEvictablePodsCounter returns a wrapper for counting VPA objects with Pods matching eviction criteria
func NewVpasWithEvictablePodsCounter() *SizeBasedGauge {
return newSizeBasedGauge(vpasWithEvictablePodsCount)
func NewVpasWithEvictablePodsCounter() *UpdateModeAndSizeBasedGauge {
return newModeAndSizeBasedGauge(vpasWithEvictablePodsCount)
}
// NewVpasWithEvictedPodsCounter returns a wrapper for counting VPA objects with evicted Pods
func NewVpasWithEvictedPodsCounter() *SizeBasedGauge {
return newSizeBasedGauge(vpasWithEvictedPodsCount)
func NewVpasWithEvictedPodsCounter() *UpdateModeAndSizeBasedGauge {
return newModeAndSizeBasedGauge(vpasWithEvictedPodsCount)
}
// AddEvictedPod increases the counter of pods evicted by Updater, by given VPA size and update mode
func AddEvictedPod(vpaSize int) {
func AddEvictedPod(vpaSize int, mode vpa_types.UpdateMode) {
log2 := metrics.GetVpaSizeLog2(vpaSize)
evictedCount.WithLabelValues(strconv.Itoa(log2)).Inc()
evictedCount.WithLabelValues(strconv.Itoa(log2), string(mode)).Inc()
}
// NewInPlaceUpdatablePodsCounter returns a wrapper for counting Pods which are matching in-place update criteria
@ -203,3 +222,19 @@ func (g *SizeBasedGauge) Observe() {
g.gauge.WithLabelValues(strconv.Itoa(log2)).Set(float64(value))
}
}
// Add increases the counter for the given VPA size and VPA update mode.
func (g *UpdateModeAndSizeBasedGauge) Add(vpaSize int, vpaUpdateMode vpa_types.UpdateMode, value int) {
log2 := metrics.GetVpaSizeLog2(vpaSize)
g.values[log2][vpaUpdateMode] += value
}
// Observe stores the recorded values in the metric associated with the wrapper.
func (g *UpdateModeAndSizeBasedGauge) Observe() {
for log2, valueMap := range g.values {
for vpaMode, value := range valueMap {
g.gauge.WithLabelValues(strconv.Itoa(log2), string(vpaMode)).Set(float64(value))
}
}
}
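For context on how the new wrapper is meant to be driven (mirroring the RunOnce hunk above): values are accumulated in memory per (size, mode) pair during one updater pass, then flushed once with Observe. A minimal, self-contained sketch; the metrics package import path is assumed from the metrics_updater alias used earlier, and the sizes and counts are made up:

package main

import (
	vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
	metrics_updater "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/metrics/updater"
)

func main() {
	// One wrapper per updater pass: Add accumulates, Observe writes the gauge samples.
	counter := metrics_updater.NewControlledPodsCounter()
	counter.Add(5, vpa_types.UpdateModeAuto, 5) // a VPA of size 5 in Auto mode controlling 5 pods
	counter.Add(8, vpa_types.UpdateModeOff, 8)  // a VPA of size 8 in Off mode controlling 8 pods
	counter.Add(5, vpa_types.UpdateModeAuto, 3) // same (size, mode) bucket: accumulates to 8
	counter.Observe()                           // one sample per (vpa_size_log2, update_mode) pair, exported once the updater metrics are registered
}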


@ -0,0 +1,297 @@
/*
Copyright 2025 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package updater
import (
"testing"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
)
func TestAddEvictedPod(t *testing.T) {
testCases := []struct {
desc string
vpaSize int
mode vpa_types.UpdateMode
log2 string
}{
{
desc: "VPA size 5, mode Auto",
vpaSize: 5,
mode: vpa_types.UpdateModeAuto,
log2: "2",
},
{
desc: "VPA size 10, mode Off",
vpaSize: 10,
mode: vpa_types.UpdateModeOff,
log2: "3",
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
t.Cleanup(evictedCount.Reset)
AddEvictedPod(tc.vpaSize, tc.mode)
val := testutil.ToFloat64(evictedCount.WithLabelValues(tc.log2, string(tc.mode)))
if val != 1 {
t.Errorf("Unexpected value for evictedCount metric with labels (%s, %s): got %v, want 1", tc.log2, string(tc.mode), val)
}
})
}
}
func TestAddInPlaceUpdatedPod(t *testing.T) {
testCases := []struct {
desc string
vpaSize int
log2 string
}{
{
desc: "VPA size 10",
vpaSize: 10,
log2: "3",
},
{
desc: "VPA size 1",
vpaSize: 1,
log2: "0",
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
t.Cleanup(inPlaceUpdatedCount.Reset)
AddInPlaceUpdatedPod(tc.vpaSize)
val := testutil.ToFloat64(inPlaceUpdatedCount.WithLabelValues(tc.log2))
if val != 1 {
t.Errorf("Unexpected value for InPlaceUpdatedPod metric with labels (%s): got %v, want 1", tc.log2, val)
}
})
}
}
func TestRecordFailedInPlaceUpdate(t *testing.T) {
testCases := []struct {
desc string
vpaSize int
reason string
log2 string
}{
{
desc: "VPA size 2, some reason",
vpaSize: 2,
reason: "some_reason",
log2: "1",
},
{
desc: "VPA size 20, another reason",
vpaSize: 20,
reason: "another_reason",
log2: "4",
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
t.Cleanup(failedInPlaceUpdateAttempts.Reset)
RecordFailedInPlaceUpdate(tc.vpaSize, tc.reason)
val := testutil.ToFloat64(failedInPlaceUpdateAttempts.WithLabelValues(tc.log2, tc.reason))
if val != 1 {
t.Errorf("Unexpected value for FailedInPlaceUpdate metric with labels (%s, %s): got %v, want 1", tc.log2, tc.reason, val)
}
})
}
}
func TestUpdateModeAndSizeBasedGauge(t *testing.T) {
type addition struct {
vpaSize int
mode vpa_types.UpdateMode
value int
}
type expectation struct {
labels []string
value float64
}
testCases := []struct {
desc string
newCounter func() *UpdateModeAndSizeBasedGauge
metric *prometheus.GaugeVec
metricName string
additions []addition
expectedMetrics []expectation
}{
{
desc: "ControlledPodsCounter",
newCounter: NewControlledPodsCounter,
metric: controlledCount,
metricName: "vpa_updater_controlled_pods_total",
additions: []addition{
{1, vpa_types.UpdateModeAuto, 5},
{2, vpa_types.UpdateModeOff, 10},
{2, vpa_types.UpdateModeAuto, 2},
{2, vpa_types.UpdateModeAuto, 7},
},
expectedMetrics: []expectation{
{[]string{"0" /* log2(1) */, "Auto"}, 5},
{[]string{"1" /* log2(2) */, "Auto"}, 9},
{[]string{"1" /* log2(2) */, "Off"}, 10},
},
},
{
desc: "EvictablePodsCounter",
newCounter: NewEvictablePodsCounter,
metric: evictableCount,
metricName: "vpa_updater_evictable_pods_total",
additions: []addition{
{4, vpa_types.UpdateModeAuto, 3},
{1, vpa_types.UpdateModeRecreate, 8},
},
expectedMetrics: []expectation{
{[]string{"2" /* log2(4) */, "Auto"}, 3},
{[]string{"0" /* log2(1) */, "Recreate"}, 8},
},
},
{
desc: "VpasWithEvictablePodsCounter",
newCounter: NewVpasWithEvictablePodsCounter,
metric: vpasWithEvictablePodsCount,
metricName: "vpa_updater_vpas_with_evictable_pods_total",
additions: []addition{
{1, vpa_types.UpdateModeOff, 1},
{2, vpa_types.UpdateModeAuto, 1},
},
expectedMetrics: []expectation{
{[]string{"0" /* log2(1) */, "Off"}, 1},
{[]string{"1" /* log2(2) */, "Auto"}, 1},
},
},
{
desc: "VpasWithEvictedPodsCounter",
newCounter: NewVpasWithEvictedPodsCounter,
metric: vpasWithEvictedPodsCount,
metricName: "vpa_updater_vpas_with_evicted_pods_total",
additions: []addition{
{1, vpa_types.UpdateModeAuto, 2},
{1, vpa_types.UpdateModeAuto, 3},
},
expectedMetrics: []expectation{
{[]string{"0" /* log2(1) */, "Auto"}, 5},
},
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
t.Cleanup(tc.metric.Reset)
counter := tc.newCounter()
for _, add := range tc.additions {
counter.Add(add.vpaSize, add.mode, add.value)
}
counter.Observe()
for _, expected := range tc.expectedMetrics {
val := testutil.ToFloat64(tc.metric.WithLabelValues(expected.labels...))
if val != expected.value {
t.Errorf("Unexpected value for metric %s with labels %v: got %v, want %v", tc.metricName, expected.labels, val, expected.value)
}
}
})
}
}
func TestSizeBasedGauge(t *testing.T) {
type addition struct {
vpaSize int
value int
}
type expectation struct {
labels []string
value float64
}
testCases := []struct {
desc string
newCounter func() *SizeBasedGauge
metric *prometheus.GaugeVec
metricName string
additions []addition
expectedMetrics []expectation
}{
{
desc: "InPlaceUpdatablePodsCounter",
newCounter: NewInPlaceUpdatablePodsCounter,
metric: inPlaceUpdatableCount,
metricName: "vpa_updater_in_place_updatable_pods_total",
additions: []addition{
{1, 5},
{2, 10},
},
expectedMetrics: []expectation{
{[]string{"0" /* log2(1) */}, 5},
{[]string{"1" /* log2(2) */}, 10},
},
},
{
desc: "VpasWithInPlaceUpdatablePodsCounter",
newCounter: NewVpasWithInPlaceUpdatablePodsCounter,
metric: vpasWithInPlaceUpdatablePodsCount,
metricName: "vpa_updater_vpas_with_in_place_updatable_pods_total",
additions: []addition{
{10, 1},
{20, 1},
},
expectedMetrics: []expectation{
{[]string{"3" /* log2(10) */}, 1},
{[]string{"4" /* log2(20) */}, 1},
},
},
{
desc: "VpasWithInPlaceUpdatedPodsCounter",
newCounter: NewVpasWithInPlaceUpdatedPodsCounter,
metric: vpasWithInPlaceUpdatedPodsCount,
metricName: "vpa_updater_vpas_with_in_place_updated_pods_total",
additions: []addition{
{2, 4},
{4, 5},
},
expectedMetrics: []expectation{
{[]string{"1" /* log2(2) */}, 4},
{[]string{"2" /* log2(4) */}, 5},
},
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
t.Cleanup(tc.metric.Reset)
counter := tc.newCounter()
for _, add := range tc.additions {
counter.Add(add.vpaSize, add.value)
}
counter.Observe()
for _, expected := range tc.expectedMetrics {
val := testutil.ToFloat64(tc.metric.WithLabelValues(expected.labels...))
if val != expected.value {
t.Errorf("Unexpected value for metric %s with labels %v: got %v, want %v", tc.metricName, expected.labels, val, expected.value)
}
}
})
}
}


@ -200,6 +200,36 @@ func (m *VerticalPodAutoscalerListerMock) Get(name string) (*vpa_types.VerticalP
return nil, fmt.Errorf("unimplemented")
}
// VerticalPodAutoscalerCheckPointListerMock is a mock of VerticalPodAutoscalerCheckpointLister
type VerticalPodAutoscalerCheckPointListerMock struct {
mock.Mock
}
// List is a mock implementation of VerticalPodAutoscalerCheckpointLister.List
func (m *VerticalPodAutoscalerCheckPointListerMock) List(selector labels.Selector) (ret []*vpa_types.VerticalPodAutoscalerCheckpoint, err error) {
args := m.Called()
var returnArg []*vpa_types.VerticalPodAutoscalerCheckpoint
if args.Get(0) != nil {
returnArg = args.Get(0).([]*vpa_types.VerticalPodAutoscalerCheckpoint)
}
return returnArg, args.Error(1)
}
// VerticalPodAutoscalerCheckpoints is a mock implementation of returning a lister for namespace.
func (m *VerticalPodAutoscalerCheckPointListerMock) VerticalPodAutoscalerCheckpoints(namespace string) vpa_lister.VerticalPodAutoscalerCheckpointNamespaceLister {
args := m.Called(namespace)
var returnArg vpa_lister.VerticalPodAutoscalerCheckpointNamespaceLister
if args.Get(0) != nil {
returnArg = args.Get(0).(vpa_lister.VerticalPodAutoscalerCheckpointNamespaceLister)
}
return returnArg
}
// Get is not implemented for this mock
func (m *VerticalPodAutoscalerCheckPointListerMock) Get(name string) (*vpa_types.VerticalPodAutoscalerCheckpoint, error) {
return nil, fmt.Errorf("unimplemented")
}
// VerticalPodAutoscalerV1Beta1ListerMock is a mock of VerticalPodAutoscalerLister or
// VerticalPodAutoscalerNamespaceLister - the crucial List method is the same.
type VerticalPodAutoscalerV1Beta1ListerMock struct {


@ -107,6 +107,36 @@ func NewVpasLister(vpaClient *vpa_clientset.Clientset, stopChannel <-chan struct
return vpaLister
}
// NewVpaCheckpointLister returns a VerticalPodAutoscalerCheckpointLister configured to fetch all
// VerticalPodAutoscalerCheckpoint objects from the given namespace; set namespace to k8sapiv1.NamespaceAll to select all namespaces.
// The method blocks until the checkpoint lister is initially populated.
func NewVpaCheckpointLister(vpaClient *vpa_clientset.Clientset, stopChannel <-chan struct{}, namespace string) vpa_lister.VerticalPodAutoscalerCheckpointLister {
vpaListWatch := cache.NewListWatchFromClient(vpaClient.AutoscalingV1().RESTClient(), "verticalpodautoscalercheckpoints", namespace, fields.Everything())
informerOptions := cache.InformerOptions{
ObjectType: &vpa_types.VerticalPodAutoscalerCheckpoint{},
ListerWatcher: vpaListWatch,
Handler: &cache.ResourceEventHandlerFuncs{},
ResyncPeriod: 1 * time.Hour,
Indexers: cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc},
}
store, controller := cache.NewInformerWithOptions(informerOptions)
indexer, ok := store.(cache.Indexer)
if !ok {
klog.ErrorS(nil, "Expected Indexer, but got a Store that does not implement Indexer")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}
vpaCheckpointLister := vpa_lister.NewVerticalPodAutoscalerCheckpointLister(indexer)
go controller.Run(stopChannel)
if !cache.WaitForCacheSync(stopChannel, controller.HasSynced) {
klog.ErrorS(nil, "Failed to sync VPA checkpoint cache during initialization")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
} else {
klog.InfoS("Initial VPA checkpoint synced successfully")
}
return vpaCheckpointLister
}
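A minimal usage sketch of the new helper (not part of this file; error handling trimmed, variable names illustrative, and an empty-string namespace selects all namespaces):

vpaClient := vpa_clientset.NewForConfigOrDie(config)
stopCh := make(chan struct{}) // close on shutdown to stop the checkpoint informer
checkpointLister := NewVpaCheckpointLister(vpaClient, stopCh, "")
// Listing is served from the informer cache rather than an API request per cleanup pass.
checkpoints, err := checkpointLister.VerticalPodAutoscalerCheckpoints("default").List(labels.Everything())
if err != nil {
	klog.ErrorS(err, "Failed to list VPA checkpoints")
}
klog.InfoS("Listed VPA checkpoints", "count", len(checkpoints))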
// PodMatchesVPA returns true iff the vpaWithSelector matches the Pod.
func PodMatchesVPA(pod *core.Pod, vpaWithSelector *VpaWithSelector) bool {
return PodLabelsMatchVPA(pod.Namespace, labels.Set(pod.GetLabels()), vpaWithSelector.Vpa.Namespace, vpaWithSelector.Selector)