Add metrics for autoprovisioning
parent 8b8599e5af
commit c376ef3c87
@@ -261,6 +261,7 @@ func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroupName string, rea
 
 // UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
 func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, currentTime time.Time) error {
+	csr.updateNodeGroupMetrics()
 	targetSizes, err := getTargetSizes(csr.cloudProvider)
 	if err != nil {
 		return err
@@ -362,6 +363,23 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {
 	return true
 }
 
+// updateNodeGroupMetrics looks at NodeGroups provided by cloudprovider and updates corresponding metrics
+func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
+	autoscaled := 0
+	autoprovisioned := 0
+	for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
+		if !nodeGroup.Exist() {
+			continue
+		}
+		if nodeGroup.Autoprovisioned() {
+			autoprovisioned += 1
+		} else {
+			autoscaled += 1
+		}
+	}
+	metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
+}
+
 // IsNodeGroupSafeToScaleUp returns true if node group can be scaled up now.
 func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroupName string, now time.Time) bool {
 	if !csr.IsNodeGroupHealthy(nodeGroupName) {
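The new updateNodeGroupMetrics pass above classifies every node group reported by the cloud provider as either autoscaled or autoprovisioned, skips groups that do not exist yet, and hands the two totals to the metrics package. A minimal standalone sketch of that classification, using a hypothetical fakeNodeGroup struct in place of the real cloudprovider.NodeGroup interface (only the Exist and Autoprovisioned checks matter here):

	package main

	import "fmt"

	// fakeNodeGroup is a stand-in for cloudprovider.NodeGroup, reduced to the two
	// properties the counting loop above relies on.
	type fakeNodeGroup struct {
		exists          bool
		autoprovisioned bool
	}

	func main() {
		groups := []fakeNodeGroup{
			{exists: true, autoprovisioned: false}, // statically configured group -> autoscaled
			{exists: true, autoprovisioned: true},  // group created by NAP -> autoprovisioned
			{exists: false, autoprovisioned: true}, // not created yet -> skipped entirely
		}
		autoscaled, autoprovisioned := 0, 0
		for _, ng := range groups {
			if !ng.exists {
				continue
			}
			if ng.autoprovisioned {
				autoprovisioned++
			} else {
				autoscaled++
			}
		}
		// With the sample data above this prints: autoscaled=1 autoprovisioned=1
		fmt.Printf("autoscaled=%d autoprovisioned=%d\n", autoscaled, autoprovisioned)
	}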
@@ -837,10 +837,12 @@ func cleanUpNodeAutoprovisionedGroups(cloudProvider cloudprovider.CloudProvider,
 			if err := nodeGroup.Delete(); err != nil {
 				logRecorder.Eventf(apiv1.EventTypeWarning, "FailedToDeleteNodeGroup",
 					"NodeAutoprovisioning: attempt to delete node group %v failed: %v", ngId, err)
+				// TODO(maciekpytel): add some metric here after figuring out failure scenarios
 				return err
 			}
 			logRecorder.Eventf(apiv1.EventTypeNormal, "DeletedNodeGroup",
 				"NodeAutoprovisioning: removed node group %v", ngId)
+			metrics.RegisterNodeGroupDeletion()
 		}
 	}
 	return nil
@@ -216,6 +216,7 @@ func ScaleUp(context *AutoscalingContext, unschedulablePods []*apiv1.Pod, nodes
 			if err != nil {
 				context.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToCreateNodeGroup",
 					"NodeAutoprovisioning: attempt to create node group %v failed: %v", oldId, err)
+				// TODO(maciekpytel): add some metric here after figuring out failure scenarios
 				return false, errors.ToAutoscalerError(errors.CloudProviderError, err)
 			}
 			newId := bestOption.NodeGroup.Id()
@@ -228,6 +229,7 @@ func ScaleUp(context *AutoscalingContext, unschedulablePods []*apiv1.Pod, nodes
 			}
 			context.LogRecorder.Eventf(apiv1.EventTypeNormal, "CreatedNodeGroup",
 				"NodeAutoprovisioning: created new node group %v", newId)
+			metrics.RegisterNodeGroupCreation()
 
 		}
 	}
@@ -235,6 +235,7 @@ func run(healthCheck *metrics.HealthCheck) {
 	kubeClient := createKubeClient()
 	kubeEventRecorder := kube_util.CreateEventRecorder(kubeClient)
 	opts := createAutoscalerOptions()
+	metrics.UpdateNapEnabled(opts.NodeAutoprovisioningEnabled)
 	predicateCheckerStopChannel := make(chan struct{})
 	predicateChecker, err := simulator.NewPredicateChecker(kubeClient, predicateCheckerStopChannel)
 	if err != nil {
@@ -35,6 +35,9 @@ type FailedScaleUpReason string
 // we measure duration
 type FunctionLabel string
 
+// NodeGroupType describes node group relation to CA
+type NodeGroupType string
+
 const (
 	caNamespace = "cluster_autoscaler"
 	readyLabel = "ready"
@@ -53,6 +56,12 @@ const (
 	// Timeout was encountered when trying to scale-up
 	Timeout FailedScaleUpReason = "timeout"
 
+	// autoscaledGroup is managed by CA
+	autoscaledGroup NodeGroupType = "autoscaled"
+	// autoprovisionedGroup has been created by CA (Node Autoprovisioning),
+	// is currently autoscaled and can be removed by CA if it's no longer needed
+	autoprovisionedGroup NodeGroupType = "autoprovisioned"
+
 	// LogLongDurationThreshold defines the duration after which long function
 	// duration will be logged (in addition to being counted in metric).
 	// This is meant to help find unexpectedly long function execution times for
@@ -94,6 +103,14 @@ var (
 		}, []string{"state"},
 	)
 
+	nodeGroupsCount = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "node_groups_count",
+			Help:      "Number of node groups managed by CA.",
+		}, []string{"node_group_type"},
+	)
+
 	unschedulablePodsCount = prometheus.NewGauge(
 		prometheus.GaugeOpts{
 			Namespace: caNamespace,
@@ -168,11 +185,37 @@ var (
 			Help: "Number of nodes currently considered unneeded by CA.",
 		},
 	)
+
+	/**** Metrics related to NodeAutoprovisioning ****/
+	napEnabled = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "nap_enabled",
+			Help:      "Whether or not Node Autoprovisioning is enabled. 1 if it is, 0 otherwise.",
+		},
+	)
+
+	nodeGroupCreationCount = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "created_node_groups_total",
+			Help:      "Number of node groups created by Node Autoprovisioning.",
+		},
+	)
+
+	nodeGroupDeletionCount = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "deleted_node_groups_total",
+			Help:      "Number of node groups deleted by Node Autoprovisioning.",
+		},
+	)
 )
 
 func init() {
 	prometheus.MustRegister(clusterSafeToAutoscale)
 	prometheus.MustRegister(nodesCount)
+	prometheus.MustRegister(nodeGroupsCount)
 	prometheus.MustRegister(unschedulablePodsCount)
 	prometheus.MustRegister(lastActivity)
 	prometheus.MustRegister(functionDuration)
@@ -182,6 +225,9 @@ func init() {
 	prometheus.MustRegister(scaleDownCount)
 	prometheus.MustRegister(evictionsCount)
 	prometheus.MustRegister(unneededNodesCount)
+	prometheus.MustRegister(napEnabled)
+	prometheus.MustRegister(nodeGroupCreationCount)
+	prometheus.MustRegister(nodeGroupDeletionCount)
 }
 
 // UpdateDurationFromStart records the duration of the step identified by the
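Registering the new collectors in init() only adds them to the default Prometheus registry; they become observable once the process serves that registry over HTTP, where each exported name is the caNamespace prefix joined with the configured Name (for example cluster_autoscaler_node_groups_count with a node_group_type label). A generic client_golang sketch of that exposure, not the autoscaler's actual wiring, with an illustrative listen address:

	package main

	import (
		"log"
		"net/http"

		"github.com/prometheus/client_golang/prometheus/promhttp"
	)

	func main() {
		// promhttp.Handler() serves everything added via prometheus.MustRegister, so the
		// gauges and counters from this commit appear at /metrics as
		// cluster_autoscaler_nap_enabled, cluster_autoscaler_node_groups_count{node_group_type="..."},
		// cluster_autoscaler_created_node_groups_total and cluster_autoscaler_deleted_node_groups_total.
		http.Handle("/metrics", promhttp.Handler())
		log.Fatal(http.ListenAndServe(":8085", nil)) // port chosen for illustration only
	}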
@@ -222,6 +268,12 @@ func UpdateNodesCount(ready, unready, starting int) {
 	nodesCount.WithLabelValues(startingLabel).Set(float64(starting))
 }
 
+// UpdateNodeGroupsCount records the number of node groups managed by CA
+func UpdateNodeGroupsCount(autoscaled, autoprovisioned int) {
+	nodeGroupsCount.WithLabelValues(string(autoscaledGroup)).Set(float64(autoscaled))
+	nodeGroupsCount.WithLabelValues(string(autoprovisionedGroup)).Set(float64(autoprovisioned))
+}
+
 // UpdateUnschedulablePodsCount records number of currently unschedulable pods
 func UpdateUnschedulablePodsCount(podsCount int) {
 	unschedulablePodsCount.Set(float64(podsCount))
@@ -257,3 +309,22 @@ func RegisterEvictions(podsCount int) {
 func UpdateUnneededNodesCount(nodesCount int) {
 	unneededNodesCount.Set(float64(nodesCount))
 }
+
+// UpdateNapEnabled records if NodeAutoprovisioning is enabled
+func UpdateNapEnabled(enabled bool) {
+	if enabled {
+		napEnabled.Set(1)
+	} else {
+		napEnabled.Set(0)
+	}
+}
+
+// RegisterNodeGroupCreation registers node group creation
+func RegisterNodeGroupCreation() {
+	nodeGroupCreationCount.Add(1.0)
+}
+
+// RegisterNodeGroupDeletion registers node group deletion
+func RegisterNodeGroupDeletion() {
+	nodeGroupDeletionCount.Add(1.0)
+}
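Because the helpers above only wrap gauge and counter updates, their behaviour can be checked directly against the package-level collectors. A sketch of such a test, assuming a prometheus/client_golang version that ships the testutil helper package (the test itself is hypothetical and not part of this commit):

	package metrics

	import (
		"testing"

		"github.com/prometheus/client_golang/prometheus/testutil"
	)

	func TestAutoprovisioningMetrics(t *testing.T) {
		// Gauges are overwritten on every call, so the last update wins.
		UpdateNodeGroupsCount(3, 2)
		if got := testutil.ToFloat64(nodeGroupsCount.WithLabelValues(string(autoscaledGroup))); got != 3 {
			t.Errorf("autoscaled node_groups_count = %v, want 3", got)
		}
		if got := testutil.ToFloat64(nodeGroupsCount.WithLabelValues(string(autoprovisionedGroup))); got != 2 {
			t.Errorf("autoprovisioned node_groups_count = %v, want 2", got)
		}

		UpdateNapEnabled(true)
		if got := testutil.ToFloat64(napEnabled); got != 1 {
			t.Errorf("nap_enabled = %v, want 1", got)
		}

		// Counters only ever go up; each Register call adds exactly one.
		before := testutil.ToFloat64(nodeGroupCreationCount)
		RegisterNodeGroupCreation()
		if got := testutil.ToFloat64(nodeGroupCreationCount); got != before+1 {
			t.Errorf("created_node_groups_total = %v, want %v", got, before+1)
		}
	}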