Add metrics for autoprovisioning
This commit is contained in:
parent
8b8599e5af
commit
c376ef3c87
|
|
@ -261,6 +261,7 @@ func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroupName string, rea
|
||||||
|
|
||||||
// UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
|
// UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
|
||||||
func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, currentTime time.Time) error {
|
func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, currentTime time.Time) error {
|
||||||
|
csr.updateNodeGroupMetrics()
|
||||||
targetSizes, err := getTargetSizes(csr.cloudProvider)
|
targetSizes, err := getTargetSizes(csr.cloudProvider)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
|
@ -362,6 +363,23 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// updateNodeGroupMetrics looks at NodeGroups provided by cloudprovider and updates corresponding metrics
|
||||||
|
func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
|
||||||
|
autoscaled := 0
|
||||||
|
autoprovisioned := 0
|
||||||
|
for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
|
||||||
|
if !nodeGroup.Exist() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if nodeGroup.Autoprovisioned() {
|
||||||
|
autoprovisioned += 1
|
||||||
|
} else {
|
||||||
|
autoscaled += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
|
||||||
|
}
|
||||||
|
|
||||||
// IsNodeGroupSafeToScaleUp returns true if node group can be scaled up now.
|
// IsNodeGroupSafeToScaleUp returns true if node group can be scaled up now.
|
||||||
func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroupName string, now time.Time) bool {
|
func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroupName string, now time.Time) bool {
|
||||||
if !csr.IsNodeGroupHealthy(nodeGroupName) {
|
if !csr.IsNodeGroupHealthy(nodeGroupName) {
|
||||||
|
|
|
||||||
|
|
@ -837,10 +837,12 @@ func cleanUpNodeAutoprovisionedGroups(cloudProvider cloudprovider.CloudProvider,
|
||||||
if err := nodeGroup.Delete(); err != nil {
|
if err := nodeGroup.Delete(); err != nil {
|
||||||
logRecorder.Eventf(apiv1.EventTypeWarning, "FailedToDeleteNodeGroup",
|
logRecorder.Eventf(apiv1.EventTypeWarning, "FailedToDeleteNodeGroup",
|
||||||
"NodeAutoprovisioning: attempt to delete node group %v failed: %v", ngId, err)
|
"NodeAutoprovisioning: attempt to delete node group %v failed: %v", ngId, err)
|
||||||
|
// TODO(maciekpytel): add some metric here after figuring out failure scenarios
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
logRecorder.Eventf(apiv1.EventTypeNormal, "DeletedNodeGroup",
|
logRecorder.Eventf(apiv1.EventTypeNormal, "DeletedNodeGroup",
|
||||||
"NodeAutoprovisioning: removed node group %v", ngId)
|
"NodeAutoprovisioning: removed node group %v", ngId)
|
||||||
|
metrics.RegisterNodeGroupDeletion()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
|
|
||||||
|
|
@ -216,6 +216,7 @@ func ScaleUp(context *AutoscalingContext, unschedulablePods []*apiv1.Pod, nodes
|
||||||
if err != nil {
|
if err != nil {
|
||||||
context.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToCreateNodeGroup",
|
context.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToCreateNodeGroup",
|
||||||
"NodeAutoprovisioning: attempt to create node group %v failed: %v", oldId, err)
|
"NodeAutoprovisioning: attempt to create node group %v failed: %v", oldId, err)
|
||||||
|
// TODO(maciekpytel): add some metric here after figuring out failure scenarios
|
||||||
return false, errors.ToAutoscalerError(errors.CloudProviderError, err)
|
return false, errors.ToAutoscalerError(errors.CloudProviderError, err)
|
||||||
}
|
}
|
||||||
newId := bestOption.NodeGroup.Id()
|
newId := bestOption.NodeGroup.Id()
|
||||||
|
|
@ -228,6 +229,7 @@ func ScaleUp(context *AutoscalingContext, unschedulablePods []*apiv1.Pod, nodes
|
||||||
}
|
}
|
||||||
context.LogRecorder.Eventf(apiv1.EventTypeNormal, "CreatedNodeGroup",
|
context.LogRecorder.Eventf(apiv1.EventTypeNormal, "CreatedNodeGroup",
|
||||||
"NodeAutoprovisioning: created new node group %v", newId)
|
"NodeAutoprovisioning: created new node group %v", newId)
|
||||||
|
metrics.RegisterNodeGroupCreation()
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -235,6 +235,7 @@ func run(healthCheck *metrics.HealthCheck) {
|
||||||
kubeClient := createKubeClient()
|
kubeClient := createKubeClient()
|
||||||
kubeEventRecorder := kube_util.CreateEventRecorder(kubeClient)
|
kubeEventRecorder := kube_util.CreateEventRecorder(kubeClient)
|
||||||
opts := createAutoscalerOptions()
|
opts := createAutoscalerOptions()
|
||||||
|
metrics.UpdateNapEnabled(opts.NodeAutoprovisioningEnabled)
|
||||||
predicateCheckerStopChannel := make(chan struct{})
|
predicateCheckerStopChannel := make(chan struct{})
|
||||||
predicateChecker, err := simulator.NewPredicateChecker(kubeClient, predicateCheckerStopChannel)
|
predicateChecker, err := simulator.NewPredicateChecker(kubeClient, predicateCheckerStopChannel)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,9 @@ type FailedScaleUpReason string
|
||||||
// we measure duration
|
// we measure duration
|
||||||
type FunctionLabel string
|
type FunctionLabel string
|
||||||
|
|
||||||
|
// NodeGroupType describes node group relation to CA
|
||||||
|
type NodeGroupType string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
caNamespace = "cluster_autoscaler"
|
caNamespace = "cluster_autoscaler"
|
||||||
readyLabel = "ready"
|
readyLabel = "ready"
|
||||||
|
|
@ -53,6 +56,12 @@ const (
|
||||||
// Timeout was encountered when trying to scale-up
|
// Timeout was encountered when trying to scale-up
|
||||||
Timeout FailedScaleUpReason = "timeout"
|
Timeout FailedScaleUpReason = "timeout"
|
||||||
|
|
||||||
|
// autoscaledGroup is managed by CA
|
||||||
|
autoscaledGroup NodeGroupType = "autoscaled"
|
||||||
|
// autoprovisionedGroup has been created by CA (Node Autoprovisioning),
|
||||||
|
// is currently autoscaled and can be removed by CA if it's no longer needed
|
||||||
|
autoprovisionedGroup NodeGroupType = "autoprovisioned"
|
||||||
|
|
||||||
// LogLongDurationThreshold defines the duration after which long function
|
// LogLongDurationThreshold defines the duration after which long function
|
||||||
// duration will be logged (in addition to being counted in metric).
|
// duration will be logged (in addition to being counted in metric).
|
||||||
// This is meant to help find unexpectedly long function execution times for
|
// This is meant to help find unexpectedly long function execution times for
|
||||||
|
|
@ -94,6 +103,14 @@ var (
|
||||||
}, []string{"state"},
|
}, []string{"state"},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
nodeGroupsCount = prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: caNamespace,
|
||||||
|
Name: "node_groups_count",
|
||||||
|
Help: "Number of node groups managed by CA.",
|
||||||
|
}, []string{"node_group_type"},
|
||||||
|
)
|
||||||
|
|
||||||
unschedulablePodsCount = prometheus.NewGauge(
|
unschedulablePodsCount = prometheus.NewGauge(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
Namespace: caNamespace,
|
Namespace: caNamespace,
|
||||||
|
|
@ -168,11 +185,37 @@ var (
|
||||||
Help: "Number of nodes currently considered unneeded by CA.",
|
Help: "Number of nodes currently considered unneeded by CA.",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/**** Metrics related to NodeAutoprovisioning ****/
|
||||||
|
napEnabled = prometheus.NewGauge(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: caNamespace,
|
||||||
|
Name: "nap_enabled",
|
||||||
|
Help: "Whether or not Node Autoprovisioning is enabled. 1 if it is, 0 otherwise.",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
nodeGroupCreationCount = prometheus.NewCounter(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: caNamespace,
|
||||||
|
Name: "created_node_groups_total",
|
||||||
|
Help: "Number of node groups created by Node Autoprovisioning.",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
nodeGroupDeletionCount = prometheus.NewCounter(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: caNamespace,
|
||||||
|
Name: "deleted_node_groups_total",
|
||||||
|
Help: "Number of node groups deleted by Node Autoprovisioning.",
|
||||||
|
},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
prometheus.MustRegister(clusterSafeToAutoscale)
|
prometheus.MustRegister(clusterSafeToAutoscale)
|
||||||
prometheus.MustRegister(nodesCount)
|
prometheus.MustRegister(nodesCount)
|
||||||
|
prometheus.MustRegister(nodeGroupsCount)
|
||||||
prometheus.MustRegister(unschedulablePodsCount)
|
prometheus.MustRegister(unschedulablePodsCount)
|
||||||
prometheus.MustRegister(lastActivity)
|
prometheus.MustRegister(lastActivity)
|
||||||
prometheus.MustRegister(functionDuration)
|
prometheus.MustRegister(functionDuration)
|
||||||
|
|
@ -182,6 +225,9 @@ func init() {
|
||||||
prometheus.MustRegister(scaleDownCount)
|
prometheus.MustRegister(scaleDownCount)
|
||||||
prometheus.MustRegister(evictionsCount)
|
prometheus.MustRegister(evictionsCount)
|
||||||
prometheus.MustRegister(unneededNodesCount)
|
prometheus.MustRegister(unneededNodesCount)
|
||||||
|
prometheus.MustRegister(napEnabled)
|
||||||
|
prometheus.MustRegister(nodeGroupCreationCount)
|
||||||
|
prometheus.MustRegister(nodeGroupDeletionCount)
|
||||||
}
|
}
|
||||||
|
|
||||||
// UpdateDurationFromStart records the duration of the step identified by the
|
// UpdateDurationFromStart records the duration of the step identified by the
|
||||||
|
|
@ -222,6 +268,12 @@ func UpdateNodesCount(ready, unready, starting int) {
|
||||||
nodesCount.WithLabelValues(startingLabel).Set(float64(starting))
|
nodesCount.WithLabelValues(startingLabel).Set(float64(starting))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// UpdateNodeGroupsCount records the number of node groups managed by CA
|
||||||
|
func UpdateNodeGroupsCount(autoscaled, autoprovisioned int) {
|
||||||
|
nodeGroupsCount.WithLabelValues(string(autoscaledGroup)).Set(float64(autoscaled))
|
||||||
|
nodeGroupsCount.WithLabelValues(string(autoprovisionedGroup)).Set(float64(autoprovisioned))
|
||||||
|
}
|
||||||
|
|
||||||
// UpdateUnschedulablePodsCount records number of currently unschedulable pods
|
// UpdateUnschedulablePodsCount records number of currently unschedulable pods
|
||||||
func UpdateUnschedulablePodsCount(podsCount int) {
|
func UpdateUnschedulablePodsCount(podsCount int) {
|
||||||
unschedulablePodsCount.Set(float64(podsCount))
|
unschedulablePodsCount.Set(float64(podsCount))
|
||||||
|
|
@ -257,3 +309,22 @@ func RegisterEvictions(podsCount int) {
|
||||||
func UpdateUnneededNodesCount(nodesCount int) {
|
func UpdateUnneededNodesCount(nodesCount int) {
|
||||||
unneededNodesCount.Set(float64(nodesCount))
|
unneededNodesCount.Set(float64(nodesCount))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// UpdateNapEnabled records if NodeAutoprovisioning is enabled
|
||||||
|
func UpdateNapEnabled(enabled bool) {
|
||||||
|
if enabled {
|
||||||
|
napEnabled.Set(1)
|
||||||
|
} else {
|
||||||
|
napEnabled.Set(0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// RegisterNodeGroupCreation registers node group creation
|
||||||
|
func RegisterNodeGroupCreation() {
|
||||||
|
nodeGroupCreationCount.Add(1.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RegisterNodeGroupDeletion registers node group deletion
|
||||||
|
func RegisterNodeGroupDeletion() {
|
||||||
|
nodeGroupDeletionCount.Add(1.0)
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue