Add metrics for autoprovisioning

This commit is contained in:
Maciej Pytel 2017-10-30 13:27:40 +01:00
parent 8b8599e5af
commit c376ef3c87
5 changed files with 94 additions and 0 deletions

View File

@ -261,6 +261,7 @@ func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroupName string, rea
// UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, currentTime time.Time) error {
csr.updateNodeGroupMetrics()
targetSizes, err := getTargetSizes(csr.cloudProvider)
if err != nil {
return err
@ -362,6 +363,23 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {
return true
}
// updateNodeGroupMetrics walks the node groups reported by the cloud provider
// and publishes how many of them there are via metrics.UpdateNodeGroupsCount.
// Groups for which Exist() returns false are not counted; every other group is
// counted as autoprovisioned when Autoprovisioned() is true and as autoscaled
// otherwise.
func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
	var autoscaledCount, autoprovisionedCount int
	for _, group := range csr.cloudProvider.NodeGroups() {
		switch {
		case !group.Exist():
			// Non-existing groups contribute to neither counter.
		case group.Autoprovisioned():
			autoprovisionedCount++
		default:
			autoscaledCount++
		}
	}
	metrics.UpdateNodeGroupsCount(autoscaledCount, autoprovisionedCount)
}
// IsNodeGroupSafeToScaleUp returns true if node group can be scaled up now.
func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroupName string, now time.Time) bool {
if !csr.IsNodeGroupHealthy(nodeGroupName) {

View File

@ -837,10 +837,12 @@ func cleanUpNodeAutoprovisionedGroups(cloudProvider cloudprovider.CloudProvider,
if err := nodeGroup.Delete(); err != nil {
logRecorder.Eventf(apiv1.EventTypeWarning, "FailedToDeleteNodeGroup",
"NodeAutoprovisioning: attempt to delete node group %v failed: %v", ngId, err)
// TODO(maciekpytel): add some metric here after figuring out failure scenarios
return err
}
logRecorder.Eventf(apiv1.EventTypeNormal, "DeletedNodeGroup",
"NodeAutoprovisioning: removed node group %v", ngId)
metrics.RegisterNodeGroupDeletion()
}
}
return nil

View File

@ -216,6 +216,7 @@ func ScaleUp(context *AutoscalingContext, unschedulablePods []*apiv1.Pod, nodes
if err != nil {
context.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToCreateNodeGroup",
"NodeAutoprovisioning: attempt to create node group %v failed: %v", oldId, err)
// TODO(maciekpytel): add some metric here after figuring out failure scenarios
return false, errors.ToAutoscalerError(errors.CloudProviderError, err)
}
newId := bestOption.NodeGroup.Id()
@ -228,6 +229,7 @@ func ScaleUp(context *AutoscalingContext, unschedulablePods []*apiv1.Pod, nodes
}
context.LogRecorder.Eventf(apiv1.EventTypeNormal, "CreatedNodeGroup",
"NodeAutoprovisioning: created new node group %v", newId)
metrics.RegisterNodeGroupCreation()
}
}

View File

@ -235,6 +235,7 @@ func run(healthCheck *metrics.HealthCheck) {
kubeClient := createKubeClient()
kubeEventRecorder := kube_util.CreateEventRecorder(kubeClient)
opts := createAutoscalerOptions()
metrics.UpdateNapEnabled(opts.NodeAutoprovisioningEnabled)
predicateCheckerStopChannel := make(chan struct{})
predicateChecker, err := simulator.NewPredicateChecker(kubeClient, predicateCheckerStopChannel)
if err != nil {

View File

@ -35,6 +35,9 @@ type FailedScaleUpReason string
// we measure duration
type FunctionLabel string
// NodeGroupType describes node group relation to CA.
// Its values are converted to plain strings and used as label values of the
// node groups count metric (see UpdateNodeGroupsCount).
type NodeGroupType string
const (
caNamespace = "cluster_autoscaler"
readyLabel = "ready"
@ -53,6 +56,12 @@ const (
// Timeout was encountered when trying to scale-up
Timeout FailedScaleUpReason = "timeout"
// autoscaledGroup is managed by CA
autoscaledGroup NodeGroupType = "autoscaled"
// autoprovisionedGroup has been created by CA (Node Autoprovisioning),
// is currently autoscaled and can be removed by CA if it's no longer needed
autoprovisionedGroup NodeGroupType = "autoprovisioned"
// LogLongDurationThreshold defines the duration after which long function
// duration will be logged (in addition to being counted in metric).
// This is meant to help find unexpectedly long function execution times for
@ -94,6 +103,14 @@ var (
}, []string{"state"},
)
// nodeGroupsCount is a gauge vector named "node_groups_count" in caNamespace,
// partitioned by the "node_group_type" label.
nodeGroupsCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: caNamespace,
Name: "node_groups_count",
Help: "Number of node groups managed by CA.",
}, []string{"node_group_type"},
)
unschedulablePodsCount = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: caNamespace,
@ -168,11 +185,37 @@ var (
Help: "Number of nodes currently considered unneeded by CA.",
},
)
/**** Metrics related to NodeAutoprovisioning ****/
// napEnabled is a gauge named "nap_enabled" in caNamespace. It is set by
// UpdateNapEnabled to 1 when Node Autoprovisioning is enabled and 0 otherwise.
napEnabled = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: caNamespace,
Name: "nap_enabled",
Help: "Whether or not Node Autoprovisioning is enabled. 1 if it is, 0 otherwise.",
},
)
// nodeGroupCreationCount is a counter named "created_node_groups_total" in
// caNamespace. It is incremented by RegisterNodeGroupCreation.
nodeGroupCreationCount = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: caNamespace,
Name: "created_node_groups_total",
Help: "Number of node groups created by Node Autoprovisioning.",
},
)
// nodeGroupDeletionCount is a counter named "deleted_node_groups_total" in
// caNamespace. It is incremented by RegisterNodeGroupDeletion.
nodeGroupDeletionCount = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: caNamespace,
Name: "deleted_node_groups_total",
Help: "Number of node groups deleted by Node Autoprovisioning.",
},
)
)
func init() {
prometheus.MustRegister(clusterSafeToAutoscale)
prometheus.MustRegister(nodesCount)
prometheus.MustRegister(nodeGroupsCount)
prometheus.MustRegister(unschedulablePodsCount)
prometheus.MustRegister(lastActivity)
prometheus.MustRegister(functionDuration)
@ -182,6 +225,9 @@ func init() {
prometheus.MustRegister(scaleDownCount)
prometheus.MustRegister(evictionsCount)
prometheus.MustRegister(unneededNodesCount)
prometheus.MustRegister(napEnabled)
prometheus.MustRegister(nodeGroupCreationCount)
prometheus.MustRegister(nodeGroupDeletionCount)
}
// UpdateDurationFromStart records the duration of the step identified by the
@ -222,6 +268,12 @@ func UpdateNodesCount(ready, unready, starting int) {
nodesCount.WithLabelValues(startingLabel).Set(float64(starting))
}
// UpdateNodeGroupsCount records the number of node groups managed by CA
func UpdateNodeGroupsCount(autoscaled, autoprovisioned int) {
nodeGroupsCount.WithLabelValues(string(autoscaledGroup)).Set(float64(autoscaled))
nodeGroupsCount.WithLabelValues(string(autoprovisionedGroup)).Set(float64(autoprovisioned))
}
// UpdateUnschedulablePodsCount records number of currently unschedulable pods
func UpdateUnschedulablePodsCount(podsCount int) {
unschedulablePodsCount.Set(float64(podsCount))
@ -257,3 +309,22 @@ func RegisterEvictions(podsCount int) {
func UpdateUnneededNodesCount(nodesCount int) {
	// Convert the integer count to float64 before handing it to Set.
	unneeded := float64(nodesCount)
	unneededNodesCount.Set(unneeded)
}
// UpdateNapEnabled records whether NodeAutoprovisioning is enabled by setting
// the napEnabled metric to 1 when enabled is true and to 0 when it is false.
func UpdateNapEnabled(enabled bool) {
	value := 0.0
	if enabled {
		value = 1
	}
	napEnabled.Set(value)
}
// RegisterNodeGroupCreation records a single node group creation by bumping
// nodeGroupCreationCount by one.
func RegisterNodeGroupCreation() {
	nodeGroupCreationCount.Inc()
}
// RegisterNodeGroupDeletion records a single node group deletion by bumping
// nodeGroupDeletionCount by one.
func RegisterNodeGroupDeletion() {
	nodeGroupDeletionCount.Inc()
}