Merge pull request #1214 from AdityaMisra/kube_metric_job_failed_reason

Added the job failure reason in kube_job_status_failed metric
This commit is contained in:
Kubernetes Prow Robot 2020-10-14 00:23:54 -07:00 committed by GitHub
commit d55f5aced2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 54 additions and 12 deletions

View File

@ -10,7 +10,7 @@
| kube_job_spec_active_deadline_seconds | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE | | kube_job_spec_active_deadline_seconds | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE |
| kube_job_status_active | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE | | kube_job_status_active | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE |
| kube_job_status_succeeded | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE | | kube_job_status_succeeded | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE |
| kube_job_status_failed | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE | | kube_job_status_failed | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; <br> `reason`=&lt;failure reason&gt; | STABLE |
| kube_job_status_start_time | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE | | kube_job_status_start_time | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE |
| kube_job_status_completion_time | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE | | kube_job_status_completion_time | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; | STABLE |
| kube_job_complete | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; <br> `condition`=&lt;true\|false\|unknown&gt; | STABLE | | kube_job_complete | Gauge | `job_name`=&lt;job-name&gt; <br> `namespace`=&lt;job-namespace&gt; <br> `condition`=&lt;true\|false\|unknown&gt; | STABLE |

View File

@ -35,6 +35,7 @@ var (
descJobLabelsName = "kube_job_labels" descJobLabelsName = "kube_job_labels"
descJobLabelsHelp = "Kubernetes labels converted to Prometheus labels." descJobLabelsHelp = "Kubernetes labels converted to Prometheus labels."
descJobLabelsDefaultLabels = []string{"namespace", "job_name"} descJobLabelsDefaultLabels = []string{"namespace", "job_name"}
jobFailureReasons = []string{"BackoffLimitExceeded", "DeadLineExceeded", "Evicted"}
jobMetricFamilies = []generator.FamilyGenerator{ jobMetricFamilies = []generator.FamilyGenerator{
*generator.NewFamilyGenerator( *generator.NewFamilyGenerator(
@ -163,16 +164,48 @@ var (
), ),
*generator.NewFamilyGenerator( *generator.NewFamilyGenerator(
"kube_job_status_failed", "kube_job_status_failed",
"The number of pods which reached Phase Failed.", "The number of pods which reached Phase Failed and the reason for failure.",
metric.Gauge, metric.Gauge,
"", "",
wrapJobFunc(func(j *v1batch.Job) *metric.Family { wrapJobFunc(func(j *v1batch.Job) *metric.Family {
return &metric.Family{ var ms []*metric.Metric
Metrics: []*metric.Metric{
{ if float64(j.Status.Failed) == 0 {
Value: float64(j.Status.Failed), return &metric.Family{
Metrics: []*metric.Metric{
{
Value: float64(j.Status.Failed),
},
}, },
}, }
}
for _, condition := range j.Status.Conditions {
if condition.Type == v1batch.JobFailed {
reasonKnown := false
for _, reason := range jobFailureReasons {
reasonKnown = reasonKnown || failureReason(&condition, reason)
// for known reasons
ms = append(ms, &metric.Metric{
LabelKeys: []string{"reason"},
LabelValues: []string{reason},
Value: boolFloat64(failureReason(&condition, reason)),
})
}
// for unknown reasons
if !reasonKnown {
ms = append(ms, &metric.Metric{
LabelKeys: []string{"reason"},
LabelValues: []string{""},
Value: float64(j.Status.Failed),
})
}
}
}
return &metric.Family{
Metrics: ms,
} }
}), }),
), ),
@ -350,3 +383,10 @@ func createJobListWatch(kubeClient clientset.Interface, ns string) cache.ListerW
}, },
} }
} }
// failureReason reports whether the job condition carries the given
// failure reason. A nil condition never matches.
func failureReason(jc *v1batch.JobCondition, reason string) bool {
	return jc != nil && jc.Reason == reason
}

View File

@ -70,13 +70,13 @@ func TestJobStore(t *testing.T) {
# TYPE kube_job_status_active gauge # TYPE kube_job_status_active gauge
# HELP kube_job_status_completion_time CompletionTime represents time when the job was completed. # HELP kube_job_status_completion_time CompletionTime represents time when the job was completed.
# TYPE kube_job_status_completion_time gauge # TYPE kube_job_status_completion_time gauge
# HELP kube_job_status_failed The number of pods which reached Phase Failed. # HELP kube_job_status_failed The number of pods which reached Phase Failed and the reason for failure.
# TYPE kube_job_status_failed gauge # TYPE kube_job_status_failed gauge
# HELP kube_job_status_start_time StartTime represents time when the job was acknowledged by the Job Manager. # HELP kube_job_status_start_time StartTime represents time when the job was acknowledged by the Job Manager.
# TYPE kube_job_status_start_time gauge # TYPE kube_job_status_start_time gauge
# HELP kube_job_status_succeeded The number of pods which reached Phase Succeeded. # HELP kube_job_status_succeeded The number of pods which reached Phase Succeeded.
# TYPE kube_job_status_succeeded gauge # TYPE kube_job_status_succeeded gauge`
`
cases := []generateMetricsTestCase{ cases := []generateMetricsTestCase{
{ {
Obj: &v1batch.Job{ Obj: &v1batch.Job{
@ -183,7 +183,7 @@ func TestJobStore(t *testing.T) {
CompletionTime: &metav1.Time{Time: FailedJob1CompletionTime}, CompletionTime: &metav1.Time{Time: FailedJob1CompletionTime},
StartTime: &metav1.Time{Time: FailedJob1StartTime}, StartTime: &metav1.Time{Time: FailedJob1StartTime},
Conditions: []v1batch.JobCondition{ Conditions: []v1batch.JobCondition{
{Type: v1batch.JobFailed, Status: v1.ConditionTrue}, {Type: v1batch.JobFailed, Status: v1.ConditionTrue, Reason: "BackoffLimitExceeded"},
}, },
}, },
Spec: v1batch.JobSpec{ Spec: v1batch.JobSpec{
@ -204,7 +204,9 @@ func TestJobStore(t *testing.T) {
kube_job_spec_parallelism{job_name="FailedJob1",namespace="ns1"} 1 kube_job_spec_parallelism{job_name="FailedJob1",namespace="ns1"} 1
kube_job_status_active{job_name="FailedJob1",namespace="ns1"} 0 kube_job_status_active{job_name="FailedJob1",namespace="ns1"} 0
kube_job_status_completion_time{job_name="FailedJob1",namespace="ns1"} 1.495810807e+09 kube_job_status_completion_time{job_name="FailedJob1",namespace="ns1"} 1.495810807e+09
kube_job_status_failed{job_name="FailedJob1",namespace="ns1"} 1 kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="BackoffLimitExceeded"} 1
kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="DeadLineExceeded"} 0
kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="Evicted"} 0
kube_job_status_start_time{job_name="FailedJob1",namespace="ns1"} 1.495807207e+09 kube_job_status_start_time{job_name="FailedJob1",namespace="ns1"} 1.495807207e+09
kube_job_status_succeeded{job_name="FailedJob1",namespace="ns1"} 0 kube_job_status_succeeded{job_name="FailedJob1",namespace="ns1"} 0
`, `,