Merge pull request #1214 from AdityaMisra/kube_metric_job_failed_reason
Add the job failure reason to the `kube_job_status_failed` metric as a `reason` label.
Commit: d55f5aced2
```diff
@@ -10,7 +10,7 @@
 | kube_job_spec_active_deadline_seconds | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
 | kube_job_status_active | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
 | kube_job_status_succeeded | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
-| kube_job_status_failed | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
+| kube_job_status_failed | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> <br> `reason`=<failure reason> | STABLE |
 | kube_job_status_start_time | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
 | kube_job_status_completion_time | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
 | kube_job_complete | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> <br> `condition`=<true\|false\|unknown> | STABLE |
```
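With this change, a failed Job is reported as one `kube_job_status_failed` series per known failure reason. kube-state-metrics builds these families through its own metric store, so the snippet below is only an illustrative sketch of the documented label schema using client_golang; the job name `FailedJob1` and namespace `ns1` are borrowed from the test fixtures further down, not from a real cluster.

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Same label schema as documented above: job_name, namespace, reason.
	failed := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "kube_job_status_failed",
		Help: "The number of pods which reached Phase Failed and the reason for failure.",
	}, []string{"job_name", "namespace", "reason"})
	prometheus.MustRegister(failed)

	// One series per known failure reason; the matching reason carries the count.
	failed.WithLabelValues("FailedJob1", "ns1", "BackoffLimitExceeded").Set(1)
	failed.WithLabelValues("FailedJob1", "ns1", "DeadLineExceeded").Set(0)
	failed.WithLabelValues("FailedJob1", "ns1", "Evicted").Set(0)

	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```

With the `reason` label in place, a PromQL query such as `sum by (job_name, reason) (kube_job_status_failed) > 0`, for example, surfaces which Jobs failed and for what reason.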
```diff
@@ -35,6 +35,7 @@ var (
 	descJobLabelsName          = "kube_job_labels"
 	descJobLabelsHelp          = "Kubernetes labels converted to Prometheus labels."
 	descJobLabelsDefaultLabels = []string{"namespace", "job_name"}
+	jobFailureReasons          = []string{"BackoffLimitExceeded", "DeadLineExceeded", "Evicted"}

 	jobMetricFamilies = []generator.FamilyGenerator{
 		*generator.NewFamilyGenerator(
```
```diff
@@ -163,16 +164,48 @@ var (
 		),
 		*generator.NewFamilyGenerator(
 			"kube_job_status_failed",
-			"The number of pods which reached Phase Failed.",
+			"The number of pods which reached Phase Failed and the reason for failure.",
 			metric.Gauge,
 			"",
 			wrapJobFunc(func(j *v1batch.Job) *metric.Family {
-				return &metric.Family{
-					Metrics: []*metric.Metric{
-						{
-							Value: float64(j.Status.Failed),
-						},
-					},
+				var ms []*metric.Metric
+
+				if float64(j.Status.Failed) == 0 {
+					return &metric.Family{
+						Metrics: []*metric.Metric{
+							{
+								Value: float64(j.Status.Failed),
+							},
+						},
+					}
+				}
+
+				for _, condition := range j.Status.Conditions {
+					if condition.Type == v1batch.JobFailed {
+						reasonKnown := false
+						for _, reason := range jobFailureReasons {
+							reasonKnown = reasonKnown || failureReason(&condition, reason)
+
+							// for known reasons
+							ms = append(ms, &metric.Metric{
+								LabelKeys:   []string{"reason"},
+								LabelValues: []string{reason},
+								Value:       boolFloat64(failureReason(&condition, reason)),
+							})
+						}
+						// for unknown reasons
+						if !reasonKnown {
+							ms = append(ms, &metric.Metric{
+								LabelKeys:   []string{"reason"},
+								LabelValues: []string{""},
+								Value:       float64(j.Status.Failed),
+							})
+						}
+					}
+				}
+
+				return &metric.Family{
+					Metrics: ms,
 				}
 			}),
 		),
```
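The per-reason value above comes from `boolFloat64(failureReason(&condition, reason))`. `boolFloat64` is not part of this diff; it is a small helper defined elsewhere in the package. A minimal sketch of the behaviour the new code assumes:

```go
package store

// boolFloat64 maps a bool onto the 0/1 gauge values used above.
// Sketch only; the real helper is defined elsewhere in the package.
func boolFloat64(b bool) float64 {
	if b {
		return 1
	}
	return 0
}
```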
```diff
@@ -350,3 +383,10 @@ func createJobListWatch(kubeClient clientset.Interface, ns string) cache.ListerWatcher {
 		},
 	}
 }
+
+func failureReason(jc *v1batch.JobCondition, reason string) bool {
+	if jc == nil {
+		return false
+	}
+	return jc.Reason == reason
+}
```
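For illustration, a hypothetical check of `failureReason`, including the nil guard. This test is not part of the PR; it assumes the `store` package name these files belong to and the `v1batch` import alias used in the diff.

```go
package store

import (
	"testing"

	v1batch "k8s.io/api/batch/v1"
)

// TestFailureReasonSketch is a hypothetical example, not part of the PR.
func TestFailureReasonSketch(t *testing.T) {
	cond := v1batch.JobCondition{Type: v1batch.JobFailed, Reason: "BackoffLimitExceeded"}

	if !failureReason(&cond, "BackoffLimitExceeded") {
		t.Error("expected a match for the condition's own reason")
	}
	if failureReason(&cond, "Evicted") {
		t.Error("expected no match for a different reason")
	}
	if failureReason(nil, "Evicted") {
		t.Error("expected false for a nil condition")
	}
}
```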
```diff
@@ -70,13 +70,13 @@ func TestJobStore(t *testing.T) {
 		# TYPE kube_job_status_active gauge
 		# HELP kube_job_status_completion_time CompletionTime represents time when the job was completed.
 		# TYPE kube_job_status_completion_time gauge
-		# HELP kube_job_status_failed The number of pods which reached Phase Failed.
+		# HELP kube_job_status_failed The number of pods which reached Phase Failed and the reason for failure.
 		# TYPE kube_job_status_failed gauge
 		# HELP kube_job_status_start_time StartTime represents time when the job was acknowledged by the Job Manager.
 		# TYPE kube_job_status_start_time gauge
 		# HELP kube_job_status_succeeded The number of pods which reached Phase Succeeded.
 		# TYPE kube_job_status_succeeded gauge
 	`
 	cases := []generateMetricsTestCase{
 		{
 			Obj: &v1batch.Job{
```
```diff
@@ -183,7 +183,7 @@ func TestJobStore(t *testing.T) {
 					CompletionTime: &metav1.Time{Time: FailedJob1CompletionTime},
 					StartTime:      &metav1.Time{Time: FailedJob1StartTime},
 					Conditions: []v1batch.JobCondition{
-						{Type: v1batch.JobFailed, Status: v1.ConditionTrue},
+						{Type: v1batch.JobFailed, Status: v1.ConditionTrue, Reason: "BackoffLimitExceeded"},
 					},
 				},
 				Spec: v1batch.JobSpec{
```
```diff
@@ -204,7 +204,9 @@ func TestJobStore(t *testing.T) {
 				kube_job_spec_parallelism{job_name="FailedJob1",namespace="ns1"} 1
 				kube_job_status_active{job_name="FailedJob1",namespace="ns1"} 0
 				kube_job_status_completion_time{job_name="FailedJob1",namespace="ns1"} 1.495810807e+09
-				kube_job_status_failed{job_name="FailedJob1",namespace="ns1"} 1
+				kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="BackoffLimitExceeded"} 1
+				kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="DeadLineExceeded"} 0
+				kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="Evicted"} 0
 				kube_job_status_start_time{job_name="FailedJob1",namespace="ns1"} 1.495807207e+09
 				kube_job_status_succeeded{job_name="FailedJob1",namespace="ns1"} 0
 			`,
```
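Note that this fixture only exercises the known-reason path. A failed Job whose condition reason is not listed in `jobFailureReasons` would instead surface as a single `kube_job_status_failed` series with an empty `reason` label carrying the raw failed count, per the unknown-reason fallback in the generator above.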