Merge pull request #11381 from dntosas/addons-add-npd

[addons] Introduce NodeProblemDetector
This commit is contained in:
Kubernetes Prow Robot 2021-06-17 00:58:19 -07:00 committed by GitHub
commit 559b57ea4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 547 additions and 0 deletions

View File

@ -190,6 +190,19 @@ The kOps CLI requires additional IAM permissions to manage the requisite EventBr
**Warning: If you switch between the two operating modes on an existing cluster, the old resources have to be manually deleted. For IMDS to Queue Processor, this means deleting the k8s nth daemonset. For Queue Processor to IMDS, this means deleting the k8s nth deployment and the AWS resources: the SQS queue, EventBridge rules, and ASG Lifecycle hooks.** **Warning: If you switch between the two operating modes on an existing cluster, the old resources have to be manually deleted. For IMDS to Queue Processor, this means deleting the k8s nth daemonset. For Queue Processor to IMDS, this means deleting the k8s nth deployment and the AWS resources: the SQS queue, EventBridge rules, and ASG Lifecycle hooks.**
#### Node Problem Detector
{{ kops_feature_table(kops_added_default='1.22') }}
[Node Problem Detector](https://github.com/kubernetes/node-problem-detector) aims to make various node problems visible to the upstream layers in the cluster management stack. It is a daemon that runs on each node, detects node problems, and reports them to the Kubernetes API server.
```yaml
spec:
nodeProblemDetector:
enabled: true
memoryRequest: 32Mi
cpuRequest: 10m
```
#### Snapshot controller #### Snapshot controller
{{ kops_feature_table(kops_added_default='1.21', k8s_min='1.20') }} {{ kops_feature_table(kops_added_default='1.21', k8s_min='1.20') }}

View File

@ -4016,6 +4016,51 @@ spec:
items: items:
type: string type: string
type: array type: array
nodeProblemDetector:
description: NodeProblemDetector determines the node problem detector
configuration.
properties:
cpuLimit:
anyOf:
- type: integer
- type: string
description: 'CPULimit of NodeProblemDetector container. Default:
10m'
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
cpuRequest:
anyOf:
- type: integer
- type: string
description: 'CPURequest of NodeProblemDetector container. Default:
10m'
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
enabled:
description: 'Enabled enables the NodeProblemDetector. Default:
false'
type: boolean
image:
description: Image is the NodeProblemDetector docker container
used.
type: string
memoryLimit:
anyOf:
- type: integer
- type: string
description: 'MemoryLimit of NodeProblemDetector container. Default:
80Mi'
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
memoryRequest:
anyOf:
- type: integer
- type: string
description: 'MemoryRequest of NodeProblemDetector container.
Default: 80Mi'
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
nodeTerminationHandler: nodeTerminationHandler:
description: NodeTerminationHandler determines the cluster autoscaler description: NodeTerminationHandler determines the cluster autoscaler
configuration. configuration.

View File

@ -162,6 +162,8 @@ type ClusterSpec struct {
// NodeTerminationHandler determines the node termination handler configuration. // NodeTerminationHandler determines the node termination handler configuration.
NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"` NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"`
// NodeProblemDetector determines the node problem detector configuration.
NodeProblemDetector *NodeProblemDetectorConfig `json:"nodeProblemDetector,omitempty"`
// MetricsServer determines the metrics server configuration. // MetricsServer determines the metrics server configuration.
MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"` MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"`
// CertManager determines the metrics server configuration. // CertManager determines the metrics server configuration.

View File

@ -888,6 +888,28 @@ type NodeTerminationHandlerConfig struct {
CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` CPURequest *resource.Quantity `json:"cpuRequest,omitempty"`
} }
// NodeProblemDetectorConfig determines the node problem detector configuration.
// Every field is an optional pointer and is omitted from JSON when nil.
type NodeProblemDetectorConfig struct {
// Enabled enables the NodeProblemDetector.
// Default: false
Enabled *bool `json:"enabled,omitempty"`
// Image is the NodeProblemDetector docker container used.
Image *string `json:"image,omitempty"`
// MemoryRequest of NodeProblemDetector container.
// Default: 80Mi
MemoryRequest *resource.Quantity `json:"memoryRequest,omitempty"`
// CPURequest of NodeProblemDetector container.
// Default: 10m
CPURequest *resource.Quantity `json:"cpuRequest,omitempty"`
// MemoryLimit of NodeProblemDetector container.
// Default: 80Mi
MemoryLimit *resource.Quantity `json:"memoryLimit,omitempty"`
// CPULimit of NodeProblemDetector container.
// Default: 10m
CPULimit *resource.Quantity `json:"cpuLimit,omitempty"`
}
// ClusterAutoscalerConfig determines the cluster autoscaler configuration. // ClusterAutoscalerConfig determines the cluster autoscaler configuration.
type ClusterAutoscalerConfig struct { type ClusterAutoscalerConfig struct {
// Enabled enables the cluster autoscaler. // Enabled enables the cluster autoscaler.

View File

@ -160,6 +160,8 @@ type ClusterSpec struct {
// NodeTerminationHandler determines the cluster autoscaler configuration. // NodeTerminationHandler determines the cluster autoscaler configuration.
NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"` NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"`
// NodeProblemDetector determines the node problem detector configuration.
NodeProblemDetector *NodeProblemDetectorConfig `json:"nodeProblemDetector,omitempty"`
// MetricsServer determines the metrics server configuration. // MetricsServer determines the metrics server configuration.
MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"` MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"`
// CertManager determines the metrics server configuration. // CertManager determines the metrics server configuration.

View File

@ -887,6 +887,28 @@ type NodeTerminationHandlerConfig struct {
CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` CPURequest *resource.Quantity `json:"cpuRequest,omitempty"`
} }
// NodeProblemDetectorConfig determines the node problem detector configuration.
// Every field is an optional pointer and is omitted from JSON when nil.
type NodeProblemDetectorConfig struct {
// Enabled enables the NodeProblemDetector.
// Default: false
Enabled *bool `json:"enabled,omitempty"`
// Image is the NodeProblemDetector docker container used.
Image *string `json:"image,omitempty"`
// MemoryRequest of NodeProblemDetector container.
// Default: 80Mi
MemoryRequest *resource.Quantity `json:"memoryRequest,omitempty"`
// CPURequest of NodeProblemDetector container.
// Default: 10m
CPURequest *resource.Quantity `json:"cpuRequest,omitempty"`
// MemoryLimit of NodeProblemDetector container.
// Default: 80Mi
MemoryLimit *resource.Quantity `json:"memoryLimit,omitempty"`
// CPULimit of NodeProblemDetector container.
// Default: 10m
CPULimit *resource.Quantity `json:"cpuLimit,omitempty"`
}
// ClusterAutoscalerConfig determines the cluster autoscaler configuration. // ClusterAutoscalerConfig determines the cluster autoscaler configuration.
type ClusterAutoscalerConfig struct { type ClusterAutoscalerConfig struct {
// Enabled enables the cluster autoscaler. // Enabled enables the cluster autoscaler.

View File

@ -853,6 +853,16 @@ func RegisterConversions(s *runtime.Scheme) error {
}); err != nil { }); err != nil {
return err return err
} }
if err := s.AddGeneratedConversionFunc((*NodeProblemDetectorConfig)(nil), (*kops.NodeProblemDetectorConfig)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(a.(*NodeProblemDetectorConfig), b.(*kops.NodeProblemDetectorConfig), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*kops.NodeProblemDetectorConfig)(nil), (*NodeProblemDetectorConfig)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(a.(*kops.NodeProblemDetectorConfig), b.(*NodeProblemDetectorConfig), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*NodeTerminationHandlerConfig)(nil), (*kops.NodeTerminationHandlerConfig)(nil), func(a, b interface{}, scope conversion.Scope) error { if err := s.AddGeneratedConversionFunc((*NodeTerminationHandlerConfig)(nil), (*kops.NodeTerminationHandlerConfig)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(a.(*NodeTerminationHandlerConfig), b.(*kops.NodeTerminationHandlerConfig), scope) return Convert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(a.(*NodeTerminationHandlerConfig), b.(*kops.NodeTerminationHandlerConfig), scope)
}); err != nil { }); err != nil {
@ -2461,6 +2471,15 @@ func autoConvert_v1alpha2_ClusterSpec_To_kops_ClusterSpec(in *ClusterSpec, out *
} else { } else {
out.NodeTerminationHandler = nil out.NodeTerminationHandler = nil
} }
if in.NodeProblemDetector != nil {
in, out := &in.NodeProblemDetector, &out.NodeProblemDetector
*out = new(kops.NodeProblemDetectorConfig)
if err := Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(*in, *out, s); err != nil {
return err
}
} else {
out.NodeProblemDetector = nil
}
if in.MetricsServer != nil { if in.MetricsServer != nil {
in, out := &in.MetricsServer, &out.MetricsServer in, out := &in.MetricsServer, &out.MetricsServer
*out = new(kops.MetricsServerConfig) *out = new(kops.MetricsServerConfig)
@ -2864,6 +2883,15 @@ func autoConvert_kops_ClusterSpec_To_v1alpha2_ClusterSpec(in *kops.ClusterSpec,
} else { } else {
out.NodeTerminationHandler = nil out.NodeTerminationHandler = nil
} }
if in.NodeProblemDetector != nil {
in, out := &in.NodeProblemDetector, &out.NodeProblemDetector
*out = new(NodeProblemDetectorConfig)
if err := Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(*in, *out, s); err != nil {
return err
}
} else {
out.NodeProblemDetector = nil
}
if in.MetricsServer != nil { if in.MetricsServer != nil {
in, out := &in.MetricsServer, &out.MetricsServer in, out := &in.MetricsServer, &out.MetricsServer
*out = new(MetricsServerConfig) *out = new(MetricsServerConfig)
@ -5932,6 +5960,36 @@ func Convert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in *kops.Nod
return autoConvert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in, out, s) return autoConvert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in, out, s)
} }
// autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig
// assigns every field of the v1alpha2 config (in) to the same-named field of
// the internal kops config (out). Fields are assigned directly, not cloned.
// The scope argument is not used and the function always returns nil.
func autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in *NodeProblemDetectorConfig, out *kops.NodeProblemDetectorConfig, s conversion.Scope) error {
out.Enabled = in.Enabled
out.Image = in.Image
out.MemoryRequest = in.MemoryRequest
out.CPURequest = in.CPURequest
out.MemoryLimit = in.MemoryLimit
out.CPULimit = in.CPULimit
return nil
}
// Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig is an autogenerated conversion function.
// It forwards its arguments unchanged to
// autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig
// and returns that function's result.
func Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in *NodeProblemDetectorConfig, out *kops.NodeProblemDetectorConfig, s conversion.Scope) error {
return autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in, out, s)
}
// autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig
// assigns every field of the internal kops config (in) to the same-named field
// of the v1alpha2 config (out). Fields are assigned directly, not cloned.
// The scope argument is not used and the function always returns nil.
func autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in *kops.NodeProblemDetectorConfig, out *NodeProblemDetectorConfig, s conversion.Scope) error {
out.Enabled = in.Enabled
out.Image = in.Image
out.MemoryRequest = in.MemoryRequest
out.CPURequest = in.CPURequest
out.MemoryLimit = in.MemoryLimit
out.CPULimit = in.CPULimit
return nil
}
// Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig is an autogenerated conversion function.
// It forwards its arguments unchanged to
// autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig
// and returns that function's result.
func Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in *kops.NodeProblemDetectorConfig, out *NodeProblemDetectorConfig, s conversion.Scope) error {
return autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in, out, s)
}
func autoConvert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(in *NodeTerminationHandlerConfig, out *kops.NodeTerminationHandlerConfig, s conversion.Scope) error { func autoConvert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(in *NodeTerminationHandlerConfig, out *kops.NodeTerminationHandlerConfig, s conversion.Scope) error {
out.Enabled = in.Enabled out.Enabled = in.Enabled
out.EnableSpotInterruptionDraining = in.EnableSpotInterruptionDraining out.EnableSpotInterruptionDraining = in.EnableSpotInterruptionDraining

View File

@ -1045,6 +1045,11 @@ func (in *ClusterSpec) DeepCopyInto(out *ClusterSpec) {
*out = new(NodeTerminationHandlerConfig) *out = new(NodeTerminationHandlerConfig)
(*in).DeepCopyInto(*out) (*in).DeepCopyInto(*out)
} }
if in.NodeProblemDetector != nil {
in, out := &in.NodeProblemDetector, &out.NodeProblemDetector
*out = new(NodeProblemDetectorConfig)
(*in).DeepCopyInto(*out)
}
if in.MetricsServer != nil { if in.MetricsServer != nil {
in, out := &in.MetricsServer, &out.MetricsServer in, out := &in.MetricsServer, &out.MetricsServer
*out = new(MetricsServerConfig) *out = new(MetricsServerConfig)
@ -3954,6 +3959,52 @@ func (in *NodeLocalDNSConfig) DeepCopy() *NodeLocalDNSConfig {
return out return out
} }
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// Fields that are nil in the receiver stay nil in out; every non-nil field is
// given newly allocated storage in out.
func (in *NodeProblemDetectorConfig) DeepCopyInto(out *NodeProblemDetectorConfig) {
// Start from a shallow copy; the non-nil pointer fields are replaced below.
*out = *in
if in.Enabled != nil {
// in and out are shadowed here by pointers to the respective fields.
in, out := &in.Enabled, &out.Enabled
*out = new(bool)
**out = **in
}
if in.Image != nil {
in, out := &in.Image, &out.Image
*out = new(string)
**out = **in
}
if in.MemoryRequest != nil {
in, out := &in.MemoryRequest, &out.MemoryRequest
// DeepCopy's result is held in a local so that its address can be stored.
x := (*in).DeepCopy()
*out = &x
}
if in.CPURequest != nil {
in, out := &in.CPURequest, &out.CPURequest
x := (*in).DeepCopy()
*out = &x
}
if in.MemoryLimit != nil {
in, out := &in.MemoryLimit, &out.MemoryLimit
x := (*in).DeepCopy()
*out = &x
}
if in.CPULimit != nil {
in, out := &in.CPULimit, &out.CPULimit
x := (*in).DeepCopy()
*out = &x
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProblemDetectorConfig.
// A nil receiver yields nil; otherwise a new value is allocated and filled in
// by DeepCopyInto.
func (in *NodeProblemDetectorConfig) DeepCopy() *NodeProblemDetectorConfig {
if in == nil {
return nil
}
out := new(NodeProblemDetectorConfig)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) { func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) {
*out = *in *out = *in

View File

@ -1129,6 +1129,11 @@ func (in *ClusterSpec) DeepCopyInto(out *ClusterSpec) {
*out = new(NodeTerminationHandlerConfig) *out = new(NodeTerminationHandlerConfig)
(*in).DeepCopyInto(*out) (*in).DeepCopyInto(*out)
} }
if in.NodeProblemDetector != nil {
in, out := &in.NodeProblemDetector, &out.NodeProblemDetector
*out = new(NodeProblemDetectorConfig)
(*in).DeepCopyInto(*out)
}
if in.MetricsServer != nil { if in.MetricsServer != nil {
in, out := &in.MetricsServer, &out.MetricsServer in, out := &in.MetricsServer, &out.MetricsServer
*out = new(MetricsServerConfig) *out = new(MetricsServerConfig)
@ -4136,6 +4141,52 @@ func (in *NodeLocalDNSConfig) DeepCopy() *NodeLocalDNSConfig {
return out return out
} }
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// Fields that are nil in the receiver stay nil in out; every non-nil field is
// given newly allocated storage in out.
func (in *NodeProblemDetectorConfig) DeepCopyInto(out *NodeProblemDetectorConfig) {
// Start from a shallow copy; the non-nil pointer fields are replaced below.
*out = *in
if in.Enabled != nil {
// in and out are shadowed here by pointers to the respective fields.
in, out := &in.Enabled, &out.Enabled
*out = new(bool)
**out = **in
}
if in.Image != nil {
in, out := &in.Image, &out.Image
*out = new(string)
**out = **in
}
if in.MemoryRequest != nil {
in, out := &in.MemoryRequest, &out.MemoryRequest
// DeepCopy's result is held in a local so that its address can be stored.
x := (*in).DeepCopy()
*out = &x
}
if in.CPURequest != nil {
in, out := &in.CPURequest, &out.CPURequest
x := (*in).DeepCopy()
*out = &x
}
if in.MemoryLimit != nil {
in, out := &in.MemoryLimit, &out.MemoryLimit
x := (*in).DeepCopy()
*out = &x
}
if in.CPULimit != nil {
in, out := &in.CPULimit, &out.CPULimit
x := (*in).DeepCopy()
*out = &x
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProblemDetectorConfig.
// A nil receiver yields nil; otherwise a new value is allocated and filled in
// by DeepCopyInto.
func (in *NodeProblemDetectorConfig) DeepCopy() *NodeProblemDetectorConfig {
if in == nil {
return nil
}
out := new(NodeProblemDetectorConfig)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) { func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) {
*out = *in *out = *in

View File

@ -22,6 +22,7 @@ go_library(
"kubeproxy.go", "kubeproxy.go",
"kubescheduler.go", "kubescheduler.go",
"networking.go", "networking.go",
"nodeproblemdetector.go",
"nodeterminationhandler.go", "nodeterminationhandler.go",
"openstack.go", "openstack.go",
], ],

View File

@ -0,0 +1,69 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package components
import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/kops/pkg/apis/kops"
"k8s.io/kops/upup/pkg/fi"
"k8s.io/kops/upup/pkg/fi/loader"
)
// NodeProblemDetectorOptionsBuilder adds options for the node problem detector to the model.
// It embeds *OptionsContext.
type NodeProblemDetectorOptionsBuilder struct {
*OptionsContext
}
// Compile-time check that *NodeProblemDetectorOptionsBuilder implements loader.OptionsBuilder.
var _ loader.OptionsBuilder = &NodeProblemDetectorOptionsBuilder{}
// BuildOptions fills in defaults for the node problem detector section of the
// cluster spec passed as o, which must be a *kops.ClusterSpec.
//
// When the section is absent nothing is changed. Otherwise every field that is
// still nil receives its default, and fields that are already set are kept.
// The returned error is always nil.
func (b *NodeProblemDetectorOptionsBuilder) BuildOptions(o interface{}) error {
	cfg := o.(*kops.ClusterSpec).NodeProblemDetector
	if cfg == nil {
		return nil
	}

	if cfg.Enabled == nil {
		cfg.Enabled = fi.Bool(false)
	}

	// Resource defaults, applied in the order: CPU request, memory request,
	// CPU limit, memory limit.
	resourceDefaults := []struct {
		target   **resource.Quantity
		fallback string
	}{
		{&cfg.CPURequest, "10m"},
		{&cfg.MemoryRequest, "80Mi"},
		{&cfg.CPULimit, "10m"},
		{&cfg.MemoryLimit, "80Mi"},
	}
	for _, d := range resourceDefaults {
		if *d.target != nil {
			continue
		}
		// A fresh Quantity per field, so no two fields share one value.
		quantity := resource.MustParse(d.fallback)
		*d.target = &quantity
	}

	if cfg.Image == nil {
		cfg.Image = fi.String("k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.8")
	}

	return nil
}

View File

@ -50,6 +50,7 @@ go_library(
"cloudup/resources/addons/networking.cilium.io/k8s-1.16-v1.10.yaml.template", "cloudup/resources/addons/networking.cilium.io/k8s-1.16-v1.10.yaml.template",
"cloudup/resources/addons/networking.cilium.io/k8s-1.12-v1.9.yaml.template", "cloudup/resources/addons/networking.cilium.io/k8s-1.12-v1.9.yaml.template",
"cloudup/resources/addons/snapshot-controller.addons.k8s.io/k8s-1.20.yaml.template", "cloudup/resources/addons/snapshot-controller.addons.k8s.io/k8s-1.20.yaml.template",
"cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.17.yaml.template",
], ],
importpath = "k8s.io/kops/upup/models", importpath = "k8s.io/kops/upup/models",
visibility = ["//visibility:public"], visibility = ["//visibility:public"],

View File

@ -0,0 +1,188 @@
{{ with .NodeProblemDetector }}
# Sourced from https://github.com/kubernetes/node-problem-detector/tree/v0.8.8
---
# Source: node-problem-detector/deployment/node-problem-detector.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-problem-detector
namespace: kube-system
labels:
app: node-problem-detector
spec:
selector:
matchLabels:
app: node-problem-detector
template:
metadata:
labels:
app: node-problem-detector
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/os
operator: In
values:
- linux
containers:
- name: node-problem-detector
command:
- /node-problem-detector
- --logtostderr
- --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
image: {{ .Image }}
resources:
limits:
cpu: {{ .CPULimit }}
memory: {{ .MemoryLimit }}
requests:
cpu: {{ .CPURequest }}
memory: {{ .MemoryRequest }}
securityContext:
privileged: true
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- name: log
mountPath: /var/log
readOnly: true
- name: kmsg
mountPath: /dev/kmsg
readOnly: true
# Mount the host's /etc/localtime so that node problem detector uses the
# same timezone as the host.
- name: localtime
mountPath: /etc/localtime
readOnly: true
- name: config
mountPath: /config
readOnly: true
volumes:
- name: log
# Config `log` to your system log directory
hostPath:
path: /var/log/
- name: kmsg
hostPath:
path: /dev/kmsg
- name: localtime
hostPath:
path: /etc/localtime
- name: config
configMap:
name: node-problem-detector-config
items:
- key: kernel-monitor.json
path: kernel-monitor.json
- key: docker-monitor.json
path: docker-monitor.json
priorityClassName: system-node-critical
tolerations:
- effect: NoSchedule
operator: Exists
- effect: NoExecute
operator: Exists
---
# Source: node-problem-detector/deployment/node-problem-detector-config.yaml
apiVersion: v1
data:
kernel-monitor.json: |
{
"plugin": "kmsg",
"logPath": "/dev/kmsg",
"lookback": "5m",
"bufferSize": 10,
"source": "kernel-monitor",
"conditions": [
{
"type": "KernelDeadlock",
"reason": "KernelHasNoDeadlock",
"message": "kernel has no deadlock"
},
{
"type": "ReadonlyFilesystem",
"reason": "FilesystemIsNotReadOnly",
"message": "Filesystem is not read-only"
}
],
"rules": [
{
"type": "temporary",
"reason": "OOMKilling",
"pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
},
{
"type": "temporary",
"reason": "TaskHung",
"pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "temporary",
"reason": "UnregisterNetDevice",
"pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
},
{
"type": "temporary",
"reason": "KernelOops",
"pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
},
{
"type": "temporary",
"reason": "KernelOops",
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
},
{
"type": "temporary",
"reason": "MemoryReadError",
"pattern": "CE memory read error .*"
},
{
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "AUFSUmountHung",
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "DockerHung",
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "ReadonlyFilesystem",
"reason": "FilesystemIsReadOnly",
"pattern": "Remounting filesystem read-only"
}
]
}
docker-monitor.json: |
{
"plugin": "journald",
"pluginConfig": {
"source": "dockerd"
},
"logPath": "/var/log/journal",
"lookback": "5m",
"bufferSize": 10,
"source": "docker-monitor",
"conditions": [],
"rules": [
{
"type": "temporary",
"reason": "CorruptDockerImage",
"pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*"
}
]
}
kind: ConfigMap
metadata:
name: node-problem-detector-config
namespace: kube-system
{{ end }}

View File

@ -594,6 +594,27 @@ func (b *BootstrapChannelBuilder) buildAddons(c *fi.ModelBuilderContext) (*chann
} }
} }
npd := b.Cluster.Spec.NodeProblemDetector
if npd != nil && fi.BoolValue(npd.Enabled) {
key := "node-problem-detector.addons.k8s.io"
version := "0.8.8"
{
location := key + "/k8s-1.17.yaml"
id := "k8s-1.17"
addons.Spec.Addons = append(addons.Spec.Addons, &channelsapi.AddonSpec{
Name: fi.String(key),
Version: fi.String(version),
Selector: map[string]string{"k8s-addon": key},
Manifest: fi.String(location),
Id: id,
})
}
}
if b.Cluster.Spec.AWSLoadBalancerController != nil && fi.BoolValue(b.Cluster.Spec.AWSLoadBalancerController.Enabled) { if b.Cluster.Spec.AWSLoadBalancerController != nil && fi.BoolValue(b.Cluster.Spec.AWSLoadBalancerController.Enabled) {
key := "aws-load-balancer-controller.addons.k8s.io" key := "aws-load-balancer-controller.addons.k8s.io"

View File

@ -279,6 +279,7 @@ func (c *populateClusterSpec) run(clientset simple.Clientset) error {
codeModels = append(codeModels, &components.DiscoveryOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.DiscoveryOptionsBuilder{OptionsContext: optionsContext})
codeModels = append(codeModels, &components.ClusterAutoscalerOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.ClusterAutoscalerOptionsBuilder{OptionsContext: optionsContext})
codeModels = append(codeModels, &components.NodeTerminationHandlerOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.NodeTerminationHandlerOptionsBuilder{OptionsContext: optionsContext})
codeModels = append(codeModels, &components.NodeProblemDetectorOptionsBuilder{OptionsContext: optionsContext})
codeModels = append(codeModels, &components.AWSEBSCSIDriverOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.AWSEBSCSIDriverOptionsBuilder{OptionsContext: optionsContext})
codeModels = append(codeModels, &components.AWSCloudControllerManagerOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.AWSCloudControllerManagerOptionsBuilder{OptionsContext: optionsContext})
} }