diff --git a/docs/addons.md b/docs/addons.md index b20ed67771..a1a36163ed 100644 --- a/docs/addons.md +++ b/docs/addons.md @@ -190,6 +190,19 @@ The kOps CLI requires additional IAM permissions to manage the requisite EventBr **Warning: If you switch between the two operating modes on an existing cluster, the old resources have to be manually deleted. For IMDS to Queue Processor, this means deleting the k8s nth daemonset. For Queue Processor to IMDS, this means deleting the k8s nth deployment and the AWS resources: the SQS queue, EventBridge rules, and ASG Lifecycle hooks.** +#### Node Problem Detector + +{{ kops_feature_table(kops_added_default='1.22') }} + +[Node Problem Detector](https://github.com/kubernetes/node-problem-detector) aims to make various node problems visible to the upstream layers in the cluster management stack. It is a daemon that runs on each node, detects node problems and reports them to apiserver. + +```yaml +spec: + nodeProblemDetector: + enabled: true + memoryRequest: 32Mi + cpuRequest: 10m +``` #### Snapshot controller {{ kops_feature_table(kops_added_default='1.21', k8s_min='1.20') }} diff --git a/k8s/crds/kops.k8s.io_clusters.yaml b/k8s/crds/kops.k8s.io_clusters.yaml index d98c9031e7..3a4158ddd2 100644 --- a/k8s/crds/kops.k8s.io_clusters.yaml +++ b/k8s/crds/kops.k8s.io_clusters.yaml @@ -3965,6 +3965,51 @@ spec: items: type: string type: array + nodeProblemDetector: + description: NodeProblemDetector determines the node problem detector + configuration. + properties: + cpuLimit: + anyOf: + - type: integer + - type: string + description: 'CPULimit of NodeProblemDetector container. Default: + 10m' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + cpuRequest: + anyOf: + - type: integer + - type: string + description: 'CPURequest of NodeProblemDetector container. 
Default: + 10m' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + enabled: + description: 'Enabled enables the NodeProblemDetector. Default: + false' + type: boolean + image: + description: Image is the NodeProblemDetector docker container + used. + type: string + memoryLimit: + anyOf: + - type: integer + - type: string + description: 'MemoryLimit of NodeProblemDetector container. Default: + 80Mi' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + memoryRequest: + anyOf: + - type: integer + - type: string + description: 'MemoryRequest of NodeProblemDetector container. + Default: 80Mi' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object nodeTerminationHandler: description: NodeTerminationHandler determines the cluster autoscaler configuration. diff --git a/pkg/apis/kops/cluster.go b/pkg/apis/kops/cluster.go index 4e50991200..51473f99a3 100644 --- a/pkg/apis/kops/cluster.go +++ b/pkg/apis/kops/cluster.go @@ -161,6 +161,8 @@ type ClusterSpec struct { // NodeTerminationHandler determines the node termination handler configuration. NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"` + // NodeProblemDetector determines the node problem detector configuration. + NodeProblemDetector *NodeProblemDetectorConfig `json:"nodeProblemDetector,omitempty"` // MetricsServer determines the metrics server configuration. MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"` // CertManager determines the metrics server configuration. 
diff --git a/pkg/apis/kops/componentconfig.go b/pkg/apis/kops/componentconfig.go index 8c52f8b937..e23c338e2c 100644 --- a/pkg/apis/kops/componentconfig.go +++ b/pkg/apis/kops/componentconfig.go @@ -888,6 +888,28 @@ type NodeTerminationHandlerConfig struct { CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` } +// NodeProblemDetectorConfig determines the node problem detector configuration. +type NodeProblemDetectorConfig struct { + // Enabled enables the NodeProblemDetector. + // Default: false + Enabled *bool `json:"enabled,omitempty"` + // Image is the NodeProblemDetector docker container used. + Image *string `json:"image,omitempty"` + + // MemoryRequest of NodeProblemDetector container. + // Default: 80Mi + MemoryRequest *resource.Quantity `json:"memoryRequest,omitempty"` + // CPURequest of NodeProblemDetector container. + // Default: 10m + CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` + // MemoryLimit of NodeProblemDetector container. + // Default: 80Mi + MemoryLimit *resource.Quantity `json:"memoryLimit,omitempty"` + // CPULimit of NodeProblemDetector container. + // Default: 10m + CPULimit *resource.Quantity `json:"cpuLimit,omitempty"` +} + // ClusterAutoscalerConfig determines the cluster autoscaler configuration. type ClusterAutoscalerConfig struct { // Enabled enables the cluster autoscaler. diff --git a/pkg/apis/kops/v1alpha2/cluster.go b/pkg/apis/kops/v1alpha2/cluster.go index f96ef50b51..fdb0376bcc 100644 --- a/pkg/apis/kops/v1alpha2/cluster.go +++ b/pkg/apis/kops/v1alpha2/cluster.go @@ -160,6 +160,8 @@ type ClusterSpec struct { // NodeTerminationHandler determines the cluster autoscaler configuration. NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"` + // NodeProblemDetector determines the node problem detector configuration. + NodeProblemDetector *NodeProblemDetectorConfig `json:"nodeProblemDetector,omitempty"` // MetricsServer determines the metrics server configuration. 
MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"` // CertManager determines the metrics server configuration. diff --git a/pkg/apis/kops/v1alpha2/componentconfig.go b/pkg/apis/kops/v1alpha2/componentconfig.go index 5866a5a285..799f149f92 100644 --- a/pkg/apis/kops/v1alpha2/componentconfig.go +++ b/pkg/apis/kops/v1alpha2/componentconfig.go @@ -887,6 +887,28 @@ type NodeTerminationHandlerConfig struct { CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` } +// NodeProblemDetectorConfig determines the node problem detector configuration. +type NodeProblemDetectorConfig struct { + // Enabled enables the NodeProblemDetector. + // Default: false + Enabled *bool `json:"enabled,omitempty"` + // Image is the NodeProblemDetector docker container used. + Image *string `json:"image,omitempty"` + + // MemoryRequest of NodeProblemDetector container. + // Default: 80Mi + MemoryRequest *resource.Quantity `json:"memoryRequest,omitempty"` + // CPURequest of NodeProblemDetector container. + // Default: 10m + CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` + // MemoryLimit of NodeProblemDetector container. + // Default: 80Mi + MemoryLimit *resource.Quantity `json:"memoryLimit,omitempty"` + // CPULimit of NodeProblemDetector container. + // Default: 10m + CPULimit *resource.Quantity `json:"cpuLimit,omitempty"` +} + // ClusterAutoscalerConfig determines the cluster autoscaler configuration. type ClusterAutoscalerConfig struct { // Enabled enables the cluster autoscaler. 
diff --git a/pkg/apis/kops/v1alpha2/zz_generated.conversion.go b/pkg/apis/kops/v1alpha2/zz_generated.conversion.go index 60c526f31a..61367a5af9 100644 --- a/pkg/apis/kops/v1alpha2/zz_generated.conversion.go +++ b/pkg/apis/kops/v1alpha2/zz_generated.conversion.go @@ -853,6 +853,16 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddGeneratedConversionFunc((*NodeProblemDetectorConfig)(nil), (*kops.NodeProblemDetectorConfig)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(a.(*NodeProblemDetectorConfig), b.(*kops.NodeProblemDetectorConfig), scope) + }); err != nil { + return err + } + if err := s.AddGeneratedConversionFunc((*kops.NodeProblemDetectorConfig)(nil), (*NodeProblemDetectorConfig)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(a.(*kops.NodeProblemDetectorConfig), b.(*NodeProblemDetectorConfig), scope) + }); err != nil { + return err + } if err := s.AddGeneratedConversionFunc((*NodeTerminationHandlerConfig)(nil), (*kops.NodeTerminationHandlerConfig)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(a.(*NodeTerminationHandlerConfig), b.(*kops.NodeTerminationHandlerConfig), scope) }); err != nil { @@ -2437,6 +2447,15 @@ func autoConvert_v1alpha2_ClusterSpec_To_kops_ClusterSpec(in *ClusterSpec, out * } else { out.NodeTerminationHandler = nil } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(kops.NodeProblemDetectorConfig) + if err := Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(*in, *out, s); err != nil { + return err + } + } else { + out.NodeProblemDetector = nil + } if in.MetricsServer != nil { in, out := 
&in.MetricsServer, &out.MetricsServer *out = new(kops.MetricsServerConfig) @@ -2840,6 +2859,15 @@ func autoConvert_kops_ClusterSpec_To_v1alpha2_ClusterSpec(in *kops.ClusterSpec, } else { out.NodeTerminationHandler = nil } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(NodeProblemDetectorConfig) + if err := Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(*in, *out, s); err != nil { + return err + } + } else { + out.NodeProblemDetector = nil + } if in.MetricsServer != nil { in, out := &in.MetricsServer, &out.MetricsServer *out = new(MetricsServerConfig) @@ -5906,6 +5934,36 @@ func Convert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in *kops.Nod return autoConvert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in, out, s) } +func autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in *NodeProblemDetectorConfig, out *kops.NodeProblemDetectorConfig, s conversion.Scope) error { + out.Enabled = in.Enabled + out.Image = in.Image + out.MemoryRequest = in.MemoryRequest + out.CPURequest = in.CPURequest + out.MemoryLimit = in.MemoryLimit + out.CPULimit = in.CPULimit + return nil +} + +// Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig is an autogenerated conversion function. 
+func Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in *NodeProblemDetectorConfig, out *kops.NodeProblemDetectorConfig, s conversion.Scope) error { + return autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in, out, s) +} + +func autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in *kops.NodeProblemDetectorConfig, out *NodeProblemDetectorConfig, s conversion.Scope) error { + out.Enabled = in.Enabled + out.Image = in.Image + out.MemoryRequest = in.MemoryRequest + out.CPURequest = in.CPURequest + out.MemoryLimit = in.MemoryLimit + out.CPULimit = in.CPULimit + return nil +} + +// Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig is an autogenerated conversion function. +func Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in *kops.NodeProblemDetectorConfig, out *NodeProblemDetectorConfig, s conversion.Scope) error { + return autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in, out, s) +} + func autoConvert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(in *NodeTerminationHandlerConfig, out *kops.NodeTerminationHandlerConfig, s conversion.Scope) error { out.Enabled = in.Enabled out.EnableSpotInterruptionDraining = in.EnableSpotInterruptionDraining diff --git a/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go b/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go index 0378ce757f..c22429e5a3 100644 --- a/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go +++ b/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go @@ -1025,6 +1025,11 @@ func (in *ClusterSpec) DeepCopyInto(out *ClusterSpec) { *out = new(NodeTerminationHandlerConfig) (*in).DeepCopyInto(*out) } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(NodeProblemDetectorConfig) + (*in).DeepCopyInto(*out) + } if in.MetricsServer != nil { in, out := 
&in.MetricsServer, &out.MetricsServer *out = new(MetricsServerConfig) @@ -3934,6 +3939,52 @@ func (in *NodeLocalDNSConfig) DeepCopy() *NodeLocalDNSConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeProblemDetectorConfig) DeepCopyInto(out *NodeProblemDetectorConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.Image != nil { + in, out := &in.Image, &out.Image + *out = new(string) + **out = **in + } + if in.MemoryRequest != nil { + in, out := &in.MemoryRequest, &out.MemoryRequest + x := (*in).DeepCopy() + *out = &x + } + if in.CPURequest != nil { + in, out := &in.CPURequest, &out.CPURequest + x := (*in).DeepCopy() + *out = &x + } + if in.MemoryLimit != nil { + in, out := &in.MemoryLimit, &out.MemoryLimit + x := (*in).DeepCopy() + *out = &x + } + if in.CPULimit != nil { + in, out := &in.CPULimit, &out.CPULimit + x := (*in).DeepCopy() + *out = &x + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProblemDetectorConfig. +func (in *NodeProblemDetectorConfig) DeepCopy() *NodeProblemDetectorConfig { + if in == nil { + return nil + } + out := new(NodeProblemDetectorConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) { *out = *in diff --git a/pkg/apis/kops/zz_generated.deepcopy.go b/pkg/apis/kops/zz_generated.deepcopy.go index 4ca3dd8704..e9b14a1105 100644 --- a/pkg/apis/kops/zz_generated.deepcopy.go +++ b/pkg/apis/kops/zz_generated.deepcopy.go @@ -1109,6 +1109,11 @@ func (in *ClusterSpec) DeepCopyInto(out *ClusterSpec) { *out = new(NodeTerminationHandlerConfig) (*in).DeepCopyInto(*out) } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(NodeProblemDetectorConfig) + (*in).DeepCopyInto(*out) + } if in.MetricsServer != nil { in, out := &in.MetricsServer, &out.MetricsServer *out = new(MetricsServerConfig) @@ -4116,6 +4121,52 @@ func (in *NodeLocalDNSConfig) DeepCopy() *NodeLocalDNSConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeProblemDetectorConfig) DeepCopyInto(out *NodeProblemDetectorConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.Image != nil { + in, out := &in.Image, &out.Image + *out = new(string) + **out = **in + } + if in.MemoryRequest != nil { + in, out := &in.MemoryRequest, &out.MemoryRequest + x := (*in).DeepCopy() + *out = &x + } + if in.CPURequest != nil { + in, out := &in.CPURequest, &out.CPURequest + x := (*in).DeepCopy() + *out = &x + } + if in.MemoryLimit != nil { + in, out := &in.MemoryLimit, &out.MemoryLimit + x := (*in).DeepCopy() + *out = &x + } + if in.CPULimit != nil { + in, out := &in.CPULimit, &out.CPULimit + x := (*in).DeepCopy() + *out = &x + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProblemDetectorConfig. 
+func (in *NodeProblemDetectorConfig) DeepCopy() *NodeProblemDetectorConfig { + if in == nil { + return nil + } + out := new(NodeProblemDetectorConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) { *out = *in diff --git a/pkg/model/components/BUILD.bazel b/pkg/model/components/BUILD.bazel index b3dd01a07c..2b9e9f555e 100644 --- a/pkg/model/components/BUILD.bazel +++ b/pkg/model/components/BUILD.bazel @@ -22,6 +22,7 @@ go_library( "kubeproxy.go", "kubescheduler.go", "networking.go", + "nodeproblemdetector.go", "nodeterminationhandler.go", "openstack.go", ], diff --git a/pkg/model/components/nodeproblemdetector.go b/pkg/model/components/nodeproblemdetector.go new file mode 100644 index 0000000000..8655f1b9ee --- /dev/null +++ b/pkg/model/components/nodeproblemdetector.go @@ -0,0 +1,69 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package components + +import ( + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/kops/pkg/apis/kops" + "k8s.io/kops/upup/pkg/fi" + "k8s.io/kops/upup/pkg/fi/loader" +) + +// NodeProblemDetectorOptionsBuilder adds options for the node problem detector to the model. 
+type NodeProblemDetectorOptionsBuilder struct { + *OptionsContext +} + +var _ loader.OptionsBuilder = &NodeProblemDetectorOptionsBuilder{} + +func (b *NodeProblemDetectorOptionsBuilder) BuildOptions(o interface{}) error { + clusterSpec := o.(*kops.ClusterSpec) + if clusterSpec.NodeProblemDetector == nil { + return nil + } + npd := clusterSpec.NodeProblemDetector + + if npd.Enabled == nil { + npd.Enabled = fi.Bool(false) + } + + if npd.CPURequest == nil { + defaultCPURequest := resource.MustParse("10m") + npd.CPURequest = &defaultCPURequest + } + + if npd.MemoryRequest == nil { + defaultMemoryRequest := resource.MustParse("80Mi") + npd.MemoryRequest = &defaultMemoryRequest + } + + if npd.CPULimit == nil { + defaultCPULimit := resource.MustParse("10m") + npd.CPULimit = &defaultCPULimit + } + + if npd.MemoryLimit == nil { + defaultMemoryLimit := resource.MustParse("80Mi") + npd.MemoryLimit = &defaultMemoryLimit + } + + if npd.Image == nil { + npd.Image = fi.String("k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.8") + } + + return nil +} diff --git a/upup/models/BUILD.bazel b/upup/models/BUILD.bazel index c7b3eb7ab1..704c9be235 100644 --- a/upup/models/BUILD.bazel +++ b/upup/models/BUILD.bazel @@ -50,6 +50,7 @@ go_library( "cloudup/resources/addons/networking.cilium.io/k8s-1.16-v1.10.yaml.template", "cloudup/resources/addons/networking.cilium.io/k8s-1.12-v1.9.yaml.template", "cloudup/resources/addons/snapshot-controller.addons.k8s.io/k8s-1.20.yaml.template", + "cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.17.yaml.template", ], importpath = "k8s.io/kops/upup/models", visibility = ["//visibility:public"], diff --git a/upup/models/cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.17.yaml.template b/upup/models/cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.17.yaml.template new file mode 100644 index 0000000000..70a85d7a4e --- /dev/null +++ 
b/upup/models/cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.17.yaml.template @@ -0,0 +1,188 @@ +{{ with .NodeProblemDetector }} +# Sourced from https://github.com/kubernetes/node-problem-detector/tree/v0.8.8 +--- +# Source: node-problem-detector/deployment/node-problem-detector.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-problem-detector + namespace: kube-system + labels: + app: node-problem-detector +spec: + selector: + matchLabels: + app: node-problem-detector + template: + metadata: + labels: + app: node-problem-detector + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + containers: + - name: node-problem-detector + command: + - /node-problem-detector + - --logtostderr + - --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json + image: {{ .Image }} + resources: + limits: + cpu: {{ .CPULimit }} + memory: {{ .MemoryLimit }} + requests: + cpu: {{ .CPURequest }} + memory: {{ .MemoryRequest }} + securityContext: + privileged: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: log + mountPath: /var/log + readOnly: true + - name: kmsg + mountPath: /dev/kmsg + readOnly: true + # Make sure node problem detector is in the same timezone + # with the host. 
+ - name: localtime + mountPath: /etc/localtime + readOnly: true + - name: config + mountPath: /config + readOnly: true + volumes: + - name: log + # Config `log` to your system log directory + hostPath: + path: /var/log/ + - name: kmsg + hostPath: + path: /dev/kmsg + - name: localtime + hostPath: + path: /etc/localtime + - name: config + configMap: + name: node-problem-detector-config + items: + - key: kernel-monitor.json + path: kernel-monitor.json + - key: docker-monitor.json + path: docker-monitor.json + priorityClassName: system-node-critical + tolerations: + - effect: NoSchedule + operator: Exists + - effect: NoExecute + operator: Exists +--- +# Source: node-problem-detector/deployment/node-problem-detector-config.yaml +apiVersion: v1 +data: + kernel-monitor.json: | + { + "plugin": "kmsg", + "logPath": "/dev/kmsg", + "lookback": "5m", + "bufferSize": 10, + "source": "kernel-monitor", + "conditions": [ + { + "type": "KernelDeadlock", + "reason": "KernelHasNoDeadlock", + "message": "kernel has no deadlock" + }, + { + "type": "ReadonlyFilesystem", + "reason": "FilesystemIsNotReadOnly", + "message": "Filesystem is not read-only" + } + ], + "rules": [ + { + "type": "temporary", + "reason": "OOMKilling", + "pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*" + }, + { + "type": "temporary", + "reason": "TaskHung", + "pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "temporary", + "reason": "UnregisterNetDevice", + "pattern": "unregister_netdevice: waiting for \\w+ to become free. 
Usage count = \\d+" + }, + { + "type": "temporary", + "reason": "KernelOops", + "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*" + }, + { + "type": "temporary", + "reason": "KernelOops", + "pattern": "divide error: 0000 \\[#\\d+\\] SMP" + }, + { + "type": "temporary", + "reason": "MemoryReadError", + "pattern": "CE memory read error .*" + }, + { + "type": "permanent", + "condition": "KernelDeadlock", + "reason": "AUFSUmountHung", + "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "permanent", + "condition": "KernelDeadlock", + "reason": "DockerHung", + "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "permanent", + "condition": "ReadonlyFilesystem", + "reason": "FilesystemIsReadOnly", + "pattern": "Remounting filesystem read-only" + } + ] + } + docker-monitor.json: | + { + "plugin": "journald", + "pluginConfig": { + "source": "dockerd" + }, + "logPath": "/var/log/journal", + "lookback": "5m", + "bufferSize": 10, + "source": "docker-monitor", + "conditions": [], + "rules": [ + { + "type": "temporary", + "reason": "CorruptDockerImage", + "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*" + } + ] + } +kind: ConfigMap +metadata: + name: node-problem-detector-config + namespace: kube-system +{{ end }} diff --git a/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go b/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go index 6c307bb887..c587e355bc 100644 --- a/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go +++ b/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go @@ -594,6 +594,27 @@ func (b *BootstrapChannelBuilder) buildAddons(c *fi.ModelBuilderContext) (*chann } } + npd := b.Cluster.Spec.NodeProblemDetector + + if npd != nil && fi.BoolValue(npd.Enabled) { + + key := 
"node-problem-detector.addons.k8s.io" + version := "0.8.8" + + { + location := key + "/k8s-1.17.yaml" + id := "k8s-1.17" + + addons.Spec.Addons = append(addons.Spec.Addons, &channelsapi.AddonSpec{ + Name: fi.String(key), + Version: fi.String(version), + Selector: map[string]string{"k8s-addon": key}, + Manifest: fi.String(location), + Id: id, + }) + } + } + if b.Cluster.Spec.AWSLoadBalancerController != nil && fi.BoolValue(b.Cluster.Spec.AWSLoadBalancerController.Enabled) { key := "aws-load-balancer-controller.addons.k8s.io" diff --git a/upup/pkg/fi/cloudup/populate_cluster_spec.go b/upup/pkg/fi/cloudup/populate_cluster_spec.go index 6b55a0f027..48b7e77325 100644 --- a/upup/pkg/fi/cloudup/populate_cluster_spec.go +++ b/upup/pkg/fi/cloudup/populate_cluster_spec.go @@ -280,6 +280,7 @@ func (c *populateClusterSpec) run(clientset simple.Clientset) error { codeModels = append(codeModels, &components.DiscoveryOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.ClusterAutoscalerOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.NodeTerminationHandlerOptionsBuilder{OptionsContext: optionsContext}) + codeModels = append(codeModels, &components.NodeProblemDetectorOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.AWSEBSCSIDriverOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.AWSCloudControllerManagerOptionsBuilder{OptionsContext: optionsContext}) }