diff --git a/docs/addons.md b/docs/addons.md index b20ed67771..a1a36163ed 100644 --- a/docs/addons.md +++ b/docs/addons.md @@ -190,6 +190,19 @@ The kOps CLI requires additional IAM permissions to manage the requisite EventBr **Warning: If you switch between the two operating modes on an existing cluster, the old resources have to be manually deleted. For IMDS to Queue Processor, this means deleting the k8s nth daemonset. For Queue Processor to IMDS, this means deleting the k8s nth deployment and the AWS resources: the SQS queue, EventBridge rules, and ASG Lifecycle hooks.** +#### Node Problem Detector + +{{ kops_feature_table(kops_added_default='1.22') }} + +[Node Problem Detector](https://github.com/kubernetes/node-problem-detector) aims to make various node problems visible to the upstream layers in the cluster management stack. It is a daemon that runs on each node, detects node problems and reports them to apiserver. + +```yaml +spec: + nodeProblemDetector: + enabled: true + memoryRequest: 32Mi + cpuRequest: 10m +``` #### Snapshot controller {{ kops_feature_table(kops_added_default='1.21', k8s_min='1.20') }} diff --git a/k8s/crds/kops.k8s.io_clusters.yaml b/k8s/crds/kops.k8s.io_clusters.yaml index 2976d0b18a..de117eb23c 100644 --- a/k8s/crds/kops.k8s.io_clusters.yaml +++ b/k8s/crds/kops.k8s.io_clusters.yaml @@ -4016,6 +4016,51 @@ spec: items: type: string type: array + nodeProblemDetector: + description: NodeProblemDetector determines the node problem detector + configuration. + properties: + cpuLimit: + anyOf: + - type: integer + - type: string + description: 'CPULimit of NodeProblemDetector container. Default: + 10m' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + cpuRequest: + anyOf: + - type: integer + - type: string + description: 'CPURequest of NodeProblemDetector container. 
Default: + 10m' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + enabled: + description: 'Enabled enables the NodeProblemDetector. Default: + false' + type: boolean + image: + description: Image is the NodeProblemDetector docker container + used. + type: string + memoryLimit: + anyOf: + - type: integer + - type: string + description: 'MemoryLimit of NodeProblemDetector container. Default: + 80Mi' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + memoryRequest: + anyOf: + - type: integer + - type: string + description: 'MemoryRequest of NodeProblemDetector container. + Default: 80Mi' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object nodeTerminationHandler: description: NodeTerminationHandler determines the cluster autoscaler configuration. diff --git a/pkg/apis/kops/cluster.go b/pkg/apis/kops/cluster.go index 5da6439923..93572e939b 100644 --- a/pkg/apis/kops/cluster.go +++ b/pkg/apis/kops/cluster.go @@ -162,6 +162,8 @@ type ClusterSpec struct { // NodeTerminationHandler determines the node termination handler configuration. NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"` + // NodeProblemDetector determines the node problem detector configuration. + NodeProblemDetector *NodeProblemDetectorConfig `json:"nodeProblemDetector,omitempty"` // MetricsServer determines the metrics server configuration. MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"` // CertManager determines the metrics server configuration. 
diff --git a/pkg/apis/kops/componentconfig.go b/pkg/apis/kops/componentconfig.go index 8c52f8b937..e23c338e2c 100644 --- a/pkg/apis/kops/componentconfig.go +++ b/pkg/apis/kops/componentconfig.go @@ -888,6 +888,28 @@ type NodeTerminationHandlerConfig struct { CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` } +// NodeProblemDetectorConfig determines the node problem detector configuration. +type NodeProblemDetectorConfig struct { + // Enabled enables the NodeProblemDetector. + // Default: false + Enabled *bool `json:"enabled,omitempty"` + // Image is the NodeProblemDetector docker container used. + Image *string `json:"image,omitempty"` + + // MemoryRequest of NodeProblemDetector container. + // Default: 80Mi + MemoryRequest *resource.Quantity `json:"memoryRequest,omitempty"` + // CPURequest of NodeProblemDetector container. + // Default: 10m + CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` + // MemoryLimit of NodeProblemDetector container. + // Default: 80Mi + MemoryLimit *resource.Quantity `json:"memoryLimit,omitempty"` + // CPULimit of NodeProblemDetector container. + // Default: 10m + CPULimit *resource.Quantity `json:"cpuLimit,omitempty"` +} + // ClusterAutoscalerConfig determines the cluster autoscaler configuration. type ClusterAutoscalerConfig struct { // Enabled enables the cluster autoscaler. diff --git a/pkg/apis/kops/v1alpha2/cluster.go b/pkg/apis/kops/v1alpha2/cluster.go index f96ef50b51..fdb0376bcc 100644 --- a/pkg/apis/kops/v1alpha2/cluster.go +++ b/pkg/apis/kops/v1alpha2/cluster.go @@ -160,6 +160,8 @@ type ClusterSpec struct { // NodeTerminationHandler determines the cluster autoscaler configuration. NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"` + // NodeProblemDetector determines the node problem detector configuration. + NodeProblemDetector *NodeProblemDetectorConfig `json:"nodeProblemDetector,omitempty"` // MetricsServer determines the metrics server configuration.
MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"` // CertManager determines the metrics server configuration. diff --git a/pkg/apis/kops/v1alpha2/componentconfig.go b/pkg/apis/kops/v1alpha2/componentconfig.go index 5866a5a285..799f149f92 100644 --- a/pkg/apis/kops/v1alpha2/componentconfig.go +++ b/pkg/apis/kops/v1alpha2/componentconfig.go @@ -887,6 +887,28 @@ type NodeTerminationHandlerConfig struct { CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` } +// NodeProblemDetectorConfig determines the node problem detector configuration. +type NodeProblemDetectorConfig struct { + // Enabled enables the NodeProblemDetector. + // Default: false + Enabled *bool `json:"enabled,omitempty"` + // Image is the NodeProblemDetector docker container used. + Image *string `json:"image,omitempty"` + + // MemoryRequest of NodeProblemDetector container. + // Default: 80Mi + MemoryRequest *resource.Quantity `json:"memoryRequest,omitempty"` + // CPURequest of NodeProblemDetector container. + // Default: 10m + CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` + // MemoryLimit of NodeProblemDetector container. + // Default: 80Mi + MemoryLimit *resource.Quantity `json:"memoryLimit,omitempty"` + // CPULimit of NodeProblemDetector container. + // Default: 10m + CPULimit *resource.Quantity `json:"cpuLimit,omitempty"` +} + // ClusterAutoscalerConfig determines the cluster autoscaler configuration. type ClusterAutoscalerConfig struct { // Enabled enables the cluster autoscaler.
diff --git a/pkg/apis/kops/v1alpha2/zz_generated.conversion.go b/pkg/apis/kops/v1alpha2/zz_generated.conversion.go index 412043fcd2..16ac72caf1 100644 --- a/pkg/apis/kops/v1alpha2/zz_generated.conversion.go +++ b/pkg/apis/kops/v1alpha2/zz_generated.conversion.go @@ -853,6 +853,16 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddGeneratedConversionFunc((*NodeProblemDetectorConfig)(nil), (*kops.NodeProblemDetectorConfig)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(a.(*NodeProblemDetectorConfig), b.(*kops.NodeProblemDetectorConfig), scope) + }); err != nil { + return err + } + if err := s.AddGeneratedConversionFunc((*kops.NodeProblemDetectorConfig)(nil), (*NodeProblemDetectorConfig)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(a.(*kops.NodeProblemDetectorConfig), b.(*NodeProblemDetectorConfig), scope) + }); err != nil { + return err + } if err := s.AddGeneratedConversionFunc((*NodeTerminationHandlerConfig)(nil), (*kops.NodeTerminationHandlerConfig)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(a.(*NodeTerminationHandlerConfig), b.(*kops.NodeTerminationHandlerConfig), scope) }); err != nil { @@ -2461,6 +2471,15 @@ func autoConvert_v1alpha2_ClusterSpec_To_kops_ClusterSpec(in *ClusterSpec, out * } else { out.NodeTerminationHandler = nil } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(kops.NodeProblemDetectorConfig) + if err := Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(*in, *out, s); err != nil { + return err + } + } else { + out.NodeProblemDetector = nil + } if in.MetricsServer != nil { in, out := 
&in.MetricsServer, &out.MetricsServer *out = new(kops.MetricsServerConfig) @@ -2864,6 +2883,15 @@ func autoConvert_kops_ClusterSpec_To_v1alpha2_ClusterSpec(in *kops.ClusterSpec, } else { out.NodeTerminationHandler = nil } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(NodeProblemDetectorConfig) + if err := Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(*in, *out, s); err != nil { + return err + } + } else { + out.NodeProblemDetector = nil + } if in.MetricsServer != nil { in, out := &in.MetricsServer, &out.MetricsServer *out = new(MetricsServerConfig) @@ -5932,6 +5960,36 @@ func Convert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in *kops.Nod return autoConvert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in, out, s) } +func autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in *NodeProblemDetectorConfig, out *kops.NodeProblemDetectorConfig, s conversion.Scope) error { + out.Enabled = in.Enabled + out.Image = in.Image + out.MemoryRequest = in.MemoryRequest + out.CPURequest = in.CPURequest + out.MemoryLimit = in.MemoryLimit + out.CPULimit = in.CPULimit + return nil +} + +// Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig is an autogenerated conversion function. 
+func Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in *NodeProblemDetectorConfig, out *kops.NodeProblemDetectorConfig, s conversion.Scope) error { + return autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in, out, s) +} + +func autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in *kops.NodeProblemDetectorConfig, out *NodeProblemDetectorConfig, s conversion.Scope) error { + out.Enabled = in.Enabled + out.Image = in.Image + out.MemoryRequest = in.MemoryRequest + out.CPURequest = in.CPURequest + out.MemoryLimit = in.MemoryLimit + out.CPULimit = in.CPULimit + return nil +} + +// Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig is an autogenerated conversion function. +func Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in *kops.NodeProblemDetectorConfig, out *NodeProblemDetectorConfig, s conversion.Scope) error { + return autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in, out, s) +} + func autoConvert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(in *NodeTerminationHandlerConfig, out *kops.NodeTerminationHandlerConfig, s conversion.Scope) error { out.Enabled = in.Enabled out.EnableSpotInterruptionDraining = in.EnableSpotInterruptionDraining diff --git a/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go b/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go index 2d12661e06..57d38f72ae 100644 --- a/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go +++ b/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go @@ -1045,6 +1045,11 @@ func (in *ClusterSpec) DeepCopyInto(out *ClusterSpec) { *out = new(NodeTerminationHandlerConfig) (*in).DeepCopyInto(*out) } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(NodeProblemDetectorConfig) + (*in).DeepCopyInto(*out) + } if in.MetricsServer != nil { in, out := 
&in.MetricsServer, &out.MetricsServer *out = new(MetricsServerConfig) @@ -3954,6 +3959,52 @@ func (in *NodeLocalDNSConfig) DeepCopy() *NodeLocalDNSConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeProblemDetectorConfig) DeepCopyInto(out *NodeProblemDetectorConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.Image != nil { + in, out := &in.Image, &out.Image + *out = new(string) + **out = **in + } + if in.MemoryRequest != nil { + in, out := &in.MemoryRequest, &out.MemoryRequest + x := (*in).DeepCopy() + *out = &x + } + if in.CPURequest != nil { + in, out := &in.CPURequest, &out.CPURequest + x := (*in).DeepCopy() + *out = &x + } + if in.MemoryLimit != nil { + in, out := &in.MemoryLimit, &out.MemoryLimit + x := (*in).DeepCopy() + *out = &x + } + if in.CPULimit != nil { + in, out := &in.CPULimit, &out.CPULimit + x := (*in).DeepCopy() + *out = &x + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProblemDetectorConfig. +func (in *NodeProblemDetectorConfig) DeepCopy() *NodeProblemDetectorConfig { + if in == nil { + return nil + } + out := new(NodeProblemDetectorConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) { *out = *in diff --git a/pkg/apis/kops/zz_generated.deepcopy.go b/pkg/apis/kops/zz_generated.deepcopy.go index 3a94dee76b..80dd2894cf 100644 --- a/pkg/apis/kops/zz_generated.deepcopy.go +++ b/pkg/apis/kops/zz_generated.deepcopy.go @@ -1129,6 +1129,11 @@ func (in *ClusterSpec) DeepCopyInto(out *ClusterSpec) { *out = new(NodeTerminationHandlerConfig) (*in).DeepCopyInto(*out) } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(NodeProblemDetectorConfig) + (*in).DeepCopyInto(*out) + } if in.MetricsServer != nil { in, out := &in.MetricsServer, &out.MetricsServer *out = new(MetricsServerConfig) @@ -4136,6 +4141,52 @@ func (in *NodeLocalDNSConfig) DeepCopy() *NodeLocalDNSConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeProblemDetectorConfig) DeepCopyInto(out *NodeProblemDetectorConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.Image != nil { + in, out := &in.Image, &out.Image + *out = new(string) + **out = **in + } + if in.MemoryRequest != nil { + in, out := &in.MemoryRequest, &out.MemoryRequest + x := (*in).DeepCopy() + *out = &x + } + if in.CPURequest != nil { + in, out := &in.CPURequest, &out.CPURequest + x := (*in).DeepCopy() + *out = &x + } + if in.MemoryLimit != nil { + in, out := &in.MemoryLimit, &out.MemoryLimit + x := (*in).DeepCopy() + *out = &x + } + if in.CPULimit != nil { + in, out := &in.CPULimit, &out.CPULimit + x := (*in).DeepCopy() + *out = &x + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProblemDetectorConfig. 
+func (in *NodeProblemDetectorConfig) DeepCopy() *NodeProblemDetectorConfig { + if in == nil { + return nil + } + out := new(NodeProblemDetectorConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) { *out = *in diff --git a/pkg/model/components/BUILD.bazel b/pkg/model/components/BUILD.bazel index 0fa3092c2f..4e75f19e71 100644 --- a/pkg/model/components/BUILD.bazel +++ b/pkg/model/components/BUILD.bazel @@ -22,6 +22,7 @@ go_library( "kubeproxy.go", "kubescheduler.go", "networking.go", + "nodeproblemdetector.go", "nodeterminationhandler.go", "openstack.go", ], diff --git a/pkg/model/components/nodeproblemdetector.go b/pkg/model/components/nodeproblemdetector.go new file mode 100644 index 0000000000..8655f1b9ee --- /dev/null +++ b/pkg/model/components/nodeproblemdetector.go @@ -0,0 +1,69 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package components + +import ( + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/kops/pkg/apis/kops" + "k8s.io/kops/upup/pkg/fi" + "k8s.io/kops/upup/pkg/fi/loader" +) + +// NodeProblemDetectorOptionsBuilder adds options for the node problem detector to the model. 
+type NodeProblemDetectorOptionsBuilder struct { + *OptionsContext +} + +var _ loader.OptionsBuilder = &NodeProblemDetectorOptionsBuilder{} + +func (b *NodeProblemDetectorOptionsBuilder) BuildOptions(o interface{}) error { + clusterSpec := o.(*kops.ClusterSpec) + if clusterSpec.NodeProblemDetector == nil { + return nil + } + npd := clusterSpec.NodeProblemDetector + + if npd.Enabled == nil { + npd.Enabled = fi.Bool(false) + } + + if npd.CPURequest == nil { + defaultCPURequest := resource.MustParse("10m") + npd.CPURequest = &defaultCPURequest + } + + if npd.MemoryRequest == nil { + defaultMemoryRequest := resource.MustParse("80Mi") + npd.MemoryRequest = &defaultMemoryRequest + } + + if npd.CPULimit == nil { + defaultCPULimit := resource.MustParse("10m") + npd.CPULimit = &defaultCPULimit + } + + if npd.MemoryLimit == nil { + defaultMemoryLimit := resource.MustParse("80Mi") + npd.MemoryLimit = &defaultMemoryLimit + } + + if npd.Image == nil { + npd.Image = fi.String("k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.8") + } + + return nil +} diff --git a/upup/models/BUILD.bazel b/upup/models/BUILD.bazel index c7b3eb7ab1..704c9be235 100644 --- a/upup/models/BUILD.bazel +++ b/upup/models/BUILD.bazel @@ -50,6 +50,7 @@ go_library( "cloudup/resources/addons/networking.cilium.io/k8s-1.16-v1.10.yaml.template", "cloudup/resources/addons/networking.cilium.io/k8s-1.12-v1.9.yaml.template", "cloudup/resources/addons/snapshot-controller.addons.k8s.io/k8s-1.20.yaml.template", + "cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.17.yaml.template", ], importpath = "k8s.io/kops/upup/models", visibility = ["//visibility:public"], diff --git a/upup/models/cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.17.yaml.template b/upup/models/cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.17.yaml.template new file mode 100644 index 0000000000..70a85d7a4e --- /dev/null +++ 
b/upup/models/cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.17.yaml.template @@ -0,0 +1,188 @@ +{{ with .NodeProblemDetector }} +# Sourced from https://github.com/kubernetes/node-problem-detector/tree/v0.8.8 +--- +# Source: node-problem-detector/deployment/node-problem-detector.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-problem-detector + namespace: kube-system + labels: + app: node-problem-detector +spec: + selector: + matchLabels: + app: node-problem-detector + template: + metadata: + labels: + app: node-problem-detector + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + containers: + - name: node-problem-detector + command: + - /node-problem-detector + - --logtostderr + - --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json + image: {{ .Image }} + resources: + limits: + cpu: {{ .CPULimit }} + memory: {{ .MemoryLimit }} + requests: + cpu: {{ .CPURequest }} + memory: {{ .MemoryRequest }} + securityContext: + privileged: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: log + mountPath: /var/log + readOnly: true + - name: kmsg + mountPath: /dev/kmsg + readOnly: true + # Make sure node problem detector is in the same timezone + # with the host. 
+ - name: localtime + mountPath: /etc/localtime + readOnly: true + - name: config + mountPath: /config + readOnly: true + volumes: + - name: log + # Config `log` to your system log directory + hostPath: + path: /var/log/ + - name: kmsg + hostPath: + path: /dev/kmsg + - name: localtime + hostPath: + path: /etc/localtime + - name: config + configMap: + name: node-problem-detector-config + items: + - key: kernel-monitor.json + path: kernel-monitor.json + - key: docker-monitor.json + path: docker-monitor.json + priorityClassName: system-node-critical + tolerations: + - effect: NoSchedule + operator: Exists + - effect: NoExecute + operator: Exists +--- +# Source: node-problem-detector/deployment/node-problem-detector-config.yaml +apiVersion: v1 +data: + kernel-monitor.json: | + { + "plugin": "kmsg", + "logPath": "/dev/kmsg", + "lookback": "5m", + "bufferSize": 10, + "source": "kernel-monitor", + "conditions": [ + { + "type": "KernelDeadlock", + "reason": "KernelHasNoDeadlock", + "message": "kernel has no deadlock" + }, + { + "type": "ReadonlyFilesystem", + "reason": "FilesystemIsNotReadOnly", + "message": "Filesystem is not read-only" + } + ], + "rules": [ + { + "type": "temporary", + "reason": "OOMKilling", + "pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*" + }, + { + "type": "temporary", + "reason": "TaskHung", + "pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "temporary", + "reason": "UnregisterNetDevice", + "pattern": "unregister_netdevice: waiting for \\w+ to become free. 
Usage count = \\d+" + }, + { + "type": "temporary", + "reason": "KernelOops", + "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*" + }, + { + "type": "temporary", + "reason": "KernelOops", + "pattern": "divide error: 0000 \\[#\\d+\\] SMP" + }, + { + "type": "temporary", + "reason": "MemoryReadError", + "pattern": "CE memory read error .*" + }, + { + "type": "permanent", + "condition": "KernelDeadlock", + "reason": "AUFSUmountHung", + "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "permanent", + "condition": "KernelDeadlock", + "reason": "DockerHung", + "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "permanent", + "condition": "ReadonlyFilesystem", + "reason": "FilesystemIsReadOnly", + "pattern": "Remounting filesystem read-only" + } + ] + } + docker-monitor.json: | + { + "plugin": "journald", + "pluginConfig": { + "source": "dockerd" + }, + "logPath": "/var/log/journal", + "lookback": "5m", + "bufferSize": 10, + "source": "docker-monitor", + "conditions": [], + "rules": [ + { + "type": "temporary", + "reason": "CorruptDockerImage", + "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*" + } + ] + } +kind: ConfigMap +metadata: + name: node-problem-detector-config + namespace: kube-system +{{ end }} diff --git a/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go b/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go index 3074feaa2b..cd8ca16541 100644 --- a/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go +++ b/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go @@ -594,6 +594,27 @@ func (b *BootstrapChannelBuilder) buildAddons(c *fi.ModelBuilderContext) (*chann } } + npd := b.Cluster.Spec.NodeProblemDetector + + if npd != nil && fi.BoolValue(npd.Enabled) { + + key := 
"node-problem-detector.addons.k8s.io" + version := "0.8.8" + + { + location := key + "/k8s-1.17.yaml" + id := "k8s-1.17" + + addons.Spec.Addons = append(addons.Spec.Addons, &channelsapi.AddonSpec{ + Name: fi.String(key), + Version: fi.String(version), + Selector: map[string]string{"k8s-addon": key}, + Manifest: fi.String(location), + Id: id, + }) + } + } + if b.Cluster.Spec.AWSLoadBalancerController != nil && fi.BoolValue(b.Cluster.Spec.AWSLoadBalancerController.Enabled) { key := "aws-load-balancer-controller.addons.k8s.io" diff --git a/upup/pkg/fi/cloudup/populate_cluster_spec.go b/upup/pkg/fi/cloudup/populate_cluster_spec.go index 8a7cecbe39..f7bd709729 100644 --- a/upup/pkg/fi/cloudup/populate_cluster_spec.go +++ b/upup/pkg/fi/cloudup/populate_cluster_spec.go @@ -279,6 +279,7 @@ func (c *populateClusterSpec) run(clientset simple.Clientset) error { codeModels = append(codeModels, &components.DiscoveryOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.ClusterAutoscalerOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.NodeTerminationHandlerOptionsBuilder{OptionsContext: optionsContext}) + codeModels = append(codeModels, &components.NodeProblemDetectorOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.AWSEBSCSIDriverOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.AWSCloudControllerManagerOptionsBuilder{OptionsContext: optionsContext}) }