mirror of https://github.com/kubernetes/kops.git
[addons] Introduce NodeProblemDetector
Node Problem Detector aims to make various node problems visible to the upstream layers in the cluster management stack. It is a daemon that runs on each node, detects node problems and reports them to the apiserver, so as to avoid scheduling new pods on bad nodes and to make it easy to identify which problems affect the underlying nodes. Project Home: https://github.com/kubernetes/node-problem-detector Signed-off-by: dntosas <ntosas@gmail.com>
This commit is contained in:
parent
466877c8d5
commit
20124d3ba9
|
@ -190,6 +190,19 @@ The kOps CLI requires additional IAM permissions to manage the requisite EventBr
|
|||
|
||||
**Warning: If you switch between the two operating modes on an existing cluster, the old resources have to be manually deleted. For IMDS to Queue Processor, this means deleting the k8s nth daemonset. For Queue Processor to IMDS, this means deleting the k8s nth deployment and the AWS resources: the SQS queue, EventBridge rules, and ASG Lifecycle hooks.**
|
||||
|
||||
#### Node Problem Detector
|
||||
|
||||
{{ kops_feature_table(kops_added_default='1.22') }}
|
||||
|
||||
[Node Problem Detector](https://github.com/kubernetes/node-problem-detector) aims to make various node problems visible to the upstream layers in the cluster management stack. It is a daemon that runs on each node, detects node problems and reports them to apiserver.
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
nodeProblemDetector:
|
||||
enabled: true
|
||||
memoryRequest: 32Mi
|
||||
cpuRequest: 10m
|
||||
```
|
||||
#### Snapshot controller
|
||||
|
||||
{{ kops_feature_table(kops_added_default='1.21', k8s_min='1.20') }}
|
||||
|
|
|
@ -3965,6 +3965,51 @@ spec:
|
|||
items:
|
||||
type: string
|
||||
type: array
|
||||
nodeProblemDetector:
|
||||
description: NodeProblemDetector determines the node problem detector
|
||||
configuration.
|
||||
properties:
|
||||
cpuLimit:
|
||||
anyOf:
|
||||
- type: integer
|
||||
- type: string
|
||||
description: 'CPULimit of NodeProblemDetector container. Default:
|
||||
10m'
|
||||
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
|
||||
x-kubernetes-int-or-string: true
|
||||
cpuRequest:
|
||||
anyOf:
|
||||
- type: integer
|
||||
- type: string
|
||||
description: 'CPURequest of NodeProblemDetector container. Default:
|
||||
10m'
|
||||
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
|
||||
x-kubernetes-int-or-string: true
|
||||
enabled:
|
||||
description: 'Enabled enables the NodeProblemDetector. Default:
|
||||
false'
|
||||
type: boolean
|
||||
image:
|
||||
description: Image is the NodeProblemDetector docker container
|
||||
used.
|
||||
type: string
|
||||
memoryLimit:
|
||||
anyOf:
|
||||
- type: integer
|
||||
- type: string
|
||||
description: 'MemoryLimit of NodeProblemDetector container. Default:
|
||||
80Mi'
|
||||
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
|
||||
x-kubernetes-int-or-string: true
|
||||
memoryRequest:
|
||||
anyOf:
|
||||
- type: integer
|
||||
- type: string
|
||||
description: 'MemoryRequest of NodeProblemDetector container.
|
||||
Default: 80Mi'
|
||||
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
|
||||
x-kubernetes-int-or-string: true
|
||||
type: object
|
||||
nodeTerminationHandler:
|
||||
description: NodeTerminationHandler determines the node termination handler
|
||||
configuration.
|
||||
|
|
|
@ -161,6 +161,8 @@ type ClusterSpec struct {
|
|||
|
||||
// NodeTerminationHandler determines the node termination handler configuration.
|
||||
NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"`
|
||||
// NodeProblemDetector determines the node problem detector configuration.
|
||||
NodeProblemDetector *NodeProblemDetectorConfig `json:"nodeProblemDetector,omitempty"`
|
||||
// MetricsServer determines the metrics server configuration.
|
||||
MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"`
|
||||
// CertManager determines the cert manager configuration.
|
||||
|
|
|
@ -888,6 +888,28 @@ type NodeTerminationHandlerConfig struct {
|
|||
CPURequest *resource.Quantity `json:"cpuRequest,omitempty"`
|
||||
}
|
||||
|
||||
// NodeProblemDetectorConfig determines the node problem detector configuration.
|
||||
type NodeProblemDetectorConfig struct {
|
||||
// Enabled enables the NodeProblemDetector.
|
||||
// Default: false
|
||||
Enabled *bool `json:"enabled,omitempty"`
|
||||
// Image is the NodeProblemDetector docker container used.
|
||||
Image *string `json:"image,omitempty"`
|
||||
|
||||
// MemoryRequest of NodeProblemDetector container.
|
||||
// Default: 80Mi
|
||||
MemoryRequest *resource.Quantity `json:"memoryRequest,omitempty"`
|
||||
// CPURequest of NodeProblemDetector container.
|
||||
// Default: 10m
|
||||
CPURequest *resource.Quantity `json:"cpuRequest,omitempty"`
|
||||
// MemoryLimit of NodeProblemDetector container.
|
||||
// Default: 80Mi
|
||||
MemoryLimit *resource.Quantity `json:"memoryLimit,omitempty"`
|
||||
// CPULimit of NodeProblemDetector container.
|
||||
// Default: 10m
|
||||
CPULimit *resource.Quantity `json:"cpuLimit,omitempty"`
|
||||
}
|
||||
|
||||
// ClusterAutoscalerConfig determines the cluster autoscaler configuration.
|
||||
type ClusterAutoscalerConfig struct {
|
||||
// Enabled enables the cluster autoscaler.
|
||||
|
|
|
@ -160,6 +160,8 @@ type ClusterSpec struct {
|
|||
|
||||
// NodeTerminationHandler determines the node termination handler configuration.
|
||||
NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"`
|
||||
// NodeProblemDetector determines the node problem detector configuration.
|
||||
NodeProblemDetector *NodeProblemDetectorConfig `json:"nodeProblemDetector,omitempty"`
|
||||
// MetricsServer determines the metrics server configuration.
|
||||
MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"`
|
||||
// CertManager determines the cert manager configuration.
|
||||
|
|
|
@ -887,6 +887,28 @@ type NodeTerminationHandlerConfig struct {
|
|||
CPURequest *resource.Quantity `json:"cpuRequest,omitempty"`
|
||||
}
|
||||
|
||||
// NodeProblemDetectorConfig determines the node problem detector configuration.
|
||||
type NodeProblemDetectorConfig struct {
|
||||
// Enabled enables the NodeProblemDetector.
|
||||
// Default: false
|
||||
Enabled *bool `json:"enabled,omitempty"`
|
||||
// Image is the NodeProblemDetector docker container used.
|
||||
Image *string `json:"image,omitempty"`
|
||||
|
||||
// MemoryRequest of NodeProblemDetector container.
|
||||
// Default: 80Mi
|
||||
MemoryRequest *resource.Quantity `json:"memoryRequest,omitempty"`
|
||||
// CPURequest of NodeProblemDetector container.
|
||||
// Default: 10m
|
||||
CPURequest *resource.Quantity `json:"cpuRequest,omitempty"`
|
||||
// MemoryLimit of NodeProblemDetector container.
|
||||
// Default: 80Mi
|
||||
MemoryLimit *resource.Quantity `json:"memoryLimit,omitempty"`
|
||||
// CPULimit of NodeProblemDetector container.
|
||||
// Default: 10m
|
||||
CPULimit *resource.Quantity `json:"cpuLimit,omitempty"`
|
||||
}
|
||||
|
||||
// ClusterAutoscalerConfig determines the cluster autoscaler configuration.
|
||||
type ClusterAutoscalerConfig struct {
|
||||
// Enabled enables the cluster autoscaler.
|
||||
|
|
|
@ -853,6 +853,16 @@ func RegisterConversions(s *runtime.Scheme) error {
|
|||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := s.AddGeneratedConversionFunc((*NodeProblemDetectorConfig)(nil), (*kops.NodeProblemDetectorConfig)(nil), func(a, b interface{}, scope conversion.Scope) error {
|
||||
return Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(a.(*NodeProblemDetectorConfig), b.(*kops.NodeProblemDetectorConfig), scope)
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := s.AddGeneratedConversionFunc((*kops.NodeProblemDetectorConfig)(nil), (*NodeProblemDetectorConfig)(nil), func(a, b interface{}, scope conversion.Scope) error {
|
||||
return Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(a.(*kops.NodeProblemDetectorConfig), b.(*NodeProblemDetectorConfig), scope)
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := s.AddGeneratedConversionFunc((*NodeTerminationHandlerConfig)(nil), (*kops.NodeTerminationHandlerConfig)(nil), func(a, b interface{}, scope conversion.Scope) error {
|
||||
return Convert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(a.(*NodeTerminationHandlerConfig), b.(*kops.NodeTerminationHandlerConfig), scope)
|
||||
}); err != nil {
|
||||
|
@ -2437,6 +2447,15 @@ func autoConvert_v1alpha2_ClusterSpec_To_kops_ClusterSpec(in *ClusterSpec, out *
|
|||
} else {
|
||||
out.NodeTerminationHandler = nil
|
||||
}
|
||||
if in.NodeProblemDetector != nil {
|
||||
in, out := &in.NodeProblemDetector, &out.NodeProblemDetector
|
||||
*out = new(kops.NodeProblemDetectorConfig)
|
||||
if err := Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(*in, *out, s); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
out.NodeProblemDetector = nil
|
||||
}
|
||||
if in.MetricsServer != nil {
|
||||
in, out := &in.MetricsServer, &out.MetricsServer
|
||||
*out = new(kops.MetricsServerConfig)
|
||||
|
@ -2840,6 +2859,15 @@ func autoConvert_kops_ClusterSpec_To_v1alpha2_ClusterSpec(in *kops.ClusterSpec,
|
|||
} else {
|
||||
out.NodeTerminationHandler = nil
|
||||
}
|
||||
if in.NodeProblemDetector != nil {
|
||||
in, out := &in.NodeProblemDetector, &out.NodeProblemDetector
|
||||
*out = new(NodeProblemDetectorConfig)
|
||||
if err := Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(*in, *out, s); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
out.NodeProblemDetector = nil
|
||||
}
|
||||
if in.MetricsServer != nil {
|
||||
in, out := &in.MetricsServer, &out.MetricsServer
|
||||
*out = new(MetricsServerConfig)
|
||||
|
@ -5906,6 +5934,36 @@ func Convert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in *kops.Nod
|
|||
return autoConvert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in, out, s)
|
||||
}
|
||||
|
||||
func autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in *NodeProblemDetectorConfig, out *kops.NodeProblemDetectorConfig, s conversion.Scope) error {
|
||||
out.Enabled = in.Enabled
|
||||
out.Image = in.Image
|
||||
out.MemoryRequest = in.MemoryRequest
|
||||
out.CPURequest = in.CPURequest
|
||||
out.MemoryLimit = in.MemoryLimit
|
||||
out.CPULimit = in.CPULimit
|
||||
return nil
|
||||
}
|
||||
|
||||
// Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig is an autogenerated conversion function.
|
||||
func Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in *NodeProblemDetectorConfig, out *kops.NodeProblemDetectorConfig, s conversion.Scope) error {
|
||||
return autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in, out, s)
|
||||
}
|
||||
|
||||
func autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in *kops.NodeProblemDetectorConfig, out *NodeProblemDetectorConfig, s conversion.Scope) error {
|
||||
out.Enabled = in.Enabled
|
||||
out.Image = in.Image
|
||||
out.MemoryRequest = in.MemoryRequest
|
||||
out.CPURequest = in.CPURequest
|
||||
out.MemoryLimit = in.MemoryLimit
|
||||
out.CPULimit = in.CPULimit
|
||||
return nil
|
||||
}
|
||||
|
||||
// Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig is an autogenerated conversion function.
|
||||
func Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in *kops.NodeProblemDetectorConfig, out *NodeProblemDetectorConfig, s conversion.Scope) error {
|
||||
return autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in, out, s)
|
||||
}
|
||||
|
||||
func autoConvert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(in *NodeTerminationHandlerConfig, out *kops.NodeTerminationHandlerConfig, s conversion.Scope) error {
|
||||
out.Enabled = in.Enabled
|
||||
out.EnableSpotInterruptionDraining = in.EnableSpotInterruptionDraining
|
||||
|
|
|
@ -1025,6 +1025,11 @@ func (in *ClusterSpec) DeepCopyInto(out *ClusterSpec) {
|
|||
*out = new(NodeTerminationHandlerConfig)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
if in.NodeProblemDetector != nil {
|
||||
in, out := &in.NodeProblemDetector, &out.NodeProblemDetector
|
||||
*out = new(NodeProblemDetectorConfig)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
if in.MetricsServer != nil {
|
||||
in, out := &in.MetricsServer, &out.MetricsServer
|
||||
*out = new(MetricsServerConfig)
|
||||
|
@ -3934,6 +3939,52 @@ func (in *NodeLocalDNSConfig) DeepCopy() *NodeLocalDNSConfig {
|
|||
return out
|
||||
}
|
||||
|
||||
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
|
||||
func (in *NodeProblemDetectorConfig) DeepCopyInto(out *NodeProblemDetectorConfig) {
|
||||
*out = *in
|
||||
if in.Enabled != nil {
|
||||
in, out := &in.Enabled, &out.Enabled
|
||||
*out = new(bool)
|
||||
**out = **in
|
||||
}
|
||||
if in.Image != nil {
|
||||
in, out := &in.Image, &out.Image
|
||||
*out = new(string)
|
||||
**out = **in
|
||||
}
|
||||
if in.MemoryRequest != nil {
|
||||
in, out := &in.MemoryRequest, &out.MemoryRequest
|
||||
x := (*in).DeepCopy()
|
||||
*out = &x
|
||||
}
|
||||
if in.CPURequest != nil {
|
||||
in, out := &in.CPURequest, &out.CPURequest
|
||||
x := (*in).DeepCopy()
|
||||
*out = &x
|
||||
}
|
||||
if in.MemoryLimit != nil {
|
||||
in, out := &in.MemoryLimit, &out.MemoryLimit
|
||||
x := (*in).DeepCopy()
|
||||
*out = &x
|
||||
}
|
||||
if in.CPULimit != nil {
|
||||
in, out := &in.CPULimit, &out.CPULimit
|
||||
x := (*in).DeepCopy()
|
||||
*out = &x
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProblemDetectorConfig.
|
||||
func (in *NodeProblemDetectorConfig) DeepCopy() *NodeProblemDetectorConfig {
|
||||
if in == nil {
|
||||
return nil
|
||||
}
|
||||
out := new(NodeProblemDetectorConfig)
|
||||
in.DeepCopyInto(out)
|
||||
return out
|
||||
}
|
||||
|
||||
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
|
||||
func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) {
|
||||
*out = *in
|
||||
|
|
|
@ -1109,6 +1109,11 @@ func (in *ClusterSpec) DeepCopyInto(out *ClusterSpec) {
|
|||
*out = new(NodeTerminationHandlerConfig)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
if in.NodeProblemDetector != nil {
|
||||
in, out := &in.NodeProblemDetector, &out.NodeProblemDetector
|
||||
*out = new(NodeProblemDetectorConfig)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
if in.MetricsServer != nil {
|
||||
in, out := &in.MetricsServer, &out.MetricsServer
|
||||
*out = new(MetricsServerConfig)
|
||||
|
@ -4116,6 +4121,52 @@ func (in *NodeLocalDNSConfig) DeepCopy() *NodeLocalDNSConfig {
|
|||
return out
|
||||
}
|
||||
|
||||
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
|
||||
func (in *NodeProblemDetectorConfig) DeepCopyInto(out *NodeProblemDetectorConfig) {
|
||||
*out = *in
|
||||
if in.Enabled != nil {
|
||||
in, out := &in.Enabled, &out.Enabled
|
||||
*out = new(bool)
|
||||
**out = **in
|
||||
}
|
||||
if in.Image != nil {
|
||||
in, out := &in.Image, &out.Image
|
||||
*out = new(string)
|
||||
**out = **in
|
||||
}
|
||||
if in.MemoryRequest != nil {
|
||||
in, out := &in.MemoryRequest, &out.MemoryRequest
|
||||
x := (*in).DeepCopy()
|
||||
*out = &x
|
||||
}
|
||||
if in.CPURequest != nil {
|
||||
in, out := &in.CPURequest, &out.CPURequest
|
||||
x := (*in).DeepCopy()
|
||||
*out = &x
|
||||
}
|
||||
if in.MemoryLimit != nil {
|
||||
in, out := &in.MemoryLimit, &out.MemoryLimit
|
||||
x := (*in).DeepCopy()
|
||||
*out = &x
|
||||
}
|
||||
if in.CPULimit != nil {
|
||||
in, out := &in.CPULimit, &out.CPULimit
|
||||
x := (*in).DeepCopy()
|
||||
*out = &x
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProblemDetectorConfig.
|
||||
func (in *NodeProblemDetectorConfig) DeepCopy() *NodeProblemDetectorConfig {
|
||||
if in == nil {
|
||||
return nil
|
||||
}
|
||||
out := new(NodeProblemDetectorConfig)
|
||||
in.DeepCopyInto(out)
|
||||
return out
|
||||
}
|
||||
|
||||
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
|
||||
func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) {
|
||||
*out = *in
|
||||
|
|
|
@ -22,6 +22,7 @@ go_library(
|
|||
"kubeproxy.go",
|
||||
"kubescheduler.go",
|
||||
"networking.go",
|
||||
"nodeproblemdetector.go",
|
||||
"nodeterminationhandler.go",
|
||||
"openstack.go",
|
||||
],
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
/*
|
||||
Copyright 2021 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package components
|
||||
|
||||
import (
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
"k8s.io/kops/pkg/apis/kops"
|
||||
"k8s.io/kops/upup/pkg/fi"
|
||||
"k8s.io/kops/upup/pkg/fi/loader"
|
||||
)
|
||||
|
||||
// NodeProblemDetectorOptionsBuilder adds options for the node problem detector to the model.
// Its BuildOptions method fills in default values for any field of the cluster
// spec's NodeProblemDetector section that was left unset.
|
||||
type NodeProblemDetectorOptionsBuilder struct {
|
||||
// Embedded shared builder context; BuildOptions in this file does not read it.
*OptionsContext
|
||||
}
|
||||
|
||||
// Compile-time assertion that *NodeProblemDetectorOptionsBuilder implements loader.OptionsBuilder.
var _ loader.OptionsBuilder = &NodeProblemDetectorOptionsBuilder{}
|
||||
|
||||
func (b *NodeProblemDetectorOptionsBuilder) BuildOptions(o interface{}) error {
|
||||
clusterSpec := o.(*kops.ClusterSpec)
|
||||
if clusterSpec.NodeProblemDetector == nil {
|
||||
return nil
|
||||
}
|
||||
npd := clusterSpec.NodeProblemDetector
|
||||
|
||||
if npd.Enabled == nil {
|
||||
npd.Enabled = fi.Bool(false)
|
||||
}
|
||||
|
||||
if npd.CPURequest == nil {
|
||||
defaultCPURequest := resource.MustParse("10m")
|
||||
npd.CPURequest = &defaultCPURequest
|
||||
}
|
||||
|
||||
if npd.MemoryRequest == nil {
|
||||
defaultMemoryRequest := resource.MustParse("80Mi")
|
||||
npd.MemoryRequest = &defaultMemoryRequest
|
||||
}
|
||||
|
||||
if npd.CPULimit == nil {
|
||||
defaultCPULimit := resource.MustParse("10m")
|
||||
npd.CPULimit = &defaultCPULimit
|
||||
}
|
||||
|
||||
if npd.MemoryLimit == nil {
|
||||
defaultMemoryLimit := resource.MustParse("80Mi")
|
||||
npd.MemoryLimit = &defaultMemoryLimit
|
||||
}
|
||||
|
||||
if npd.Image == nil {
|
||||
npd.Image = fi.String("k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.8")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -50,6 +50,7 @@ go_library(
|
|||
"cloudup/resources/addons/networking.cilium.io/k8s-1.16-v1.10.yaml.template",
|
||||
"cloudup/resources/addons/networking.cilium.io/k8s-1.12-v1.9.yaml.template",
|
||||
"cloudup/resources/addons/snapshot-controller.addons.k8s.io/k8s-1.20.yaml.template",
|
||||
"cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.17.yaml.template",
|
||||
],
|
||||
importpath = "k8s.io/kops/upup/models",
|
||||
visibility = ["//visibility:public"],
|
||||
|
|
|
@ -0,0 +1,188 @@
|
|||
{{ with .NodeProblemDetector }}
|
||||
# Sourced from https://github.com/kubernetes/node-problem-detector/tree/v0.8.8
|
||||
---
|
||||
# Source: node-problem-detector/deployment/node-problem-detector.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app: node-problem-detector
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: node-problem-detector
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: node-problem-detector
|
||||
spec:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/os
|
||||
operator: In
|
||||
values:
|
||||
- linux
|
||||
containers:
|
||||
- name: node-problem-detector
|
||||
command:
|
||||
- /node-problem-detector
|
||||
- --logtostderr
|
||||
- --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
|
||||
image: {{ .Image }}
|
||||
resources:
|
||||
limits:
|
||||
cpu: {{ .CPULimit }}
|
||||
memory: {{ .MemoryLimit }}
|
||||
requests:
|
||||
cpu: {{ .CPURequest }}
|
||||
memory: {{ .MemoryRequest }}
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
volumeMounts:
|
||||
- name: log
|
||||
mountPath: /var/log
|
||||
readOnly: true
|
||||
- name: kmsg
|
||||
mountPath: /dev/kmsg
|
||||
readOnly: true
|
||||
# Make sure node problem detector is in the same timezone
|
||||
# with the host.
|
||||
- name: localtime
|
||||
mountPath: /etc/localtime
|
||||
readOnly: true
|
||||
- name: config
|
||||
mountPath: /config
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: log
|
||||
# Config `log` to your system log directory
|
||||
hostPath:
|
||||
path: /var/log/
|
||||
- name: kmsg
|
||||
hostPath:
|
||||
path: /dev/kmsg
|
||||
- name: localtime
|
||||
hostPath:
|
||||
path: /etc/localtime
|
||||
- name: config
|
||||
configMap:
|
||||
name: node-problem-detector-config
|
||||
items:
|
||||
- key: kernel-monitor.json
|
||||
path: kernel-monitor.json
|
||||
- key: docker-monitor.json
|
||||
path: docker-monitor.json
|
||||
priorityClassName: system-node-critical
|
||||
tolerations:
|
||||
- effect: NoSchedule
|
||||
operator: Exists
|
||||
- effect: NoExecute
|
||||
operator: Exists
|
||||
---
|
||||
# Source: node-problem-detector/deployment/node-problem-detector-config.yaml
|
||||
apiVersion: v1
|
||||
data:
|
||||
kernel-monitor.json: |
|
||||
{
|
||||
"plugin": "kmsg",
|
||||
"logPath": "/dev/kmsg",
|
||||
"lookback": "5m",
|
||||
"bufferSize": 10,
|
||||
"source": "kernel-monitor",
|
||||
"conditions": [
|
||||
{
|
||||
"type": "KernelDeadlock",
|
||||
"reason": "KernelHasNoDeadlock",
|
||||
"message": "kernel has no deadlock"
|
||||
},
|
||||
{
|
||||
"type": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsNotReadOnly",
|
||||
"message": "Filesystem is not read-only"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "OOMKilling",
|
||||
"pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "TaskHung",
|
||||
"pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "UnregisterNetDevice",
|
||||
"pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "KernelOops",
|
||||
"pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "KernelOops",
|
||||
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "MemoryReadError",
|
||||
"pattern": "CE memory read error .*"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "AUFSUmountHung",
|
||||
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "DockerHung",
|
||||
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsReadOnly",
|
||||
"pattern": "Remounting filesystem read-only"
|
||||
}
|
||||
]
|
||||
}
|
||||
docker-monitor.json: |
|
||||
{
|
||||
"plugin": "journald",
|
||||
"pluginConfig": {
|
||||
"source": "dockerd"
|
||||
},
|
||||
"logPath": "/var/log/journal",
|
||||
"lookback": "5m",
|
||||
"bufferSize": 10,
|
||||
"source": "docker-monitor",
|
||||
"conditions": [],
|
||||
"rules": [
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "CorruptDockerImage",
|
||||
"pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*"
|
||||
}
|
||||
]
|
||||
}
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: node-problem-detector-config
|
||||
namespace: kube-system
|
||||
{{ end }}
|
|
@ -594,6 +594,27 @@ func (b *BootstrapChannelBuilder) buildAddons(c *fi.ModelBuilderContext) (*chann
|
|||
}
|
||||
}
|
||||
|
||||
npd := b.Cluster.Spec.NodeProblemDetector
|
||||
|
||||
if npd != nil && fi.BoolValue(npd.Enabled) {
|
||||
|
||||
key := "node-problem-detector.addons.k8s.io"
|
||||
version := "0.8.8"
|
||||
|
||||
{
|
||||
location := key + "/k8s-1.17.yaml"
|
||||
id := "k8s-1.17"
|
||||
|
||||
addons.Spec.Addons = append(addons.Spec.Addons, &channelsapi.AddonSpec{
|
||||
Name: fi.String(key),
|
||||
Version: fi.String(version),
|
||||
Selector: map[string]string{"k8s-addon": key},
|
||||
Manifest: fi.String(location),
|
||||
Id: id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if b.Cluster.Spec.AWSLoadBalancerController != nil && fi.BoolValue(b.Cluster.Spec.AWSLoadBalancerController.Enabled) {
|
||||
|
||||
key := "aws-load-balancer-controller.addons.k8s.io"
|
||||
|
|
|
@ -280,6 +280,7 @@ func (c *populateClusterSpec) run(clientset simple.Clientset) error {
|
|||
codeModels = append(codeModels, &components.DiscoveryOptionsBuilder{OptionsContext: optionsContext})
|
||||
codeModels = append(codeModels, &components.ClusterAutoscalerOptionsBuilder{OptionsContext: optionsContext})
|
||||
codeModels = append(codeModels, &components.NodeTerminationHandlerOptionsBuilder{OptionsContext: optionsContext})
|
||||
codeModels = append(codeModels, &components.NodeProblemDetectorOptionsBuilder{OptionsContext: optionsContext})
|
||||
codeModels = append(codeModels, &components.AWSEBSCSIDriverOptionsBuilder{OptionsContext: optionsContext})
|
||||
codeModels = append(codeModels, &components.AWSCloudControllerManagerOptionsBuilder{OptionsContext: optionsContext})
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue