Compare commits
No commits in common. "master" and "v0.8.8" have entirely different histories.
@@ -1,38 +0,0 @@
-version: 2
-updates:
-  - package-ecosystem: github-actions
-    directory: /
-    schedule:
-      interval: weekly
-    groups:
-      actions-all:
-        patterns:
-          - "*"
-    labels:
-      - "ok-to-test"
-
-  - package-ecosystem: docker
-    directory: /
-    schedule:
-      interval: weekly
-    labels:
-      - "ok-to-test"
-
-  - package-ecosystem: gomod
-    directories:
-      - /
-      - /test
-    schedule:
-      interval: weekly
-    ignore:
-      - dependency-name: "*"
-        update-types:
-          - "version-update:semver-major"
-          - "version-update:semver-minor"
-    groups:
-      k8s:
-        patterns:
-          - "k8s.io/*"
-          - "sigs.k8s.io/*"
-    labels:
-      - "ok-to-test"
@@ -1,78 +0,0 @@
-# For most projects, this workflow file will not need changing; you simply need
-# to commit it to your repository.
-#
-# You may wish to alter this file to override the set of languages analyzed,
-# or to provide custom queries or build logic.
-#
-# ******** NOTE ********
-# We have attempted to detect the languages in your repository. Please check
-# the `language` matrix defined below to confirm you have the correct set of
-# supported CodeQL languages.
-#
-name: "CodeQL"
-
-on:
-  push:
-    branches: ["master"]
-  pull_request:
-    # The branches below must be a subset of the branches above
-    branches: ["master"]
-  schedule:
-    - cron: "0 0 * * 1"
-
-permissions:
-  contents: read
-
-jobs:
-  analyze:
-    name: Analyze
-    runs-on: ubuntu-latest
-    permissions:
-      actions: read
-      contents: read
-      security-events: write
-
-    strategy:
-      fail-fast: false
-      matrix:
-        language: ["go"]
-        # CodeQL supports [ $supported-codeql-languages ]
-        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
-
-    steps:
-      - name: Harden Runner
-        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
-        with:
-          egress-policy: audit
-
-      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      # Initializes the CodeQL tools for scanning.
-      - name: Initialize CodeQL
-        uses: github/codeql-action/init@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
-        with:
-          languages: ${{ matrix.language }}
-          # If you wish to specify custom queries, you can do so here or in a config file.
-          # By default, queries listed here will override any specified in a config file.
-          # Prefix the list here with "+" to use these queries and those in the config file.
-
-      # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
-      # If this step fails, then you should remove it and run the build manually (see below)
-      - name: Autobuild
-        uses: github/codeql-action/autobuild@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
-
-      # ℹ️ Command-line programs to run using the OS shell.
-      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
-
-      # If the Autobuild fails above, remove it and uncomment the following three lines.
-      # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
-
-      # - run: |
-      #     echo "Run, Build Application using script"
-      #     ./location_of_script_within_repo/buildscript.sh
-
-      - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
-        with:
-          category: "/language:${{matrix.language}}"
@@ -1,27 +0,0 @@
-# Dependency Review Action
-#
-# This Action will scan dependency manifest files that change as part of a Pull Request,
-# surfacing known-vulnerable versions of the packages declared or updated in the PR.
-# Once installed, if the workflow run is marked as required,
-# PRs introducing known-vulnerable packages will be blocked from merging.
-#
-# Source repository: https://github.com/actions/dependency-review-action
-name: 'Dependency Review'
-on: [pull_request]
-
-permissions:
-  contents: read
-
-jobs:
-  dependency-review:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Harden Runner
-        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
-        with:
-          egress-policy: audit
-
-      - name: 'Checkout Repository'
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: 'Dependency Review'
-        uses: actions/dependency-review-action@da24556b548a50705dd671f47852072ea4c105d9 # v4.7.1
@@ -1,76 +0,0 @@
-# This workflow uses actions that are not certified by GitHub. They are provided
-# by a third-party and are governed by separate terms of service, privacy
-# policy, and support documentation.
-
-name: Scorecard supply-chain security
-on:
-  # For Branch-Protection check. Only the default branch is supported. See
-  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
-  branch_protection_rule:
-  # To guarantee Maintained check is occasionally updated. See
-  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
-  schedule:
-    - cron: '20 7 * * 2'
-  push:
-    branches: ["master"]
-
-# Declare default permissions as read only.
-permissions: read-all
-
-jobs:
-  analysis:
-    name: Scorecard analysis
-    runs-on: ubuntu-latest
-    permissions:
-      # Needed to upload the results to code-scanning dashboard.
-      security-events: write
-      # Needed to publish results and get a badge (see publish_results below).
-      id-token: write
-      contents: read
-      actions: read
-
-    steps:
-      - name: Harden Runner
-        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
-        with:
-          egress-policy: audit
-
-      - name: "Checkout code"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          persist-credentials: false
-
-      - name: "Run analysis"
-        uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2
-        with:
-          results_file: results.sarif
-          results_format: sarif
-          # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
-          # - you want to enable the Branch-Protection check on a *public* repository, or
-          # - you are installing Scorecards on a *private* repository
-          # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
-          # repo_token: ${{ secrets.SCORECARD_TOKEN }}
-
-          # Public repositories:
-          # - Publish results to OpenSSF REST API for easy access by consumers
-          # - Allows the repository to include the Scorecard badge.
-          # - See https://github.com/ossf/scorecard-action#publishing-results.
-          # For private repositories:
-          # - `publish_results` will always be set to `false`, regardless
-          #   of the value entered here.
-          publish_results: true
-
-      # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
-      # format to the repository Actions tab.
-      - name: "Upload artifact"
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: SARIF file
-          path: results.sarif
-          retention-days: 5
-
-      # Upload the results to GitHub's code scanning dashboard.
-      - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
-        with:
-          sarif_file: results.sarif
@@ -1,33 +0,0 @@
-name: tag-release
-
-on:
-  push:
-    branches:
-      - master
-    paths:
-      - version.txt
-
-permissions:
-  contents: read
-
-jobs:
-  tag:
-    if: ${{ github.repository == 'kubernetes/node-problem-detector' }}
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    steps:
-      - name: Harden Runner
-        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
-        with:
-          egress-policy: audit
-
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-      - run: /usr/bin/git config --global user.email actions@github.com
-      - run: /usr/bin/git config --global user.name 'GitHub Actions Release Tagger'
-      - run: hack/tag-release.sh
-        id: tag_release
-    outputs:
-      release_tag: ${{ steps.tag_release.outputs.release_tag }}
@@ -6,5 +6,3 @@ pr.env
 junit*.xml
 debug.test
 /output/
-coverage.out
-.idea/
@@ -1,18 +0,0 @@
-repos:
-  - repo: https://github.com/gitleaks/gitleaks
-    rev: v8.16.3
-    hooks:
-      - id: gitleaks
-  - repo: https://github.com/golangci/golangci-lint
-    rev: v1.52.2
-    hooks:
-      - id: golangci-lint
-  - repo: https://github.com/jumanjihouse/pre-commit-hooks
-    rev: 3.0.0
-    hooks:
-      - id: shellcheck
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
-    hooks:
-      - id: end-of-file-fixer
-      - id: trailing-whitespace
@@ -0,0 +1,33 @@
+os:
+  - linux
+sudo: required
+dist: xenial
+language: go
+go:
+  - "1.16"
+  - master
+env:
+  - GO111MODULE=on
+services:
+  - docker
+before_install:
+  - sudo apt-get -qq update
+  - sudo apt-get install -y libsystemd-dev
+install:
+  - mkdir -p $HOME/gopath/src/k8s.io
+  - mv $TRAVIS_BUILD_DIR $HOME/gopath/src/k8s.io/node-problem-detector
+  - cd $HOME/gopath/src/k8s.io/node-problem-detector
+script:
+  - make
+  - make test
+  - make clean && BUILD_TAGS="disable_custom_plugin_monitor" make
+  - BUILD_TAGS="disable_custom_plugin_monitor" make test
+  - make clean && BUILD_TAGS="disable_system_log_monitor" make
+  - BUILD_TAGS="disable_system_log_monitor" make test
+  - make clean && BUILD_TAGS="disable_system_stats_monitor" make
+  - BUILD_TAGS="disable_system_stats_monitor" make test
+  - make clean && BUILD_TAGS="disable_stackdriver_exporter" make
+  - BUILD_TAGS="disable_stackdriver_exporter" make test
+  - make clean && ENABLE_JOURNALD=0 make
+  - ENABLE_JOURNALD=0 make test
+  - ENABLE_JOURNALD=0 make build-binaries
CHANGELOG.md (10 changes)
@@ -29,7 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 - Windows build now supported.
 - Added metrics to retrieve stats such as `procs_running` and `procs_blocked`.
 - Added metrics to retrieve network stats.
-- Added metric to retrieve guest OS features such as unknown modules, ktd,
+- Added metric to retrieve guest OS features such as unknwon modules, ktd,
   and kernel integrity.

 ### Changed
@@ -158,7 +158,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).

 - Empty LogPath will now use journald's default path.
 - Systemd monitor now looks back 5 minutes.
-- Bumped base image to `registry.k8s.io/debian-base-amd64:1.0.0`.
+- Bumped base image to `k8s.gcr.io/debian-base-amd64:1.0.0`.
 - Updated the detection method for docker overlay2 issues.
 - Moved NPD into the kube-system namespace.
@@ -237,7 +237,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 - Added resource limites to NPD deployment.
 - Added log-counter to dockerfile.
 - Added `enable_message_change_based_condition_update` option to enable
-  condition update when messages change for custom plugin.
+  condition update when messages cahnge for custom plugin.

 ### Fixed
@@ -248,7 +248,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).

 ### Changed

-- Bumped base image to `registry.k8s.io/debian-base-amd64:0.4.0`.
+- Bumped base image to `k8s.gcr.io/debian-base-amd64:0.4.0`.

 ## [0.6.0] - 2018-11-27
@@ -277,7 +277,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).

 - Changed default port from 10256 to 20256 to avoid conflict with kube-proxy.
 - Bumped golang version from 1.8 to 1.9.
-- Bumped base image to `registry.k8s.io/debian-base-amd64:0.3`.
+- Bumped base image to `k8s.gcr.io/debian-base-amd64:0.3`.

 ### Fixed
@@ -14,7 +14,7 @@ If your repo has certain guidelines for contribution, put them here ahead of the

 - [Contributor License Agreement](https://git.k8s.io/community/CLA.md) Kubernetes projects require that you sign a Contributor License Agreement (CLA) before we can accept your pull requests
 - [Kubernetes Contributor Guide](http://git.k8s.io/community/contributors/guide) - Main contributor documentation, or you can just jump directly to the [contributing section](http://git.k8s.io/community/contributors/guide#contributing)
-- [Contributor Cheat Sheet](https://git.k8s.io/community/contributors/guide/contributor-cheatsheet/README.md) - Common resources for existing developers
+- [Contributor Cheat Sheet](https://git.k8s.io/community/contributors/guide/contributor-cheatsheet.md) - Common resources for existing developers

 ## Mentorship
@@ -28,4 +28,4 @@ Custom Information - if you're copying this template for the first time you can
 - [Slack channel](https://kubernetes.slack.com/messages/kubernetes-users) - Replace `kubernetes-users` with your slack channel string, this will send users directly to your channel.
 - [Mailing list](URL)

--->
+-->
Dockerfile (38 changes)
@@ -12,42 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# "builder-base" can be overriden using dockerb buildx's --build-context flag,
-# by users who want to use a different images for the builder. E.g. if you need to use an older OS
-# to avoid dependencies on very recent glibc versions.
-# E.g. of the param: --build-context builder-base=docker-image://golang:<something>@sha256:<something>
-# Must override builder-base, not builder, since the latter is referred to later in the file and so must not be
-# directly replaced. See here, and note that "stage" parameter mentioned there has been renamed to
-# "build-context": https://github.com/docker/buildx/pull/904#issuecomment-1005871838
-FROM golang:1.24-bookworm@sha256:00eccd446e023d3cd9566c25a6e6a02b90db3e1e0bbe26a48fc29cd96e800901 as builder-base
-FROM builder-base as builder
-LABEL maintainer="Andy Xie <andy.xning@gmail.com>"
+ARG BASEIMAGE
+FROM ${BASEIMAGE}

-ARG TARGETARCH
+MAINTAINER Random Liu <lantaol@google.com>

-ENV GOPATH /gopath/
-ENV PATH $GOPATH/bin:$PATH
-
-RUN apt-get update --fix-missing && apt-get --yes install libsystemd-dev gcc-aarch64-linux-gnu
-RUN go version
-
-COPY . /gopath/src/k8s.io/node-problem-detector/
-WORKDIR /gopath/src/k8s.io/node-problem-detector
-RUN GOARCH=${TARGETARCH} make bin/node-problem-detector bin/health-checker bin/log-counter
-
-FROM --platform=${TARGETPLATFORM} registry.k8s.io/build-image/debian-base:bookworm-v1.0.4@sha256:0a17678966f63e82e9c5e246d9e654836a33e13650a698adefede61bb5ca099e as base
-
-LABEL maintainer="Random Liu <lantaol@google.com>"
-
-RUN clean-install util-linux bash libsystemd-dev
+RUN clean-install util-linux libsystemd0 bash

 # Avoid symlink of /etc/localtime.
 RUN test -h /etc/localtime && rm -f /etc/localtime && cp /usr/share/zoneinfo/UTC /etc/localtime || true

-COPY --from=builder /gopath/src/k8s.io/node-problem-detector/bin/node-problem-detector /node-problem-detector
+COPY ./bin/node-problem-detector /node-problem-detector

 ARG LOGCOUNTER
-COPY --from=builder /gopath/src/k8s.io/node-problem-detector/bin/health-checker /gopath/src/k8s.io/node-problem-detector/${LOGCOUNTER} /home/kubernetes/bin/
+COPY ./bin/health-checker ${LOGCOUNTER} /home/kubernetes/bin/

-COPY --from=builder /gopath/src/k8s.io/node-problem-detector/config/ /config
-ENTRYPOINT ["/node-problem-detector", "--config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json"]
+COPY config /config
+ENTRYPOINT ["/node-problem-detector", "--config.system-log-monitor=/config/kernel-monitor.json"]
Makefile (140 changes)
@@ -17,16 +17,12 @@
 .PHONY: all \
 	vet fmt version test e2e-test \
 	build-binaries build-container build-tar build \
-	docker-builder build-in-docker \
-	push-container push-tar push release clean depup \
-	print-tar-sha-md5
+	docker-builder build-in-docker push-container push-tar push clean

 all: build

 # PLATFORMS is the set of OS_ARCH that NPD can build against.
-LINUX_PLATFORMS=linux_amd64 linux_arm64
-DOCKER_PLATFORMS=linux/amd64,linux/arm64
-PLATFORMS=$(LINUX_PLATFORMS) windows_amd64
+PLATFORMS=linux_amd64 windows_amd64

 # VERSION is the version of the binary.
 VERSION?=$(shell if [ -d .git ]; then echo `git describe --tags --dirty`; else echo "UNKNOWN"; fi)
@@ -67,24 +63,21 @@ IMAGE:=$(REGISTRY)/node-problem-detector:$(TAG)
 # support needs libsystemd-dev or libsystemd-journal-dev.
 ENABLE_JOURNALD?=1

-ifeq ($(shell go env GOHOSTOS), darwin)
+ifeq ($(go env GOHOSTOS), darwin)
 	ENABLE_JOURNALD=0
-else ifeq ($(shell go env GOHOSTOS), windows)
+else ifeq ($(go env GOHOSTOS), windows)
 	ENABLE_JOURNALD=0
 endif

+# TODO(random-liu): Support different architectures.
+# The debian-base:v1.0.0 image built from kubernetes repository is based on
+# Debian Stretch. It includes systemd 232 with support for both +XZ and +LZ4
+# compression. +LZ4 is needed on some os distros such as COS.
+BASEIMAGE:=k8s.gcr.io/debian-base-amd64:v1.0.0
+
 # Disable cgo by default to make the binary statically linked.
 CGO_ENABLED:=0

-ifeq ($(GOARCH), arm64)
-	CC:=aarch64-linux-gnu-gcc
-else
-	CC:=x86_64-linux-gnu-gcc
-endif
-
-# Set default Go architecture to AMD64.
-GOARCH ?= amd64
-
 # Construct the "-tags" parameter used by "go build".
 BUILD_TAGS?=
@@ -108,15 +101,15 @@ ifeq ($(ENABLE_JOURNALD), 1)
 	CGO_ENABLED:=1
 	LOGCOUNTER=./bin/log-counter
 else
-# Hack: Don't copy over log-counter, use a wildcard path that shouldn't match
+# Hack: Don't copy over log-counter, use a wildcard path that shouldnt match
 # anything in COPY command.
 	LOGCOUNTER=*dont-include-log-counter
 endif

 vet:
-	go list -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./... | \
+	GO111MODULE=on go list -mod vendor -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./... | \
 		grep -v "./vendor/*" | \
-		xargs go vet -tags "$(HOST_PLATFORM_BUILD_TAGS)"
+		GO111MODULE=on xargs go vet -mod vendor -tags "$(HOST_PLATFORM_BUILD_TAGS)"

 fmt:
 	find . -type f -name "*.go" | grep -v "./vendor/*" | xargs gofmt -s -w -l
@@ -130,13 +123,12 @@ ifeq ($(ENABLE_JOURNALD), 1)
 	BINARIES_LINUX_ONLY += bin/log-counter
 endif

-ALL_BINARIES = $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), ./$(binary)) \
-	$(foreach platform, $(LINUX_PLATFORMS), $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), output/$(platform)/$(binary))) \
-	$(foreach binary, $(BINARIES), output/windows_amd64/$(binary).exe)
+ALL_BINARIES = $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), ./$(binary)) $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), output/linux_amd64/$(binary)) $(foreach binary, $(BINARIES), output/windows_amd64/$(binary).exe)
 ALL_TARBALLS = $(foreach platform, $(PLATFORMS), $(NPD_NAME_VERSION)-$(platform).tar.gz)

 output/windows_amd64/bin/%.exe: $(PKG_SOURCES)
-	GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) go build \
+	GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
+		-mod vendor \
 		-o $@ \
 		-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
 		-tags "$(WINDOWS_BUILD_TAGS)" \
@@ -144,15 +136,15 @@ output/windows_amd64/bin/%.exe: $(PKG_SOURCES)
 	touch $@

 output/windows_amd64/test/bin/%.exe: $(PKG_SOURCES)
-	cd test && \
-	GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) go build \
-		-o ../$@ \
+	GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
+		-mod vendor \
+		-o $@ \
 		-tags "$(WINDOWS_BUILD_TAGS)" \
-		./e2e/$(subst -,,$*)
+		./test/e2e/$(subst -,,$*)

 output/linux_amd64/bin/%: $(PKG_SOURCES)
-	GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) \
-	CC=x86_64-linux-gnu-gcc go build \
+	GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
+		-mod vendor \
 		-o $@ \
 		-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
 		-tags "$(LINUX_BUILD_TAGS)" \
@@ -160,34 +152,17 @@ output/linux_amd64/bin/%: $(PKG_SOURCES)
 	touch $@

 output/linux_amd64/test/bin/%: $(PKG_SOURCES)
-	cd test && \
-	GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) \
-	CC=x86_64-linux-gnu-gcc go build \
-		-o ../$@ \
-		-tags "$(LINUX_BUILD_TAGS)" \
-		./e2e/$(subst -,,$*)
-
-output/linux_arm64/bin/%: $(PKG_SOURCES)
-	GOOS=linux GOARCH=arm64 CGO_ENABLED=$(CGO_ENABLED) \
-	CC=aarch64-linux-gnu-gcc go build \
+	GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
+		-mod vendor \
 		-o $@ \
 		-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
 		-tags "$(LINUX_BUILD_TAGS)" \
-		./cmd/$(subst -,,$*)
-	touch $@
-
-output/linux_arm64/test/bin/%: $(PKG_SOURCES)
-	cd test && \
-	GOOS=linux GOARCH=arm64 CGO_ENABLED=$(CGO_ENABLED) \
-	CC=aarch64-linux-gnu-gcc go build \
-		-o ../$@ \
-		-tags "$(LINUX_BUILD_TAGS)" \
-		./e2e/$(subst -,,$*)
+		./test/e2e/$(subst -,,$*)

 # In the future these targets should be deprecated.
 ./bin/log-counter: $(PKG_SOURCES)
 ifeq ($(ENABLE_JOURNALD), 1)
-	CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
+	CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
+		-mod vendor \
 		-o bin/log-counter \
 		-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
 		-tags "$(LINUX_BUILD_TAGS)" \
@@ -197,37 +172,38 @@ else
 endif

 ./bin/node-problem-detector: $(PKG_SOURCES)
-	CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
+	CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
+		-mod vendor \
 		-o bin/node-problem-detector \
 		-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
 		-tags "$(LINUX_BUILD_TAGS)" \
 		./cmd/nodeproblemdetector

 ./test/bin/problem-maker: $(PKG_SOURCES)
-	cd test && \
-	CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
-		-o bin/problem-maker \
+	CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
+		-mod vendor \
+		-o test/bin/problem-maker \
 		-tags "$(LINUX_BUILD_TAGS)" \
-		./e2e/problemmaker/problem_maker.go
+		./test/e2e/problemmaker/problem_maker.go

 ./bin/health-checker: $(PKG_SOURCES)
-	CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
+	CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
+		-mod vendor \
 		-o bin/health-checker \
 		-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
 		-tags "$(LINUX_BUILD_TAGS)" \
 		cmd/healthchecker/health_checker.go

 test: vet fmt
-	go test -timeout=1m -v -race -short -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./...
+	GO111MODULE=on go test -mod vendor -timeout=1m -v -race -short -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./...

 e2e-test: vet fmt build-tar
-	cd test && \
-	go run github.com/onsi/ginkgo/ginkgo -nodes=$(PARALLEL) -timeout=10m -v -tags "$(HOST_PLATFORM_BUILD_TAGS)" -stream \
-	./e2e/metriconly/... -- \
+	GO111MODULE=on ginkgo -nodes=$(PARALLEL) -mod vendor -timeout=10m -v -tags "$(HOST_PLATFORM_BUILD_TAGS)" -stream \
+	./test/e2e/metriconly/... -- \
 	-project=$(PROJECT) -zone=$(ZONE) \
 	-image=$(VM_IMAGE) -image-family=$(IMAGE_FAMILY) -image-project=$(IMAGE_PROJECT) \
 	-ssh-user=$(SSH_USER) -ssh-key=$(SSH_KEY) \
-	-npd-build-tar=`pwd`/../$(TARBALL) \
+	-npd-build-tar=`pwd`/$(TARBALL) \
 	-boskos-project-type=$(BOSKOS_PROJECT_TYPE) -job-name=$(JOB_NAME) \
 	-artifacts-dir=$(ARTIFACTS)
@@ -240,9 +216,8 @@ $(NPD_NAME_VERSION)-%.tar.gz: $(ALL_BINARIES) test/e2e-install.sh

 build-binaries: $(ALL_BINARIES)

-build-container: clean Dockerfile
-	docker buildx create --platform $(DOCKER_PLATFORMS) --use
-	docker buildx build --platform $(DOCKER_PLATFORMS) -t $(IMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .
+build-container: build-binaries Dockerfile
+	docker build -t $(IMAGE) --build-arg BASEIMAGE=$(BASEIMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .

 $(TARBALL): ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker
 	tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker
@@ -254,7 +229,7 @@ build-tar: $(TARBALL) $(ALL_TARBALLS)
 build: build-container build-tar

 docker-builder:
-	docker build -t npd-builder . --target=builder
+	docker build -t npd-builder ./builder

 build-in-docker: clean docker-builder
 	docker run \
@@ -262,46 +237,17 @@ build-in-docker: clean docker-builder
 		-c 'cd /gopath/src/k8s.io/node-problem-detector/ && make build-binaries'

 push-container: build-container
 	# So we can push to docker hub by setting REGISTRY
ifneq (,$(findstring gcr.io,$(REGISTRY)))
 	gcloud auth configure-docker
endif
-	# Build should be cached from build-container
-	docker buildx build --push --platform $(DOCKER_PLATFORMS) -t $(IMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .
+	docker push $(IMAGE)

 push-tar: build-tar
 	gsutil cp $(TARBALL) $(UPLOAD_PATH)/node-problem-detector/
-	gsutil cp node-problem-detector-$(VERSION)-*.tar.gz* $(UPLOAD_PATH)/node-problem-detector/

-# `make push` is used by presubmit and CI jobs.
 push: push-container push-tar

-# `make release` is used when releasing a new NPD version.
-release: push-container build-tar print-tar-sha-md5
-
-print-tar-sha-md5: build-tar
-	./hack/print-tar-sha-md5.sh $(VERSION)
-
-coverage.out:
-	rm -f coverage.out
-	go test -coverprofile=coverage.out -timeout=1m -v -short ./...
-
 clean:
 	rm -rf bin/
 	rm -rf test/bin/
 	rm -f node-problem-detector-*.tar.gz*
 	rm -rf output/
-	rm -f coverage.out
-
-.PHONY: gomod
-gomod:
-	go mod tidy
-	go mod vendor
-	cd test; go mod tidy
-
-.PHONY: goget
-goget:
-	go get $(shell go list -f '{{if not (or .Main .Indirect)}}{{.Path}}{{end}}' -mod=mod -m all)
-
-.PHONY: depup
-depup: goget gomod
OWNERS (10 changes)
@@ -1,14 +1,12 @@
 reviewers:
-  - sig-node-reviewers
+  - Random-Liu
+  - dchen1107
+  - andyxning
+  - wangzhen127
+  - xueweiz
   - vteratipally
-  - mmiranda96
-  - hakman
 approvers:
-  - sig-node-approvers
+  - Random-Liu
+  - dchen1107
+  - andyxning
+  - wangzhen127
+  - xueweiz
   - vteratipally
@@ -1,19 +0,0 @@
-aliases:
-  sig-node-approvers:
-    - Random-Liu
-    - dchen1107
-    - derekwaynecarr
-    - yujuhong
-    - sjenning
-    - mrunalp
-    - klueska
-    - SergeyKanzhelev
-    - tallclair
-  sig-node-reviewers:
-    - Random-Liu
-    - dchen1107
-    - derekwaynecarr
-    - yujuhong
-    - sjenning
-    - mrunalp
-    - klueska
README.md (88 changes)
@@ -7,11 +7,11 @@ layers in the cluster management stack.
 It is a daemon that runs on each node, detects node
 problems and reports them to apiserver.
 node-problem-detector can either run as a
-[DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) or run standalone.
+[DaemonSet](http://kubernetes.io/docs/admin/daemons/) or run standalone.
 Now it is running as a
 [Kubernetes Addon](https://github.com/kubernetes/kubernetes/tree/master/cluster/addons)
-enabled by default in the GKE cluster. It is also enabled by default in AKS as part of the
-[AKS Linux Extension](https://learn.microsoft.com/en-us/azure/aks/faq#what-is-the-purpose-of-the-aks-linux-extension-i-see-installed-on-my-linux-vmss-instances).
+enabled by default in the GCE cluster.

 # Background

 There are tons of node problems that could possibly affect the pods running on the
@@ -41,8 +41,8 @@ should be reported as `Event`.

 # Problem Daemon

-A problem daemon is a sub-daemon of node-problem-detector. It monitors specific
-kinds of node problems and reports them to node-problem-detector.
+A problem daemon is a sub-daemon of node-problem-detector. It monitors a specific
+kind of node problems and reports them to node-problem-detector.

 A problem daemon could be:
 * A tiny daemon designed for dedicated Kubernetes use-cases.
@@ -62,9 +62,9 @@ List of supported problem daemons types:
 | Problem Daemon Types | NodeCondition | Description | Configs | Disabling Build Tag |
 |----------------|:---------------:|:------------|:--------|:--------------------|
 | [SystemLogMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemlogmonitor) | KernelDeadlock ReadonlyFilesystem FrequentKubeletRestart FrequentDockerRestart FrequentContainerdRestart | A system log monitor monitors system log and reports problems and metrics according to predefined rules. | [filelog](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor-filelog.json), [kmsg](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json), [kernel](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor-counter.json) [abrt](https://github.com/kubernetes/node-problem-detector/blob/master/config/abrt-adaptor.json) [systemd](https://github.com/kubernetes/node-problem-detector/blob/master/config/systemd-monitor-counter.json) | disable_system_log_monitor
-| [SystemStatsMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor) | None(Could be added in the future) | A system stats monitor for node-problem-detector to collect various health-related system stats as metrics. See the proposal [here](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit). | [system-stats-monitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json) | disable_system_stats_monitor
+| [SystemStatsMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor) | None(Could be added in the future) | A system stats monitor for node-problem-detector to collect various health-related system stats as metrics. See the proposal [here](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit). | | disable_system_stats_monitor
 | [CustomPluginMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/custompluginmonitor) | On-demand(According to users configuration), existing example: NTPProblem | A custom plugin monitor for node-problem-detector to invoke and check various node problems with user-defined check scripts. See the proposal [here](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#). | [example](https://github.com/kubernetes/node-problem-detector/blob/4ad49bbd84b8ced45ac825eac01ec93d9235935e/config/custom-plugin-monitor.json) | disable_custom_plugin_monitor
-| [HealthChecker](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/healthchecker) | KubeletUnhealthy ContainerRuntimeUnhealthy| A health checker for node-problem-detector to check kubelet and container runtime health. | [kubelet](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-kubelet.json) [docker](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-docker.json) [containerd](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-containerd.json) |
+| [HealthChecker](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/healthchecker) | KubeletUnhealthy ContainerRuntimeUnhealthy| A health checker for node-problem-detector to check kubelet and container runtime health. | [kubelet](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-kubelet.json) [docker](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-docker.json) |

 # Exporter
@@ -102,14 +102,9 @@ certain backends. Some of them can be disabled at compile-time using a build tag

 * `--config.custom-plugin-monitor`: List of paths to custom plugin monitor config files, comma-separated, e.g.
   [config/custom-plugin-monitor.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json).
-  Node problem detector will start a separate custom plugin monitor for each configuration. You can
+  Node problem detector will start a separate custom plugin monitor for each configuration. You can 
   use different custom plugin monitors to monitor different node problems.

-#### For Health Checkers
-
-Health checkers are configured as custom plugins, using the config/health-checker-*.json config files.
-
 #### For Kubernetes exporter

 * `--enable-k8s-exporter`: Enables reporting to Kubernetes API server, default to `true`.
@@ -142,12 +137,12 @@ For example, to run without auth, use the following config:

 ## Build Image

-* Install development dependencies for `libsystemd` and the ARM GCC toolchain
-  * Debian/Ubuntu: `apt install libsystemd-dev gcc-aarch64-linux-gnu`
+* `go get` or `git clone` node-problem-detector repo into `$GOPATH/src/k8s.io` or `$GOROOT/src/k8s.io`
+  with one of the below directions:
+  * `cd $GOPATH/src/k8s.io && git clone git@github.com:kubernetes/node-problem-detector.git`
+  * `cd $GOPATH/src/k8s.io && go get k8s.io/node-problem-detector`

-* `git clone git@github.com:kubernetes/node-problem-detector.git`
-
-* Run `make` in the top directory. It will:
+* run `make` in the top directory. It will:
   * Build the binary.
   * Build the docker image. The binary and `config/` are copied into the docker image.
@@ -163,6 +158,11 @@ and [System Stats Monitor](https://github.com/kubernetes/node-problem-detector/t
 Check out the [Problem Daemon](https://github.com/kubernetes/node-problem-detector#problem-daemon) section
 to see how to disable each problem daemon during compilation time.

+**Note**:
+By default, node-problem-detector will be built with systemd support with the `make` command. This requires systemd develop files.
+You should download the systemd develop files first. For Ubuntu, the `libsystemd-journal-dev` package should
+be installed. For Debian, the `libsystemd-dev` package should be installed.
+
 ## Push Image

 `make push` uploads the docker image to a registry. By default, the image will be uploaded to
@@ -175,7 +175,7 @@ The easiest way to install node-problem-detector into your cluster is to use the

 ```
 helm repo add deliveryhero https://charts.deliveryhero.io/
-helm install --generate-name deliveryhero/node-problem-detector
+helm install deliveryhero/node-problem-detector
 ```

 Alternatively, to install node-problem-detector manually:
@@ -184,13 +184,9 @@ Alternatively, to install node-problem-detector manually:

 2. Edit [node-problem-detector-config.yaml](deployment/node-problem-detector-config.yaml) to configure node-problem-detector.

-3. Edit [rbac.yaml](deployment/rbac.yaml) to fit your environment.
+3. Create the ConfigMap with `kubectl create -f node-problem-detector-config.yaml`.

-4. Create the ServiceAccount and ClusterRoleBinding with `kubectl create -f rbac.yaml`.
-
-4. Create the ConfigMap with `kubectl create -f node-problem-detector-config.yaml`.
-
-5. Create the DaemonSet with `kubectl create -f node-problem-detector.yaml`.
+3. Create the DaemonSet with `kubectl create -f node-problem-detector.yaml`.

 ## Start Standalone
@@ -218,7 +214,7 @@ To develop NPD on Windows you'll need to setup your Windows machine for Go devel
 * [Go](https://golang.org/)
 * [Visual Studio Code](https://code.visualstudio.com/)
 * [Make](http://gnuwin32.sourceforge.net/packages/make.htm)
-* [mingw-64 WinBuilds](http://mingw-w64.org/downloads)
+* [mingw-64 WinBuilds](http://mingw-w64.org/doku.php/download/win-builds)
   * Tested with x86-64 Windows Native mode.
 * Add the `$InstallDir\bin` to [Windows `PATH` variable](https://answers.microsoft.com/en-us/windows/forum/windows_10-other_settings-winpc/adding-path-variable/97300613-20cb-4d85-8d0e-cc9d3549ba23).
@@ -226,16 +222,16 @@ To develop NPD on Windows you'll need to setup your Windows machine for Go devel
 # Run these commands in the node-problem-detector directory.

 # Build in MINGW64 Window
-make clean ENABLE_JOURNALD=0 build-binaries
+make clean windows-binaries

 # Test in MINGW64 Window
 make test

 # Run with containerd log monitoring enabled in Command Prompt. (Assumes containerd is installed.)
-%CD%\output\windows_amd64\bin\node-problem-detector.exe --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=%CD%\config\windows-containerd-monitor-filelog.json --config.system-stats-monitor=config\windows-system-stats-monitor.json
+%CD%\output\windows_amd64\node-problem-detector.exe --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=%CD%\config\windows-containerd-monitor-filelog.json --config.system-stats-monitor=config\windows-system-stats-monitor.json

 # Configure NPD to run as a Windows Service
-sc.exe create NodeProblemDetector binpath= "%CD%\node-problem-detector.exe [FLAGS]" start= demand
+sc.exe create NodeProblemDetector binpath= "%CD%\node-problem-detector.exe [FLAGS]" start= demand 
 sc.exe failure NodeProblemDetector reset= 0 actions= restart/10000
 sc.exe start NodeProblemDetector
 ```
@@ -268,9 +264,9 @@ For example, to test [KernelMonitor](https://github.com/kubernetes/node-problem-
 node-problem-detector uses [go modules](https://github.com/golang/go/wiki/Modules)
 to manage dependencies. Therefore, building node-problem-detector requires
 golang 1.11+. It still uses vendoring. See the
-[Kubernetes go modules KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-architecture/917-go-modules#alternatives-to-vendoring-using-go-modules)
+[Kubernetes go modules KEP](https://github.com/kubernetes/enhancements/blob/master/keps/sig-architecture/2019-03-19-go-modules.md#alternatives-to-vendoring-using-go-modules)
 for the design decisions. To add a new dependency, update [go.mod](go.mod) and
-run `go mod vendor`.
+run `GO111MODULE=on go mod vendor`.

 # Remedy Systems
@@ -279,26 +275,30 @@ detected by the node-problem-detector. Remedy systems observe events and/or node
 conditions emitted by the node-problem-detector and take action to return the
 Kubernetes cluster to a healthy state. The following remedy systems exist:

+* [**Draino**](https://github.com/planetlabs/draino) automatically drains Kubernetes
+  nodes based on labels and node conditions. Nodes that match _all_ of the supplied
+  labels and _any_ of the supplied node conditions will be prevented from accepting
+  new pods (aka 'cordoned') immediately, and
+  [drained](https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/)
+  after a configurable time. Draino can be used in conjunction with the
+  [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler)
+  to automatically terminate drained nodes. Refer to
+  [this issue](https://github.com/kubernetes/node-problem-detector/issues/199)
+  for an example production use case for Draino.
 * [**Descheduler**](https://github.com/kubernetes-sigs/descheduler) strategy RemovePodsViolatingNodeTaints
   evicts pods violating NoSchedule taints on nodes. The k8s scheduler's TaintNodesByCondition feature must
   be enabled. The [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler)
   can be used to automatically terminate drained nodes.
-* [**mediK8S**](https://github.com/medik8s) is an umbrella project for automatic remediation
-  system build on [Node Health Check Operator (NHC)](https://github.com/medik8s/node-healthcheck-operator) that monitors
-  node conditions and delegates remediation to external remediators using the Remediation API.[Poison-Pill](https://github.com/medik8s/poison-pill)
-  is a remediator that will reboot the node and make sure all statefull workloads are rescheduled. NHC supports conditionally remediating if the cluster
-  has enough healthy capacity, or manually pausing any action to minimze cluster disruption.
-* [**MachineHealthCheck**](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-health-check) of [Cluster API](https://cluster-api.sigs.k8s.io/) are responsible for remediating unhealthy Machines.

 # Testing

 NPD is tested via unit tests, [NPD e2e tests](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/README.md), Kubernetes e2e tests and Kubernetes nodes e2e tests. Prow handles the [pre-submit tests](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/node-problem-detector/node-problem-detector-presubmits.yaml) and [CI tests](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/node-problem-detector/node-problem-detector-ci.yaml).

 CI test results can be found below:
-1. [Unit tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-test)
-2. [NPD e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-test)
-3. [Kubernetes e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-kubernetes-gce-gci)
-4. [Kubernetes nodes e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-node)
+1. [Unit tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-test)
+2. [NPD e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-test)
+3. [Kubernetes e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-kubernetes-gce-gci)
+4. [Kubernetes nodes e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-node)

 ## Running tests
|
|||
|
||||
[Problem maker](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/problemmaker/README.md) is a program used in NPD e2e tests to generate/simulate node problems. It is ONLY intended to be used by NPD e2e tests. Please do NOT run it on your workstation, as it could cause real node problems.
|
||||
|
||||
# Compatibility
|
||||
|
||||
Node problem detector's architecture has been fairly stable. Recent versions (v0.8.13+) should be able to work with any supported kubernetes versions.
|
||||
|
||||
# Docs
|
||||
|
||||
* [Custom plugin monitor](docs/custom_plugin_monitor.md)
|
||||
|
@ -324,4 +320,4 @@ Node problem detector's architecture has been fairly stable. Recent versions (v0
|
|||
* [Slides](https://docs.google.com/presentation/d/1bkJibjwWXy8YnB5fna6p-Ltiy-N5p01zUsA22wCNkXA/edit?usp=sharing)
|
||||
* [Plugin Interface Proposal](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#)
|
||||
* [Addon Manifest](https://github.com/kubernetes/kubernetes/tree/master/cluster/addons/node-problem-detector)
|
||||
* [Metrics Mode Proposal](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit)
|
||||
* [Metrics Mode Proposal](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit)
|
|
@@ -0,0 +1,25 @@
+# Copyright 2018 The Kubernetes Authors. All rights reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM golang:1.11.0
+LABEL maintainer="Andy Xie <andy.xning@gmail.com>"
+
+ENV GOPATH /gopath/
+ENV PATH $GOPATH/bin:$PATH
+
+RUN apt-get update && apt-get --yes install libsystemd-dev
+RUN go version
+RUN go get github.com/tools/godep
+RUN godep version
+CMD ["/bin/bash"]
@@ -1,26 +0,0 @@
-# See https://cloud.google.com/cloud-build/docs/build-config

-# this must be specified in seconds. If omitted, defaults to 600s (10 mins)
-timeout: 3600s
-options:
-  # job builds a multi-arch docker image for amd64 and arm64
-  machineType: E2_HIGHCPU_8
-steps:
-  - name: 'gcr.io/k8s-staging-test-infra/gcb-docker-gcloud:v20230623-56e06d7c18'
-    entrypoint: bash
-    env:
-      - PROW_GIT_TAG=$_GIT_TAG
-      - PULL_BASE_REF=$_PULL_BASE_REF
-      - VERSION=$_PULL_BASE_REF
-      - DOCKER_CLI_EXPERIMENTAL=enabled
-    args:
-      - -c
-      - |
-        echo "Building/Pushing NPD containers"
-        apk add musl-dev gcc
-        make push-container
-substitutions:
-  # _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and
-  # can be used as a substitution
-  _GIT_TAG: 'PLACE_HOLDER'
-  _PULL_BASE_REF: 'master'
@@ -23,24 +23,17 @@ import (

 	"github.com/spf13/pflag"

-	"k8s.io/klog/v2"
 	"k8s.io/node-problem-detector/cmd/healthchecker/options"
 	"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
 	"k8s.io/node-problem-detector/pkg/healthchecker"
 )

 func main() {
-	klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
-	klog.InitFlags(klogFlags)
-	klogFlags.VisitAll(func(f *flag.Flag) {
-		switch f.Name {
-		case "v", "vmodule", "logtostderr":
-			flag.CommandLine.Var(f.Value, f.Name, f.Usage)
-		}
-	})
-	pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
-	pflag.CommandLine.MarkHidden("vmodule")
-	pflag.CommandLine.MarkHidden("logtostderr")
+	// Set glog flag so that it does not log to files.
+	if err := flag.Set("logtostderr", "true"); err != nil {
+		fmt.Printf("Failed to set logtostderr=true: %v", err)
+		os.Exit(int(types.Unknown))
+	}

 	hco := options.NewHealthCheckerOptions()
 	hco.AddFlags(pflag.CommandLine)
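
The master side of this hunk forwards a whitelisted subset of klog's standard-library flags into pflag so that one command line serves both flag sets. A minimal standalone sketch of that forwarding idiom (the flag names here are illustrative, not the repository's):

```go
package main

import (
	goflag "flag"
	"fmt"

	"github.com/spf13/pflag"
)

func main() {
	// A flag registered on the standard library's FlagSet.
	verbosity := goflag.Int("v", 0, "log verbosity level")
	// A flag registered directly on pflag.
	name := pflag.String("name", "npd", "component name")

	// Merge the stdlib flags into pflag so a single Parse handles both,
	// then hide one from --help, as the master side does with vmodule.
	pflag.CommandLine.AddGoFlagSet(goflag.CommandLine)
	pflag.CommandLine.MarkHidden("v")
	pflag.Parse()

	fmt.Println("v =", *verbosity, "name =", *name)
}
```
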
@@ -39,9 +39,7 @@ type HealthCheckerOptions struct {
 	EnableRepair       bool
 	CriCtlPath         string
 	CriSocketPath      string
-	CriTimeout         time.Duration
 	CoolDownTime       time.Duration
-	LoopBackTime       time.Duration
 	HealthCheckTimeout time.Duration
 	LogPatterns        types.LogPatternFlag
 }
@@ -63,12 +61,8 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
 		"The path to the crictl binary. This is used to check health of cri component.")
 	fs.StringVar(&hco.CriSocketPath, "cri-socket-path", types.DefaultCriSocketPath,
 		"The path to the cri socket. Used with crictl to specify the socket path.")
-	fs.DurationVar(&hco.CriTimeout, "cri-timeout", types.DefaultCriTimeout,
-		"The duration to wait for crictl to run.")
 	fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
 		"The duration to wait for the service to be up before attempting repair.")
-	fs.DurationVar(&hco.LoopBackTime, "loopback-time", types.DefaultLoopBackTime,
-		"The duration to loop back, if it is 0, health-check will check from start time.")
 	fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
 		"The time to wait before marking the component as unhealthy.")
 	fs.Var(&hco.LogPatterns, "log-pattern",
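
For reference, a hypothetical caller of this options type — assuming only the package path and the `--cooldown-time` flag visible in these hunks — would look roughly like:

```go
package main

import (
	"fmt"

	"github.com/spf13/pflag"

	"k8s.io/node-problem-detector/cmd/healthchecker/options"
)

func main() {
	// Register the health checker's flags on a dedicated FlagSet and
	// parse a sample command line; unset flags keep their defaults.
	fs := pflag.NewFlagSet("health-checker", pflag.ExitOnError)
	hco := options.NewHealthCheckerOptions()
	hco.AddFlags(fs)

	_ = fs.Parse([]string{"--cooldown-time=2m"})
	fmt.Println("cooldown:", hco.CoolDownTime)
}
```
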
@@ -1,4 +1,3 @@
-//go:build journald
 // +build journald

 /*
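
The only change here is dropping the `//go:build` line: Go 1.17+ tooling writes that form automatically and keeps the legacy `// +build` comment for older toolchains, while v0.8.8 predates it. A minimal file gated the same way (illustrative, not from the repository):

```go
//go:build journald
// +build journald

package main

import "fmt"

// main is compiled only when the "journald" build tag is set,
// e.g. `go run -tags journald .`
func main() {
	fmt.Println("built with journald support")
}
```
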
@@ -26,24 +25,17 @@ import (

 	"github.com/spf13/pflag"

-	"k8s.io/klog/v2"
 	"k8s.io/node-problem-detector/cmd/logcounter/options"
 	"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
 	"k8s.io/node-problem-detector/pkg/logcounter"
 )

 func main() {
-	klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
-	klog.InitFlags(klogFlags)
-	klogFlags.VisitAll(func(f *flag.Flag) {
-		switch f.Name {
-		case "v", "vmodule", "logtostderr":
-			flag.CommandLine.Var(f.Value, f.Name, f.Usage)
-		}
-	})
-	pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
-	pflag.CommandLine.MarkHidden("vmodule")
-	pflag.CommandLine.MarkHidden("logtostderr")
+	// Set glog flag so that it does not log to files.
+	if err := flag.Set("logtostderr", "true"); err != nil {
+		fmt.Printf("Failed to set logtostderr=true: %v", err)
+		os.Exit(int(types.Unknown))
+	}

 	fedo := options.NewLogCounterOptions()
 	fedo.AddFlags(pflag.CommandLine)
@@ -34,7 +34,6 @@ type LogCounterOptions struct {
 	Lookback      string
 	Delay         string
 	Pattern       string
-	RevertPattern string
 	Count         int
 }
@@ -47,8 +46,6 @@ func (fedo *LogCounterOptions) AddFlags(fs *pflag.FlagSet) {
 		"The time duration log watcher delays after node boot time. This is useful when log watcher needs to wait for some time until the node is stable.")
 	fs.StringVar(&fedo.Pattern, "pattern", "",
 		"The regular expression to match the problem in log. The pattern must match to the end of the line.")
-	fs.StringVar(&fedo.RevertPattern, "revert-pattern", "",
-		"Similar to --pattern but conversely it decreases count value for every match. This is useful to discount a log when another log occurs.")
 	fs.IntVar(&fedo.Count, "count", 1,
 		"The number of times the pattern must be found to trigger the condition")
 }
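
The `--revert-pattern` flag present only on the master side decrements the count that `--pattern` increments, so a recovery log line can cancel out an earlier failure line. An illustrative sketch of those semantics (not the repository's implementation):

```go
package main

import (
	"fmt"
	"regexp"
)

// countMatches raises the count for each line matching pattern and lowers
// it for each line matching revertPattern (when one is given).
func countMatches(lines []string, pattern, revertPattern string) int {
	re := regexp.MustCompile(pattern)
	var revert *regexp.Regexp
	if revertPattern != "" {
		revert = regexp.MustCompile(revertPattern)
	}
	count := 0
	for _, line := range lines {
		if re.MatchString(line) {
			count++
		}
		if revert != nil && revert.MatchString(line) {
			count--
		}
	}
	return count
}

func main() {
	logs := []string{
		"docker start failed",
		"docker start succeeded",
		"docker start failed",
	}
	// One failure is discounted by the success line: prints 1.
	fmt.Println(countMatches(logs, "start failed", "start succeeded"))
}
```
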
@@ -1,4 +1,3 @@
-//go:build !disable_stackdriver_exporter
 // +build !disable_stackdriver_exporter

 /*
@@ -17,9 +17,7 @@ limitations under the License.
 package main

 import (
-	"context"
-
-	"k8s.io/klog/v2"
+	"github.com/golang/glog"

 	_ "k8s.io/node-problem-detector/cmd/nodeproblemdetector/exporterplugins"
 	_ "k8s.io/node-problem-detector/cmd/nodeproblemdetector/problemdaemonplugins"
@@ -33,7 +31,16 @@ import (
 	"k8s.io/node-problem-detector/pkg/version"
 )

-func npdMain(ctx context.Context, npdo *options.NodeProblemDetectorOptions) error {
+func npdInteractive(npdo *options.NodeProblemDetectorOptions) {
+	termCh := make(chan error, 1)
+	defer close(termCh)
+
+	if err := npdMain(npdo, termCh); err != nil {
+		glog.Fatalf("Problem detector failed with error: %v", err)
+	}
+}
+
+func npdMain(npdo *options.NodeProblemDetectorOptions, termCh <-chan error) error {
 	if npdo.PrintVersion {
 		version.PrintVersion()
 		return nil
@@ -46,18 +53,18 @@ func npdMain(ctx context.Context, npdo *options.NodeProblemDetectorOptions) erro
 	// Initialize problem daemons.
 	problemDaemons := problemdaemon.NewProblemDaemons(npdo.MonitorConfigPaths)
 	if len(problemDaemons) == 0 {
-		klog.Fatalf("No problem daemon is configured")
+		glog.Fatalf("No problem daemon is configured")
 	}

 	// Initialize exporters.
 	defaultExporters := []types.Exporter{}
-	if ke := k8sexporter.NewExporterOrDie(ctx, npdo); ke != nil {
+	if ke := k8sexporter.NewExporterOrDie(npdo); ke != nil {
 		defaultExporters = append(defaultExporters, ke)
-		klog.Info("K8s exporter started.")
+		glog.Info("K8s exporter started.")
 	}
 	if pe := prometheusexporter.NewExporterOrDie(npdo); pe != nil {
 		defaultExporters = append(defaultExporters, pe)
-		klog.Info("Prometheus exporter started.")
+		glog.Info("Prometheus exporter started.")
 	}

 	plugableExporters := exporters.NewExporters()
@@ -67,10 +74,10 @@ func npdMain(ctx context.Context, npdo *options.NodeProblemDetectorOptions) erro
 	npdExporters = append(npdExporters, plugableExporters...)

 	if len(npdExporters) == 0 {
-		klog.Fatalf("No exporter is successfully setup")
+		glog.Fatalf("No exporter is successfully setup")
 	}

 	// Initialize NPD core.
 	p := problemdetector.NewProblemDetector(problemDaemons, npdExporters)
-	return p.Run(ctx)
+	return p.Run(termCh)
 }
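
These hunks carry the main behavioural difference in the comparison: master's `npdMain` takes a `context.Context` and stops when it is cancelled, while v0.8.8's takes a termination channel and stops when an error arrives. A self-contained sketch of the two shutdown idioms (function names are illustrative, not the repository's):

```go
package main

import (
	"context"
	"fmt"
)

// runWithContext blocks until the caller cancels the context (master's idiom).
func runWithContext(ctx context.Context) error {
	<-ctx.Done()
	return ctx.Err()
}

// runWithTermCh blocks until a termination value is sent (v0.8.8's idiom).
func runWithTermCh(termCh <-chan error) error {
	return <-termCh
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cancel()
	fmt.Println(runWithContext(ctx)) // context.Canceled

	termCh := make(chan error, 1)
	termCh <- nil
	fmt.Println(runWithTermCh(termCh)) // <nil>
}
```
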
@@ -0,0 +1,30 @@
+/*
+Copyright 2021 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"github.com/spf13/pflag"
+	"k8s.io/node-problem-detector/cmd/options"
+)
+
+func main() {
+	npdo := options.NewNodeProblemDetectorOptions()
+	npdo.AddFlags(pflag.CommandLine)
+
+	pflag.Parse()
+	npdInteractive(npdo)
+}
@@ -1,4 +1,3 @@
//go:build !disable_system_log_monitor
// +build !disable_system_log_monitor

/*

@@ -20,8 +19,9 @@ limitations under the License.
package main

import (
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"os"
	"strings"
	"testing"

@@ -81,22 +81,24 @@ func TestNPDMain(t *testing.T) {
	npdo, cleanup := setupNPD(t)
	defer cleanup()

	ctx, cancelFunc := context.WithCancel(context.Background())
	cancelFunc()
	if err := npdMain(ctx, npdo); err != nil {
	termCh := make(chan error, 2)
	termCh <- errors.New("close")
	defer close(termCh)

	if err := npdMain(npdo, termCh); err != nil {
		t.Errorf("termination signal should not return error got, %v", err)
	}
}

func writeTempFile(t *testing.T, ext string, contents string) (string, error) {
	f, err := os.CreateTemp("", "*."+ext)
	f, err := ioutil.TempFile("", "*."+ext)
	if err != nil {
		return "", fmt.Errorf("cannot create temp file, %v", err)
	}

	fileName := f.Name()

	if err := os.WriteFile(fileName, []byte(contents), 0644); err != nil {
	if err := ioutil.WriteFile(fileName, []byte(contents), 0644); err != nil {
		os.Remove(fileName)
		return "", fmt.Errorf("cannot write config to temp file %s, %v", fileName, err)
	}

@ -1,50 +0,0 @@
|
|||
//go:build unix
|
||||
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
)
|
||||
|
||||
func main() {
|
||||
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
|
||||
klog.InitFlags(klogFlags)
|
||||
klogFlags.VisitAll(func(f *flag.Flag) {
|
||||
switch f.Name {
|
||||
case "v", "vmodule", "logtostderr":
|
||||
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
|
||||
}
|
||||
})
|
||||
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
|
||||
pflag.CommandLine.MarkHidden("vmodule")
|
||||
pflag.CommandLine.MarkHidden("logtostderr")
|
||||
|
||||
npdo := options.NewNodeProblemDetectorOptions()
|
||||
npdo.AddFlags(pflag.CommandLine)
|
||||
|
||||
pflag.Parse()
|
||||
if err := npdMain(context.Background(), npdo); err != nil {
|
||||
klog.Fatalf("Problem detector failed with error: %v", err)
|
||||
}
|
||||
}
|
|
@@ -17,17 +17,16 @@ limitations under the License.
package main

import (
	"context"
	"flag"
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/golang/glog"
	"github.com/spf13/pflag"
	"golang.org/x/sys/windows/svc"
	"golang.org/x/sys/windows/svc/debug"
	"golang.org/x/sys/windows/svc/eventlog"
	"k8s.io/klog/v2"
	"k8s.io/node-problem-detector/cmd/options"
)

@@ -44,18 +43,6 @@ var (
)

func main() {
	klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
	klog.InitFlags(klogFlags)
	klogFlags.VisitAll(func(f *flag.Flag) {
		switch f.Name {
		case "v", "vmodule", "logtostderr":
			flag.CommandLine.Var(f.Value, f.Name, f.Usage)
		}
	})
	pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
	pflag.CommandLine.MarkHidden("vmodule")
	pflag.CommandLine.MarkHidden("logtostderr")

	npdo := options.NewNodeProblemDetectorOptions()
	npdo.AddFlags(pflag.CommandLine)

@@ -75,7 +62,7 @@ func main() {
func isRunningAsWindowsService() bool {
	runningAsService, err := svc.IsWindowsService()
	if err != nil {
		klog.Errorf("cannot determine if running as Windows Service assuming standalone, %v", err)
		glog.Errorf("cannot determine if running as Windows Service assuming standalone, %v", err)
		return false
	}
	return runningAsService

@@ -115,20 +102,26 @@ type npdService struct {
}

func (s *npdService) Execute(args []string, r <-chan svc.ChangeRequest, changes chan<- svc.Status) (bool, uint32) {
	appTermCh := make(chan error, 1)
	svcLoopTermCh := make(chan error, 1)
	defer func() {
		close(appTermCh)
		close(svcLoopTermCh)
	}()

	changes <- svc.Status{State: svc.StartPending}
	changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
	var appWG sync.WaitGroup
	var svcWG sync.WaitGroup

	options := s.options
	ctx, cancelFunc := context.WithCancel(context.Background())

	// NPD application goroutine.
	appWG.Add(1)
	go func() {
		defer appWG.Done()

		if err := npdMain(ctx, options); err != nil {
		if err := npdMain(options, appTermCh); err != nil {
			elog.Warning(windowsEventLogID, err.Error())
		}

@@ -139,36 +132,16 @@ func (s *npdService) Execute(args []string, r <-chan svc.ChangeRequest, changes
	svcWG.Add(1)
	go func() {
		defer svcWG.Done()
		for {
			select {
			case <-ctx.Done():
				return
			case c := <-r:
				switch c.Cmd {
				case svc.Interrogate:
					changes <- c.CurrentStatus
					// Testing deadlock from https://code.google.com/p/winsvc/issues/detail?id=4
					time.Sleep(100 * time.Millisecond)
					changes <- c.CurrentStatus
				case svc.Stop, svc.Shutdown:
					elog.Info(windowsEventLogID, fmt.Sprintf("Stopping %s service, %v", svcName, c.Context))
					cancelFunc()
				case svc.Pause:
					elog.Info(windowsEventLogID, "ignoring pause command from Windows service control, not supported")
					changes <- svc.Status{State: svc.Paused, Accepts: svcCommandsAccepted}
				case svc.Continue:
					elog.Info(windowsEventLogID, "ignoring continue command from Windows service control, not supported")
					changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
				default:
					elog.Error(windowsEventLogID, fmt.Sprintf("unexpected control request #%d", c))
				}
			}
		}

		serviceLoop(r, changes, appTermCh, svcLoopTermCh)
	}()

	// Wait for the application go routine to die.
	appWG.Wait()

	// Ensure that the service control loop is killed.
	svcLoopTermCh <- nil

	// Wait for the service control loop to terminate.
	// Otherwise it's possible that the channel closures cause the application to panic.
	svcWG.Wait()

@@ -178,3 +151,31 @@ func (s *npdService) Execute(args []string, r <-chan svc.ChangeRequest, changes

	return false, uint32(0)
}

func serviceLoop(r <-chan svc.ChangeRequest, changes chan<- svc.Status, appTermCh chan error, svcLoopTermCh chan error) {
	for {
		select {
		case <-svcLoopTermCh:
			return
		case c := <-r:
			switch c.Cmd {
			case svc.Interrogate:
				changes <- c.CurrentStatus
				// Testing deadlock from https://code.google.com/p/winsvc/issues/detail?id=4
				time.Sleep(100 * time.Millisecond)
				changes <- c.CurrentStatus
			case svc.Stop, svc.Shutdown:
				elog.Info(windowsEventLogID, fmt.Sprintf("Stopping %s service, %v", svcName, c.Context))
				appTermCh <- errors.New("stopping service")
			case svc.Pause:
				elog.Info(windowsEventLogID, "ignoring pause command from Windows service control, not supported")
				changes <- svc.Status{State: svc.Paused, Accepts: svcCommandsAccepted}
			case svc.Continue:
				elog.Info(windowsEventLogID, "ignoring continue command from Windows service control, not supported")
				changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
			default:
				elog.Error(windowsEventLogID, fmt.Sprintf("unexpected control request #%d", c))
			}
		}
	}
}

@@ -1,4 +1,3 @@
//go:build !disable_system_log_monitor
// +build !disable_system_log_monitor

/*

@@ -1,4 +1,3 @@
//go:build !disable_custom_plugin_monitor
// +build !disable_custom_plugin_monitor

/*

@@ -1,4 +1,3 @@
//go:build !disable_system_log_monitor
// +build !disable_system_log_monitor

/*

@@ -1,4 +1,3 @@
//go:build !disable_system_stats_monitor
// +build !disable_system_stats_monitor

/*

@@ -43,10 +43,6 @@ type NodeProblemDetectorOptions struct {
	ServerPort int
	// ServerAddress is the address to bind the node problem detector server.
	ServerAddress string
	// QPS is the maximum QPS to the master from client.
	QPS float32
	// Burst is the maximum burst for throttle.
	Burst int

	// exporter options

@@ -65,10 +61,6 @@ type NodeProblemDetectorOptions struct {
	APIServerWaitInterval time.Duration
	// K8sExporterHeartbeatPeriod is the period at which the k8s exporter does forcibly sync with apiserver.
	K8sExporterHeartbeatPeriod time.Duration
	// K8sExporterWriteEvents determines whether to write Kubernetes Events for problems.
	K8sExporterWriteEvents bool
	// K8sExporterUpdateNodeConditions determines whether to update Kubernetes Node Conditions for problems.
	K8sExporterUpdateNodeConditions bool

	// prometheusExporter options
	// PrometheusServerPort is the port to bind the Prometheus scrape endpoint. Use 0 to disable.

@@ -121,8 +113,6 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) {
	fs.DurationVar(&npdo.APIServerWaitTimeout, "apiserver-wait-timeout", time.Duration(5)*time.Minute, "The timeout on waiting for kube-apiserver to be ready. This is ignored if --enable-k8s-exporter is false.")
	fs.DurationVar(&npdo.APIServerWaitInterval, "apiserver-wait-interval", time.Duration(5)*time.Second, "The interval between the checks on the readiness of kube-apiserver. This is ignored if --enable-k8s-exporter is false.")
	fs.DurationVar(&npdo.K8sExporterHeartbeatPeriod, "k8s-exporter-heartbeat-period", 5*time.Minute, "The period at which k8s-exporter does forcibly sync with apiserver.")
	fs.BoolVar(&npdo.K8sExporterWriteEvents, "k8s-exporter-write-events", true, "Whether to write Kubernetes Event objects with event details.")
	fs.BoolVar(&npdo.K8sExporterUpdateNodeConditions, "k8s-exporter-update-node-conditions", true, "Whether to update Kubernetes Node conditions with event details.")
	fs.BoolVar(&npdo.PrintVersion, "version", false, "Print version information and quit")
	fs.StringVar(&npdo.HostnameOverride, "hostname-override",
		"", "Custom node name used to override hostname")

@@ -135,8 +125,6 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) {
		20257, "The port to bind the Prometheus scrape endpoint. Prometheus exporter is enabled by default at port 20257. Use 0 to disable.")
	fs.StringVar(&npdo.PrometheusServerAddress, "prometheus-address",
		"127.0.0.1", "The address to bind the Prometheus scrape endpoint.")
	fs.Float32Var(&npdo.QPS, "kube-api-qps", 500, "Maximum QPS to use while talking with Kubernetes API")
	fs.IntVar(&npdo.Burst, "kube-api-burst", 500, "Maximum burst for throttle while talking with Kubernetes API")
	for _, exporterName := range exporters.GetExporterNames() {
		exporterHandler := exporters.GetExporterHandlerOrDie(exporterName)
		exporterHandler.Options.SetFlags(fs)

@@ -31,7 +31,7 @@
    },
    {
      "type": "temporary",
      "reason": "KernelOops",
      "reason": "Kerneloops",
      "pattern": "System encountered a non-fatal error in \\S+"
    }
  ]

@@ -1,28 +0,0 @@
{
  "plugin": "filelog",
  "pluginConfig": {
    "timestamp": "^.{15}",
    "message": "(?i)Currently unreadable.*sectors|(?i)Offline uncorrectable sectors",
    "timestampFormat": "Jan _2 15:04:05"
  },
  "logPath": "/var/log/messages",
  "lookback": "10h",
  "bufferSize": 1,
  "source": "disk-monitor",
  "skipList": [ " audit:", " audit[" ],
  "conditions": [
    {
      "type": "DiskBadBlock",
      "reason": "DiskBadBlock",
      "message": "Disk no bad block"
    },
  ],
  "rules": [
    {
      "type": "permanent",
      "condition": "DiskBadBlock",
      "reason": "DiskBadBlock",
      "pattern": ".*([1-9]\\d{2,}) (Currently unreadable.*sectors|Offline uncorrectable sectors).*"
    },
  ]
}

@@ -25,7 +25,6 @@
        "--component=kubelet",
        "--enable-repair=true",
        "--cooldown-time=1m",
        "--loopback-time=0",
        "--health-check-timeout=10s"
      ],
      "timeout": "3m"

@@ -1,20 +0,0 @@
{
  "plugin": "custom",
  "pluginConfig": {
    "invoke_interval": "86400s",
    "timeout": "5s",
    "max_output_length": 80,
    "concurrency": 1
  },
  "source": "iptables-mode-monitor",
  "metricsReporting": true,
  "conditions": [],
  "rules": [
    {
      "type": "temporary",
      "reason": "IPTablesVersionsMismatch",
      "path": "./config/plugin/iptables_mode.sh",
      "timeout": "5s"
    }
  ]
}

@@ -42,6 +42,12 @@
      "reason": "KernelOops",
      "pattern": "divide error: 0000 \\[#\\d+\\] SMP"
    },
    {
      "type": "permanent",
      "condition": "KernelDeadlock",
      "reason": "AUFSUmountHung",
      "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
    },
    {
      "type": "permanent",
      "condition": "KernelDeadlock",

@@ -12,14 +12,9 @@
      "message": "kernel has no deadlock"
    },
    {
      "type": "XfsShutdown",
      "reason": "XfsHasNotShutDown",
      "message": "XFS has not shutdown"
    },
    {
      "type": "CperHardwareErrorFatal",
      "reason": "CperHardwareHasNoFatalError",
      "message": "UEFI CPER has no fatal error"
      "type": "ReadonlyFilesystem",
      "reason": "FilesystemIsNotReadOnly",
      "message": "Filesystem is not read-only"
    }
  ],
  "rules": [

@@ -63,38 +58,28 @@
      "reason": "IOError",
      "pattern": "Buffer I/O error .*"
    },
    {
      "type": "permanent",
      "condition": "XfsShutdown",
      "reason": "XfsHasShutdown",
      "pattern": "XFS .* Shutting down filesystem.?"
    },
    {
      "type": "temporary",
      "reason": "MemoryReadError",
      "pattern": "CE memory read error .*"
    },
    {
      "type": "temporary",
      "reason": "CperHardwareErrorCorrected",
      "pattern": ".*\\[Hardware Error\\]: event severity: corrected$"
    },
    {
      "type": "temporary",
      "reason": "CperHardwareErrorRecoverable",
      "pattern": ".*\\[Hardware Error\\]: event severity: recoverable$"
    },
    {
      "type": "permanent",
      "condition": "CperHardwareErrorFatal",
      "reason": "CperHardwareErrorFatal",
      "pattern": ".*\\[Hardware Error\\]: event severity: fatal$"
      "condition": "KernelDeadlock",
      "reason": "AUFSUmountHung",
      "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
    },
    {
      "type": "permanent",
      "condition": "KernelDeadlock",
      "reason": "DockerHung",
      "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
    },
    {
      "type": "permanent",
      "condition": "ReadonlyFilesystem",
      "reason": "FilesystemIsReadOnly",
      "pattern": "Remounting filesystem read-only"
    }
  ]
}

@@ -1,6 +1,5 @@
{
  "net": {
    "excludeInterfaceRegexp": "^(cali|tunl|veth)",
    "metricsConfigs": {
      "net/rx_bytes": {
        "displayName": "net/rx_bytes"

@@ -20,7 +20,8 @@ if systemctl -q is-active "$SERVICE"; then
  echo "$SERVICE is running"
  exit $OK
else
  # Does not differentiate stopped/failed service from non-existent
  # Does not differenciate stopped/failed service from non-existent
  echo "$SERVICE is not running"
  exit $NONOK
fi

@@ -1,30 +0,0 @@
#!/bin/bash

# As of iptables 1.8, the iptables command line clients come in two different versions/modes: "legacy",
# which uses the kernel iptables API just like iptables 1.6 and earlier did, and "nft", which translates
# the iptables command-line API into the kernel nftables API.
# Because they connect to two different subsystems in the kernel, you cannot mix rules from different versions.
# Ref: https://github.com/kubernetes-sigs/iptables-wrappers

readonly OK=0
readonly NONOK=1
readonly UNKNOWN=2

# based on: https://github.com/kubernetes-sigs/iptables-wrappers/blob/97b01f43a8e8db07840fc4b95e833a37c0d36b12/iptables-wrapper-installer.sh
readonly num_legacy_lines=$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep -c '^-' || true)
readonly num_nft_lines=$( (timeout 5 sh -c "iptables-nft-save; ip6tables-nft-save" || true) 2>/dev/null | grep -c '^-' || true)


if [ "$num_legacy_lines" -gt 0 ] && [ "$num_nft_lines" -gt 0 ]; then
  echo "Found rules from both versions, iptables-legacy: ${num_legacy_lines} iptables-nft: ${num_nft_lines}"
  echo $NONOK
elif [ "$num_legacy_lines" -gt 0 ] && [ "$num_nft_lines" -eq 0 ]; then
  echo "Using iptables-legacy: ${num_legacy_lines} rules"
  echo $OK
elif [ "$num_legacy_lines" -eq 0 ] && [ "$num_nft_lines" -gt 0 ]; then
  echo "Using iptables-nft: ${num_nft_lines} rules"
  echo $OK
else
  echo "No iptables rules found"
  echo $UNKNOWN
fi

@@ -1,23 +0,0 @@
{
  "plugin": "kmsg",
  "logPath": "/dev/kmsg",
  "lookback": "5m",
  "bufferSize": 10,
  "source": "readonly-monitor",
  "metricsReporting": true,
  "conditions": [
    {
      "type": "ReadonlyFilesystem",
      "reason": "FilesystemIsNotReadOnly",
      "message": "Filesystem is not read-only"
    }
  ],
  "rules": [
    {
      "type": "permanent",
      "condition": "ReadonlyFilesystem",
      "reason": "FilesystemIsReadOnly",
      "pattern": "Remounting filesystem read-only"
    }
  ]
}

@@ -44,9 +44,6 @@
      "disk/bytes_used": {
        "displayName": "disk/bytes_used"
      },
      "disk/percent_used": {
        "displayName": "disk/percent_used"
      },
      "disk/io_time": {
        "displayName": "disk/io_time"
      },

@@ -91,9 +88,6 @@
      },
      "memory/unevictable_used": {
        "displayName": "memory/unevictable_used"
      },
      "memory/percent_used": {
        "displayName": "memory/percent_used"
      }
    }
  },

@@ -37,8 +37,7 @@
        "--lookback=20m",
        "--delay=5m",
        "--count=5",
        "--pattern=Started (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet).",
        "--revert-pattern=Stopping (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet)..."
        "--pattern=Started Kubernetes kubelet."
      ],
      "timeout": "1m"
    },

@@ -52,8 +51,7 @@
        "--log-path=/var/log/journal",
        "--lookback=20m",
        "--count=5",
        "--pattern=Starting (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)...",
        "--revert-pattern=Stopping (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)..."
        "--pattern=Starting Docker Application Container Engine..."
      ],
      "timeout": "1m"
    },

@@ -67,8 +65,7 @@
        "--log-path=/var/log/journal",
        "--lookback=20m",
        "--count=5",
        "--pattern=Starting (containerd container runtime|containerd.service|containerd.service - containerd container runtime)...",
        "--revert-pattern=Stopping (containerd container runtime|containerd.service|containerd.service - containerd container runtime)..."
        "--pattern=Starting containerd container runtime..."
      ],
      "timeout": "1m"
    }

@@ -13,17 +13,17 @@
    {
      "type": "temporary",
      "reason": "KubeletStart",
      "pattern": "Started (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet)."
      "pattern": "Started Kubernetes kubelet."
    },
    {
      "type": "temporary",
      "reason": "DockerStart",
      "pattern": "Starting (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)..."
      "pattern": "Starting Docker Application Container Engine..."
    },
    {
      "type": "temporary",
      "reason": "ContainerdStart",
      "pattern": "Starting (containerd container runtime|containerd.service|containerd.service - containerd container runtime)..."
      "pattern": "Starting containerd container runtime..."
    }
  ]
}

@@ -8,7 +8,7 @@ Restart=always
RestartSec=10
ExecStart=/home/kubernetes/bin/node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false \
    --exporter.stackdriver=/home/kubernetes/node-problem-detector/config/exporter/stackdriver-exporter.json \
    --config.system-log-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor.json,/home/kubernetes/node-problem-detector/config/readonly-monitor.json,/home/kubernetes/node-problem-detector/config/docker-monitor.json,/home/kubernetes/node-problem-detector/config/systemd-monitor.json \
    --config.system-log-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor.json,/home/kubernetes/node-problem-detector/config/docker-monitor.json,/home/kubernetes/node-problem-detector/config/systemd-monitor.json \
    --config.custom-plugin-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor-counter.json,/home/kubernetes/node-problem-detector/config/systemd-monitor-counter.json \
    --config.system-stats-monitor=/home/kubernetes/node-problem-detector/config/system-stats-monitor.json,/home/kubernetes/node-problem-detector/config/net-cgroup-system-stats-monitor.json

@@ -20,11 +20,6 @@
      "type": "temporary",
      "reason": "CorruptContainerImageLayer",
      "pattern": ".*failed to pull and unpack image.*failed to extract layer.*archive/tar: invalid tar header.*"
    },
    {
      "type": "temporary",
      "reason": "HCSEmptyLayerchain",
      "pattern": ".*Failed to unmarshall layerchain json - invalid character '\\x00' looking for beginning of value*"
    }
  ]
}

@@ -13,7 +13,7 @@
    {
      "type": "temporary",
      "reason": "WindowsDefenderThreatsDetected",
      "path": "C:\\etc\\kubernetes\\node-problem-detector\\config\\plugin\\windows_defender_problem.ps1",
      "path": "./config/plugin/windows_defender_problem.ps1",
      "timeout": "3s"
    }
  ]

@@ -44,9 +44,6 @@
      "disk/bytes_used": {
        "displayName": "disk/bytes_used"
      },
      "disk/percent_used": {
        "displayName": "disk/percent_used"
      },
      "disk/io_time": {
        "displayName": "disk/io_time"
      },

@@ -91,9 +88,6 @@
      },
      "memory/unevictable_used": {
        "displayName": "memory/unevictable_used"
      },
      "memory/percent_used": {
        "displayName": "memory/percent_used"
      }
    }
  }

@@ -53,33 +53,15 @@ data:
        {
          "type": "permanent",
          "condition": "KernelDeadlock",
          "reason": "DockerHung",
          "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
          "reason": "AUFSUmountHung",
          "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
        },
        {
          "type": "permanent",
          "condition": "ReadonlyFilesystem",
          "reason": "FilesystemIsReadOnly",
          "pattern": "Remounting filesystem read-only"
        }
      ]
    }
  readonly-monitor.json: |
    {
      "plugin": "kmsg",
      "logPath": "/dev/kmsg",
      "lookback": "5m",
      "bufferSize": 10,
      "source": "readonly-monitor",
      "metricsReporting": true,
      "conditions": [
        {
          "type": "ReadonlyFilesystem",
          "reason": "FilesystemIsNotReadOnly",
          "message": "Filesystem is not read-only"
        }
      ],
      "rules": [
          "condition": "KernelDeadlock",
          "reason": "DockerHung",
          "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
        },
        {
          "type": "permanent",
          "condition": "ReadonlyFilesystem",

@ -1,104 +0,0 @@
|
|||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app: node-problem-detector
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: node-problem-detector
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: node-problem-detector
|
||||
spec:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/os
|
||||
operator: In
|
||||
values:
|
||||
- linux
|
||||
containers:
|
||||
- name: node-problem-detector
|
||||
command:
|
||||
- /node-problem-detector
|
||||
- --logtostderr
|
||||
- --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json,/config/docker-monitor.json
|
||||
- --config.custom-plugin-monitor=/config/health-checker-kubelet.json
|
||||
image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19
|
||||
resources:
|
||||
limits:
|
||||
cpu: 10m
|
||||
memory: 80Mi
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 80Mi
|
||||
imagePullPolicy: Always
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
volumeMounts:
|
||||
- name: log
|
||||
mountPath: /var/log
|
||||
readOnly: true
|
||||
- name: kmsg
|
||||
mountPath: /dev/kmsg
|
||||
readOnly: true
|
||||
# Make sure node problem detector is in the same timezone
|
||||
# with the host.
|
||||
- name: localtime
|
||||
mountPath: /etc/localtime
|
||||
readOnly: true
|
||||
- name: config
|
||||
mountPath: /config
|
||||
readOnly: true
|
||||
- mountPath: /etc/machine-id
|
||||
name: machine-id
|
||||
readOnly: true
|
||||
- mountPath: /run/systemd/system
|
||||
name: systemd
|
||||
- mountPath: /var/run/dbus/
|
||||
name: dbus
|
||||
mountPropagation: Bidirectional
|
||||
volumes:
|
||||
- name: log
|
||||
# Config `log` to your system log directory
|
||||
hostPath:
|
||||
path: /var/log/
|
||||
- name: kmsg
|
||||
hostPath:
|
||||
path: /dev/kmsg
|
||||
- name: localtime
|
||||
hostPath:
|
||||
path: /etc/localtime
|
||||
- name: config
|
||||
configMap:
|
||||
name: node-problem-detector-config
|
||||
items:
|
||||
- key: kernel-monitor.json
|
||||
path: kernel-monitor.json
|
||||
- key: readonly-monitor.json
|
||||
path: readonly-monitor.json
|
||||
- key: docker-monitor.json
|
||||
path: docker-monitor.json
|
||||
- name: machine-id
|
||||
hostPath:
|
||||
path: /etc/machine-id
|
||||
type: "File"
|
||||
- name: systemd
|
||||
hostPath:
|
||||
path: /run/systemd/system/
|
||||
type: ""
|
||||
- name: dbus
|
||||
hostPath:
|
||||
path: /var/run/dbus/
|
||||
type: ""
|
|
@@ -28,8 +28,8 @@ spec:
        command:
        - /node-problem-detector
        - --logtostderr
        - --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json,/config/docker-monitor.json
        image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19
        - --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
        image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7
        resources:
          limits:
            cpu: 10m

@@ -60,7 +60,6 @@ spec:
        - name: config
          mountPath: /config
          readOnly: true
      serviceAccountName: node-problem-detector
      volumes:
      - name: log
        # Config `log` to your system log directory

@@ -78,8 +77,6 @@ spec:
          items:
          - key: kernel-monitor.json
            path: kernel-monitor.json
          - key: readonly-monitor.json
            path: readonly-monitor.json
          - key: docker-monitor.json
            path: docker-monitor.json
      tolerations:

@@ -1,19 +0,0 @@
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-problem-detector
  namespace: kube-system

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: npd-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:node-problem-detector
subjects:
- kind: ServiceAccount
  name: node-problem-detector
  namespace: kube-system

@@ -1,62 +1,9 @@
# Custom Plugin Monitor

## Configuration

### Plugin Config

* `invoke_interval`: Interval at which custom plugins will be invoked.
* `timeout`: Time after which custom plugins invocation will be terminated and considered timeout.
* `timeout`: Time after which custom plugins invokation will be terminated and considered timeout.
* `max_output_length`: The maximum standard output size from custom plugins that NPD will be cut and use for condition status message.
* `concurrency`: The plugin worker number, i.e., how many custom plugins will be invoked concurrently.
* `enable_message_change_based_condition_update`: Flag controls whether message change should result in a condition update.
* `skip_initial_status`: Flag controls whether condition will be emitted during plugin initialization.

### Annotated Plugin Configuration Example

```
{
  "plugin": "custom",
  "pluginConfig": {
    "invoke_interval": "30s",
    "timeout": "5s",
    "max_output_length": 80,
    "concurrency": 3,
    "enable_message_change_based_condition_update": false
  },
  "source": "ntp-custom-plugin-monitor",
  "metricsReporting": true,
  "conditions": [
    {
      "type": "NTPProblem",
      "reason": "NTPIsUp",        // This is the default reason shown when healthy
      "message": "ntp service is up"   // This is the default message shown when healthy
    }
  ],
  "rules": [
    {
      "type": "temporary",        // These are not shown unless there's an
                                  // event so they always relate to a problem.
                                  // There are no defaults since there is nothing
                                  // to show unless there's a problem.
      "reason": "NTPIsDown",      // This is the reason shown for this event
                                  // and the message shown comes from stdout.
      "path": "./config/plugin/check_ntp.sh",
      "timeout": "3s"
    },
    {
      "type": "permanent",        // These are permanent and are shown in the Conditions section
                                  // when running `kubectl describe node ...`
                                  // They have default values shown above in the conditions section
                                  // and also a reason for each specific trigger listed in this rules section.
                                  // Message will come from default for healthy times
                                  // and during unhealthy time message comes from stdout of the check.

      "condition": "NTPProblem",  // This is the key to connect to the corresponding condition listed above
      "reason": "NTPIsDown",      // and the reason shown for failures detected in this rule
                                  // and message will be from stdout of the check.
      "path": "./config/plugin/check_ntp.sh",
      "timeout": "3s"
    }
  ]
}
```
* `enable_message_change_based_condition_update`: Flag controls whether message change should result in a condition update.

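To complement the annotated configuration above, here is a minimal sketch of what a check script such as the referenced `check_ntp.sh` could look like. The exit-code contract (0 = OK, 1 = NonOK, 2 = Unknown) matches the `$OK`/`$NONOK`/`$UNKNOWN` constants used by the plugin scripts elsewhere in this diff, and stdout becomes the condition or event message; the `ntpq` probe itself is illustrative, not the repository's actual script.

```bash
#!/bin/bash
# Minimal custom-plugin check sketch: the exit status tells NPD the result,
# and whatever is printed to stdout is used as the condition/event message.
readonly OK=0
readonly NONOK=1
readonly UNKNOWN=2

# If the probe tool is missing, we cannot decide either way.
if ! command -v ntpq >/dev/null 2>&1; then
  echo "ntpq command not found"
  exit $UNKNOWN
fi

# Healthy: ntpq can query the local NTP daemon.
if ntpq -p >/dev/null 2>&1; then
  echo "ntp service is up"
  exit $OK
fi

echo "ntp service is down"
exit $NONOK
```
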
@@ -4,12 +4,6 @@ These are notes to help follow a consistent release process. See something
important missing? Please submit a pull request to add anything else that would
be useful!

## Prerequisites

Ensure access to the container image [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector).
Add email to `k8s-infra-staging-npd` group in sig-node [groups.yaml](https://github.com/kubernetes/k8s.io/blob/main/groups/sig-node/groups.yaml).
See example https://github.com/kubernetes/k8s.io/pull/1599.

## Preparing for a release

There are a few steps that should be taken prior to creating the actual release

@@ -17,100 +11,37 @@ itself.

1. Collect changes since last release. This can be done by looking directly at
   merged commit messages (``git log [last_release_tag]...HEAD``), or by
   viewing the changes on GitHub (example: https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...master).
   viewing the changes on GitHub ([example:
   https://github.com/kubernetes/node-problem-detector/compare/v0.8.6...master](https://github.com/kubernetes/node-problem-detector/compare/v0.8.6...master)).

2. Based on the changes to be included in the release, determine what the next
1. Based on the changes to be included in the release, determine what the next
   release number should be. We strive to follow [SemVer](https://semver.org/)
   as much as possible.

3. Update [CHANGELOG](https://github.com/kubernetes/node-problem-detector/blob/master/CHANGELOG.md)
1. Update [CHANGELOG](https://github.com/kubernetes/node-problem-detector/blob/master/CHANGELOG.md)
   with all significant changes.

## Create release

### Create the new version tag

#### Option 1
```
# Use v0.8.17 as an example.
git clone git@github.com:kubernetes/node-problem-detector.git
cd node-problem-detector/
git tag v0.8.17
git push origin v0.8.17
```

#### Option 2
Update [version.txt](https://github.com/kubernetes/node-problem-detector/blob/master/version.txt)
(example https://github.com/kubernetes/node-problem-detector/pull/869).

### Build and push artifacts
This step builds the NPD into container files and tar files.
- The container file is pushed to the [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector).
  You will promote the new image to registry.k8s.io later.
- The tar files are generated locally. You will upload those to github in the
  release note later.

**Note: You need the access mentioned in the [prerequisites](#prerequisites)
section to perform steps in this section.**

```
# One-time setup
sudo apt-get install libsystemd-dev gcc-aarch64-linux-gnu

cd node-problem-detector
make release

# Get SHA256 of the tar files. For example
sha256sum node-problem-detector-v0.8.17-linux_amd64.tar.gz
sha256sum node-problem-detector-v0.8.17-linux_arm64.tar.gz
sha256sum node-problem-detector-v0.8.17-windows_amd64.tar.gz

# Get MD5 of the tar files. For example
md5sum node-problem-detector-v0.8.17-linux_amd64.tar.gz
md5sum node-problem-detector-v0.8.17-linux_arm64.tar.gz
md5sum node-problem-detector-v0.8.17-windows_amd64.tar.gz

# Verify container image in staging registry and get SHA256.
docker pull gcr.io/k8s-staging-npd/node-problem-detector:v0.8.17
docker image ls gcr.io/k8s-staging-npd/node-problem-detector --digests
```

### Promote new NPD image to registry.k8s.io
1. Get the SHA256 from the new NPD image from the [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector)
   or previous step.
2. Promote the NPD image to registry.k8s.io ([images.yaml](https://github.com/kubernetes/k8s.io/blob/main/registry.k8s.io/images/k8s-staging-npd/images.yaml), example https://github.com/kubernetes/k8s.io/pull/6523).
3. Verify the container image.
```
docker pull registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17
docker image ls registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17
```

### Create the release note

Go to https://github.com/kubernetes/node-problem-detector/releases, draft a new
release note and publish. Make sure to include the following in the body of the
release note:
Once changes have been merged to the CHANGELOG, perform the actual release via
GitHub. When creating the release, make sure to include the following in the
body of the release:

1. For convenience, add a link to easily view the changes since the last
   release (e.g.
   [https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...v0.8.17](https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...v0.8.17)).
   [https://github.com/kubernetes/node-problem-detector/compare/v0.8.5...v0.8.6](https://github.com/kubernetes/node-problem-detector/compare/v0.8.5...v0.8.6)).

2. There is no need to duplicate everything from the CHANGELOG, but include the
1. There is no need to duplicate everything from the CHANGELOG, but include the
   most significant things so someone just viewing the release entry will have
   an idea of what it includes.

3. Provide a link to the new image release (e.g. `Image:
   registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17`)

4. Upload the tar files built from [pevious step](#build-and-push-artifacts),
   and include the SHA and MD5.
1. Provide a link to the new image release (e.g. `Image:
   k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.6`)

## Post release steps

1. Update image version in node-problem-detector repo, so anyone deploying
   directly from the repo deployment file will get the newest image deployed.
   Example https://github.com/kubernetes/node-problem-detector/pull/897.
1. Update image version in
   [deployment/node-problem-detector.yaml](https://github.com/kubernetes/node-problem-detector/blob/422c088d623488be33aa697588655440c4e6a063/deployment/node-problem-detector.yaml#L32).

2. Update the NPD version in [kubernetes/kubernetes](https://github.com/kubernetes/kubernetes)
   repo, so that kubernetes clusters use the new NPD version. Example
   https://github.com/kubernetes/kubernetes/pull/123740.
   Update the image version in the deployment file so anyone deploying directly
   from the repo deployment file will get the newest image deployed.

go.mod (135 changed lines)
@@ -1,110 +1,41 @@
module k8s.io/node-problem-detector

go 1.24.2
go 1.15

require (
	cloud.google.com/go/compute/metadata v0.6.0
	contrib.go.opencensus.io/exporter/prometheus v0.4.2
	contrib.go.opencensus.io/exporter/stackdriver v0.13.14
	github.com/acobaugh/osrelease v0.1.0
	github.com/avast/retry-go/v4 v4.6.1
	github.com/coreos/go-systemd/v22 v22.5.0
	cloud.google.com/go v0.45.1
	code.cloudfoundry.org/clock v0.0.0-20180518195852-02e53af36e6c
	contrib.go.opencensus.io/exporter/prometheus v0.0.0-20190427222117-f6cda26f80a3
	contrib.go.opencensus.io/exporter/stackdriver v0.13.4
	github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705 // indirect
	github.com/avast/retry-go v2.4.1+incompatible
	github.com/cobaugh/osrelease v0.0.0-20181218015638-a93a0a55a249
	github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e
	github.com/euank/go-kmsg-parser v2.0.0+incompatible
	github.com/go-ole/go-ole v1.2.4 // indirect
	github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
	github.com/google/cadvisor v0.36.0
	github.com/hpcloud/tail v1.0.0
	github.com/prometheus/client_model v0.6.2
	github.com/prometheus/common v0.63.0
	github.com/prometheus/procfs v0.16.1
	github.com/shirou/gopsutil/v3 v3.24.5
	github.com/spf13/pflag v1.0.6
	github.com/stretchr/testify v1.10.0
	go.opencensus.io v0.24.0
	golang.org/x/sys v0.32.0
	google.golang.org/api v0.230.0
	k8s.io/api v0.33.0
	k8s.io/apimachinery v0.33.0
	k8s.io/client-go v0.33.0
	k8s.io/klog/v2 v2.130.1
	k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e
	github.com/onsi/ginkgo v1.10.3
	github.com/onsi/gomega v1.7.1
	github.com/pborman/uuid v1.2.0
	github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4
	github.com/prometheus/common v0.4.1
	github.com/prometheus/procfs v0.2.0
	github.com/shirou/gopsutil v2.19.12+incompatible
	github.com/spf13/pflag v1.0.5
	github.com/stretchr/testify v1.6.1
	github.com/tedsuo/ifrit v0.0.0-20180802180643-bea94bb476cc // indirect
	go.opencensus.io v0.22.4
	golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45
	golang.org/x/sys v0.0.0-20201211090839-8ad439b19e0f
	google.golang.org/api v0.10.0
	k8s.io/api v0.0.0-20190816222004-e3a6b8045b0b
	k8s.io/apimachinery v0.0.0-20190816221834-a9f1d8a9c101
	k8s.io/client-go v11.0.1-0.20190805182717-6502b5e7b1b5+incompatible
	k8s.io/heapster v0.0.0-20180704153620-b25f8a16208f
	k8s.io/kubernetes v1.14.6
	k8s.io/test-infra v0.0.0-20190914015041-e1cbc3ccd91c
)

require (
	cloud.google.com/go/auth v0.16.0 // indirect
	cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
	cloud.google.com/go/monitoring v1.20.3 // indirect
	cloud.google.com/go/trace v1.10.11 // indirect
	github.com/aws/aws-sdk-go v1.44.72 // indirect
	github.com/beorn7/perks v1.0.1 // indirect
	github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect
	github.com/cespare/xxhash/v2 v2.3.0 // indirect
	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
	github.com/felixge/httpsnoop v1.0.4 // indirect
	github.com/fsnotify/fsnotify v1.6.0 // indirect
	github.com/fxamacker/cbor/v2 v2.7.0 // indirect
	github.com/go-kit/log v0.2.1 // indirect
	github.com/go-logfmt/logfmt v0.5.1 // indirect
	github.com/go-logr/logr v1.4.2 // indirect
	github.com/go-logr/stdr v1.2.2 // indirect
	github.com/go-ole/go-ole v1.2.6 // indirect
	github.com/go-openapi/jsonpointer v0.21.0 // indirect
	github.com/go-openapi/jsonreference v0.20.2 // indirect
	github.com/go-openapi/swag v0.23.0 // indirect
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
	github.com/golang/protobuf v1.5.4 // indirect
	github.com/google/gnostic-models v0.6.9 // indirect
	github.com/google/go-cmp v0.7.0 // indirect
	github.com/google/s2a-go v0.1.9 // indirect
	github.com/google/uuid v1.6.0 // indirect
	github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect
	github.com/googleapis/gax-go/v2 v2.14.1 // indirect
	github.com/jmespath/go-jmespath v0.4.0 // indirect
	github.com/josharian/intern v1.0.0 // indirect
	github.com/json-iterator/go v1.1.12 // indirect
	github.com/klauspost/compress v1.17.9 // indirect
	github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
	github.com/mailru/easyjson v0.7.7 // indirect
	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
	github.com/modern-go/reflect2 v1.0.2 // indirect
	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
	github.com/pkg/errors v0.9.1 // indirect
	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
	github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
	github.com/prometheus/client_golang v1.20.4 // indirect
	github.com/prometheus/prometheus v0.35.0 // indirect
	github.com/prometheus/statsd_exporter v0.22.7 // indirect
	github.com/shoenig/go-m1cpu v0.1.6 // indirect
	github.com/tklauser/go-sysconf v0.3.12 // indirect
	github.com/tklauser/numcpus v0.6.1 // indirect
	github.com/x448/float16 v0.8.4 // indirect
	github.com/yusufpapurcu/wmi v1.2.4 // indirect
	go.opentelemetry.io/auto/sdk v1.1.0 // indirect
	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect
	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
	go.opentelemetry.io/otel v1.35.0 // indirect
	go.opentelemetry.io/otel/metric v1.35.0 // indirect
	go.opentelemetry.io/otel/trace v1.35.0 // indirect
	golang.org/x/crypto v0.37.0 // indirect
	golang.org/x/net v0.39.0 // indirect
	golang.org/x/oauth2 v0.29.0 // indirect
	golang.org/x/sync v0.13.0 // indirect
	golang.org/x/term v0.31.0 // indirect
	golang.org/x/text v0.24.0 // indirect
	golang.org/x/time v0.11.0 // indirect
	google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf // indirect
	google.golang.org/genproto/googleapis/api v0.0.0-20250218202821-56aae31c358a // indirect
	google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e // indirect
	google.golang.org/grpc v1.72.0 // indirect
	google.golang.org/protobuf v1.36.6 // indirect
	gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
	gopkg.in/fsnotify.v1 v1.4.7 // indirect
	gopkg.in/inf.v0 v0.9.1 // indirect
	gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
	gopkg.in/yaml.v2 v2.4.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
	k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
	sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
	sigs.k8s.io/randfill v1.0.0 // indirect
	sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect
	sigs.k8s.io/yaml v1.4.0 // indirect
)
replace git.apache.org/thrift.git => github.com/apache/thrift v0.0.0-20180902110319-2566ecd5d999

@@ -1,46 +0,0 @@
#!/bin/bash

# Copyright 2024 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -o errexit
set -o nounset
set -o pipefail

VERSION="$1"

NPD_LINUX_AMD64=node-problem-detector-${VERSION}-linux_amd64.tar.gz
NPD_LINUX_ARM64=node-problem-detector-${VERSION}-linux_arm64.tar.gz
NPD_WINDOWS_AMD64=node-problem-detector-${VERSION}-windows_amd64.tar.gz

SHA_NPD_LINUX_AMD64=$(sha256sum ${NPD_LINUX_AMD64} | cut -d' ' -f1)
SHA_NPD_LINUX_ARM64=$(sha256sum ${NPD_LINUX_ARM64} | cut -d' ' -f1)
SHA_NPD_WINDOWS_AMD64=$(sha256sum ${NPD_WINDOWS_AMD64} | cut -d' ' -f1)

MD5_NPD_LINUX_AMD64=$(md5sum ${NPD_LINUX_AMD64} | cut -d' ' -f1)
MD5_NPD_LINUX_ARM64=$(md5sum ${NPD_LINUX_ARM64} | cut -d' ' -f1)
MD5_NPD_WINDOWS_AMD64=$(md5sum ${NPD_WINDOWS_AMD64} | cut -d' ' -f1)

echo
echo **${NPD_LINUX_AMD64}**:
echo **SHA**: ${SHA_NPD_LINUX_AMD64}
echo **MD5**: ${MD5_NPD_LINUX_AMD64}
echo
echo **${NPD_LINUX_ARM64}**:
echo **SHA**: ${SHA_NPD_LINUX_ARM64}
echo **MD5**: ${MD5_NPD_LINUX_ARM64}
echo
echo **${NPD_WINDOWS_AMD64}**:
echo **SHA**: ${SHA_NPD_WINDOWS_AMD64}
echo **MD5**: ${MD5_NPD_WINDOWS_AMD64}

@@ -1,32 +0,0 @@
#!/bin/bash -xe

# Copyright 2023 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

VERSION=$(cat version.txt)

if [[ ! "${VERSION}" =~ ^v([0-9]+[.][0-9]+)[.]([0-9]+)(-(alpha|beta)[.]([0-9]+))?$ ]]; then
  echo "Version ${VERSION} must be 'X.Y.Z', 'X.Y.Z-alpha.N', or 'X.Y.Z-beta.N'"
  exit 1
fi

if [ "$(git tag -l "${VERSION}")" ]; then
  echo "Tag ${VERSION} already exists"
  exit 1
fi

git tag -a -m "Release ${VERSION}" "${VERSION}"
git push origin "${VERSION}"

echo "release_tag=refs/tags/${VERSION}" >> $GITHUB_OUTPUT

@ -1,30 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2024 The Kubernetes Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -o errexit
|
||||
set -o nounset
|
||||
set -o pipefail
|
||||
|
||||
make gomod
|
||||
changes=$(git status --porcelain go.mod go.sum vendor/ tests/e2e/go.mod tests/e2e/go.sum || true)
|
||||
if [ -n "${changes}" ]; then
|
||||
echo "ERROR: go modules are not up to date; please run: make gomod"
|
||||
echo "changed files:"
|
||||
printf "%s" "${changes}\n"
|
||||
echo "git diff:"
|
||||
git --no-pager diff
|
||||
exit 1
|
||||
fi
|
|
@@ -18,10 +18,10 @@ package custompluginmonitor

import (
	"encoding/json"
	"os"
	"io/ioutil"
	"time"

	"k8s.io/klog/v2"
	"github.com/golang/glog"

	"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
	cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"

@@ -47,6 +47,7 @@ type customPluginMonitor struct {
	config     cpmtypes.CustomPluginConfig
	conditions []types.Condition
	plugin     *plugin.Plugin
	resultChan <-chan cpmtypes.Result
	statusChan chan *types.Status
	tomb       *tomb.Tomb
}

@@ -57,27 +58,27 @@ func NewCustomPluginMonitorOrDie(configPath string) types.Monitor {
		configPath: configPath,
		tomb:       tomb.NewTomb(),
	}
	f, err := os.ReadFile(configPath)
	f, err := ioutil.ReadFile(configPath)
	if err != nil {
		klog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
		glog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
	}
	err = json.Unmarshal(f, &c.config)
	if err != nil {
		klog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
		glog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
	}
	// Apply configurations
	err = (&c.config).ApplyConfiguration()
	if err != nil {
		klog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
		glog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
	}

	// Validate configurations
	err = c.config.Validate()
	if err != nil {
		klog.Fatalf("Failed to validate custom plugin config %+v: %v", c.config, err)
		glog.Fatalf("Failed to validate custom plugin config %+v: %v", c.config, err)
	}

	klog.Infof("Finish parsing custom plugin monitor config file %s: %+v", c.configPath, c.config)
	glog.Infof("Finish parsing custom plugin monitor config file %s: %+v", c.configPath, c.config)

	c.plugin = plugin.NewPlugin(c.config)
	// A 1000 size channel should be big enough.

@@ -96,39 +97,32 @@ func initializeProblemMetricsOrDie(rules []*cpmtypes.CustomRule) {
		if rule.Type == types.Perm {
			err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, false)
			if err != nil {
				klog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
				glog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
					rule.Condition, rule.Reason, err)
			}
		}
		err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 0)
		if err != nil {
			klog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
			glog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
		}
	}
}

func (c *customPluginMonitor) Start() (<-chan *types.Status, error) {
	klog.Infof("Start custom plugin monitor %s", c.configPath)
	glog.Infof("Start custom plugin monitor %s", c.configPath)
	go c.plugin.Run()
	go c.monitorLoop()
	return c.statusChan, nil
}

func (c *customPluginMonitor) Stop() {
	klog.Infof("Stop custom plugin monitor %s", c.configPath)
	glog.Infof("Stop custom plugin monitor %s", c.configPath)
	c.tomb.Stop()
}

// monitorLoop is the main loop of customPluginMonitor.
// there is one customPluginMonitor, one plugin instance for each configPath.
// each runs rules in parallel at pre-configured concurrency, and interval.
func (c *customPluginMonitor) monitorLoop() {
	c.initializeConditions()
	if *c.config.PluginGlobalConfig.SkipInitialStatus {
		klog.Infof("Skipping sending initial status. Using default conditions: %+v", c.conditions)
	} else {
		c.sendInitialStatus()
	}
	c.initializeStatus()

	resultChan := c.plugin.GetResultChan()

@@ -136,16 +130,16 @@ func (c *customPluginMonitor) monitorLoop() {
		select {
		case result, ok := <-resultChan:
			if !ok {
				klog.Errorf("Result channel closed: %s", c.configPath)
				glog.Errorf("Result channel closed: %s", c.configPath)
				return
			}
			klog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
			glog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
			status := c.generateStatus(result)
			klog.V(3).Infof("New status generated: %+v", status)
			glog.V(3).Infof("New status generated: %+v", status)
			c.statusChan <- status
		case <-c.tomb.Stopping():
			c.plugin.Stop()
			klog.Infof("Custom plugin monitor stopped: %s", c.configPath)
			glog.Infof("Custom plugin monitor stopped: %s", c.configPath)
			c.tomb.Done()
			return
		}

@@ -238,7 +232,6 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
				condition.Type,
				status,
				newReason,
				newMessage,
				timestamp,
			)

@@ -259,7 +252,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
			err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(
				event.Reason, 1)
			if err != nil {
				klog.Errorf("Failed to update problem counter metrics for %q: %v",
				glog.Errorf("Failed to update problem counter metrics for %q: %v",
					event.Reason, err)
			}
		}

@@ -267,7 +260,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
			err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(
				condition.Type, condition.Reason, condition.Status == types.True)
			if err != nil {
				klog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
				glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
					condition.Type, condition.Reason, err)
			}
		}

@@ -280,7 +273,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
	}
	// Log only if condition has changed
	if len(activeProblemEvents) != 0 || len(inactiveProblemEvents) != 0 {
		klog.V(0).Infof("New status generated: %+v", status)
		glog.V(0).Infof("New status generated: %+v", status)
	}
	return status
}

@@ -296,9 +289,11 @@ func toConditionStatus(s cpmtypes.Status) types.ConditionStatus {
	}
}

// sendInitialStatus sends the initial status to the node problem detector.
func (c *customPluginMonitor) sendInitialStatus() {
	klog.Infof("Sending initial status for %s with conditions: %+v", c.config.Source, c.conditions)
// initializeStatus initializes the internal condition and also reports it to the node problem detector.
func (c *customPluginMonitor) initializeStatus() {
	// Initialize the default node conditions
	c.conditions = initialConditions(c.config.DefaultConditions)
	glog.Infof("Initialize condition generated: %+v", c.conditions)
	// Update the initial status
	c.statusChan <- &types.Status{
		Source: c.config.Source,

@@ -306,12 +301,6 @@ func (c *customPluginMonitor) sendInitialStatus() {
	}
}

// initializeConditions initializes the internal node conditions.
func (c *customPluginMonitor) initializeConditions() {
	c.conditions = initialConditions(c.config.DefaultConditions)
	klog.Infof("Initialized conditions for %s: %+v", c.configPath, c.conditions)
}

func initialConditions(defaults []types.Condition) []types.Condition {
	conditions := make([]types.Condition, len(defaults))
|
||||
copy(conditions, defaults)
|
||||
|
|
|
@@ -20,13 +20,14 @@ import (
"context"
"fmt"
"io"
"io/ioutil"
"os/exec"
"strings"
"sync"
"syscall"
"time"

"k8s.io/klog/v2"
"github.com/golang/glog"
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
"k8s.io/node-problem-detector/pkg/util"
"k8s.io/node-problem-detector/pkg/util/tomb"
@@ -60,7 +61,7 @@ func (p *Plugin) GetResultChan() <-chan cpmtypes.Result {

func (p *Plugin) Run() {
defer func() {
klog.Info("Stopping plugin execution")
glog.Info("Stopping plugin execution")
close(p.resultChan)
p.tomb.Done()
}()
@@ -89,10 +90,9 @@ func (p *Plugin) Run() {

// run each rule in parallel and wait for them to complete
func (p *Plugin) runRules() {
klog.V(3).Info("Start to run custom plugins")
glog.V(3).Info("Start to run custom plugins")

for _, rule := range p.config.Rules {
// syncChan limits concurrent goroutines to configured PluginGlobalConfig.Concurrency value
p.syncChan <- struct{}{}
p.Add(1)
go func(rule *cpmtypes.CustomRule) {
@@ -103,12 +103,8 @@ func (p *Plugin) runRules() {

start := time.Now()
exitStatus, message := p.run(*rule)
level := klog.Level(3)
if exitStatus != 0 {
level = klog.Level(2)
}

klog.V(level).Infof("Rule: %+v. Start time: %v. End time: %v. Duration: %v", rule, start, time.Now(), time.Since(start))
glog.V(3).Infof("Rule: %+v. Start time: %v. End time: %v. Duration: %v", rule, start, time.Now(), time.Since(start))

result := cpmtypes.Result{
Rule: rule,
@@ -116,27 +112,26 @@ func (p *Plugin) runRules() {
Message: message,
}

// pipes result into resultChan which customPluginMonitor instance generates status from
p.resultChan <- result

// Let the result be logged at a higher verbosity level. If there is a change in status it is logged later.
klog.V(level).Infof("Add check result %+v for rule %+v", result, rule)
glog.V(3).Infof("Add check result %+v for rule %+v", result, rule)
}(rule)
}

p.Wait()
klog.V(3).Info("Finish running custom plugins")
glog.V(3).Info("Finish running custom plugins")
}

// readFromReader reads the maxBytes from the reader and drains the rest.
func readFromReader(reader io.ReadCloser, maxBytes int64) ([]byte, error) {
limitReader := io.LimitReader(reader, maxBytes)
data, err := io.ReadAll(limitReader)
data, err := ioutil.ReadAll(limitReader)
if err != nil {
return []byte{}, err
}
// Drain the reader
if _, err := io.Copy(io.Discard, reader); err != nil {
if _, err := io.Copy(ioutil.Discard, reader); err != nil {
return []byte{}, err
}
return data, nil
@@ -157,16 +152,16 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp

stdoutPipe, err := cmd.StdoutPipe()
if err != nil {
klog.Errorf("Error creating stdout pipe for plugin %q: error - %v", rule.Path, err)
glog.Errorf("Error creating stdout pipe for plugin %q: error - %v", rule.Path, err)
return cpmtypes.Unknown, "Error creating stdout pipe for plugin. Please check the error log"
}
stderrPipe, err := cmd.StderrPipe()
if err != nil {
klog.Errorf("Error creating stderr pipe for plugin %q: error - %v", rule.Path, err)
glog.Errorf("Error creating stderr pipe for plugin %q: error - %v", rule.Path, err)
return cpmtypes.Unknown, "Error creating stderr pipe for plugin. Please check the error log"
}
if err := cmd.Start(); err != nil {
klog.Errorf("Error in starting plugin %q: error - %v", rule.Path, err)
glog.Errorf("Error in starting plugin %q: error - %v", rule.Path, err)
return cpmtypes.Unknown, "Error in starting plugin. Please check the error log"
}

@@ -182,9 +177,9 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
if ctx.Err() == context.Canceled {
return
}
klog.Errorf("Error in running plugin timeout %q", rule.Path)
glog.Errorf("Error in running plugin timeout %q", rule.Path)
if cmd.Process == nil || cmd.Process.Pid == 0 {
klog.Errorf("Error in cmd.Process check %q", rule.Path)
glog.Errorf("Error in cmd.Process check %q", rule.Path)
break
}

@@ -194,7 +189,7 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp

err := util.Kill(cmd)
if err != nil {
klog.Errorf("Error in kill process %d, %v", cmd.Process.Pid, err)
glog.Errorf("Error in kill process %d, %v", cmd.Process.Pid, err)
}
case <-waitChan:
return
@@ -223,18 +218,18 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
wg.Wait()

if stdoutErr != nil {
klog.Errorf("Error reading stdout for plugin %q: error - %v", rule.Path, err)
glog.Errorf("Error reading stdout for plugin %q: error - %v", rule.Path, err)
return cpmtypes.Unknown, "Error reading stdout for plugin. Please check the error log"
}

if stderrErr != nil {
klog.Errorf("Error reading stderr for plugin %q: error - %v", rule.Path, err)
glog.Errorf("Error reading stderr for plugin %q: error - %v", rule.Path, err)
return cpmtypes.Unknown, "Error reading stderr for plugin. Please check the error log"
}

if err := cmd.Wait(); err != nil {
if _, ok := err.(*exec.ExitError); !ok {
klog.Errorf("Error in waiting for plugin %q: error - %v. output - %q", rule.Path, err, string(stdout))
glog.Errorf("Error in waiting for plugin %q: error - %v. output - %q", rule.Path, err, string(stdout))
return cpmtypes.Unknown, "Error in waiting for plugin. Please check the error log"
}
}
@@ -273,12 +268,12 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
// Stop the plugin.
func (p *Plugin) Stop() {
p.tomb.Stop()
klog.Info("Stop plugin execution")
glog.Info("Stop plugin execution")
}

func logPluginStderr(rule cpmtypes.CustomRule, logs string, logLevel klog.Level) {
func logPluginStderr(rule cpmtypes.CustomRule, logs string, logLevel glog.Level) {
if len(logs) != 0 {
klog.V(logLevel).Infof("Start logs from plugin %+v \n %s", rule, logs)
klog.V(logLevel).Infof("End logs from plugin %+v", rule)
glog.V(logLevel).Infof("Start logs from plugin %+v \n %s", rule, logs)
glog.V(logLevel).Infof("End logs from plugin %+v", rule)
}
}

@@ -33,7 +33,6 @@ var (
defaultConcurrency = 3
defaultMessageChangeBasedConditionUpdate = false
defaultEnableMetricsReporting = true
defaultSkipInitialStatus = false

customPluginName = "custom"
)
@@ -53,11 +52,9 @@ type pluginGlobalConfig struct {
Concurrency *int `json:"concurrency,omitempty"`
// EnableMessageChangeBasedConditionUpdate indicates whether NPD should enable message change based condition update.
EnableMessageChangeBasedConditionUpdate *bool `json:"enable_message_change_based_condition_update,omitempty"`
// SkipInitialStatus prevents the first status update with default conditions
SkipInitialStatus *bool `json:"skip_initial_status,omitempty"`
}

// CustomPluginConfig is the configuration of custom plugin monitor.
// Custom plugin config is the configuration of custom plugin monitor.
type CustomPluginConfig struct {
// Plugin is the name of plugin which is currently used.
// Currently supported: custom.
@@ -108,10 +105,6 @@ func (cpc *CustomPluginConfig) ApplyConfiguration() error {
cpc.PluginGlobalConfig.EnableMessageChangeBasedConditionUpdate = &defaultMessageChangeBasedConditionUpdate
}

if cpc.PluginGlobalConfig.SkipInitialStatus == nil {
cpc.PluginGlobalConfig.SkipInitialStatus = &defaultSkipInitialStatus
}

for _, rule := range cpc.Rules {
if rule.TimeoutString != nil {
timeout, err := time.ParseDuration(*rule.TimeoutString)

@@ -33,7 +33,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
concurrency := 2
messageChangeBasedConditionUpdate := true
disableMetricsReporting := false
disableInitialStatusUpdate := true

ruleTimeout := 1 * time.Second
ruleTimeoutString := ruleTimeout.String()
@@ -63,7 +62,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
Rules: []*CustomRule{
@@ -93,7 +91,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
@@ -113,7 +110,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
@@ -133,7 +129,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &maxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
@@ -153,7 +148,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &concurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
@@ -173,7 +167,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &messageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
@@ -191,30 +184,10 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &disableMetricsReporting,
},
},
"disable status update during initialization": {
Orig: CustomPluginConfig{PluginGlobalConfig: pluginGlobalConfig{
SkipInitialStatus: &disableInitialStatusUpdate,
},
},
Wanted: CustomPluginConfig{
PluginGlobalConfig: pluginGlobalConfig{
InvokeIntervalString: &defaultInvokeIntervalString,
InvokeInterval: &defaultInvokeInterval,
TimeoutString: &defaultGlobalTimeoutString,
Timeout: &defaultGlobalTimeout,
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &disableInitialStatusUpdate,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
},
}

for desp, utMeta := range utMetas {

@@ -17,9 +17,8 @@ limitations under the License.
package types

import (
"time"

"k8s.io/node-problem-detector/pkg/types"
"time"
)

type Status int

@@ -17,7 +17,6 @@ limitations under the License.
package condition

import (
"context"
"reflect"
"sync"
"time"
@@ -26,10 +25,10 @@ import (
"k8s.io/node-problem-detector/pkg/types"
problemutil "k8s.io/node-problem-detector/pkg/util"

v1 "k8s.io/api/core/v1"
"k8s.io/utils/clock"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/clock"

"k8s.io/klog/v2"
"github.com/golang/glog"
)

const (
@@ -50,7 +49,7 @@ const (
// not. This addresses 3).
type ConditionManager interface {
// Start starts the condition manager.
Start(ctx context.Context)
Start()
// UpdateCondition updates a specific condition.
UpdateCondition(types.Condition)
// GetConditions returns all current conditions.
@@ -68,7 +67,7 @@ type conditionManager struct {
// No lock is needed in `sync`, because it is in the same goroutine with the
// write operation.
sync.RWMutex
clock clock.WithTicker
clock clock.Clock
latestTry time.Time
resyncNeeded bool
client problemclient.Client
@@ -79,18 +78,18 @@ type conditionManager struct {
}

// NewConditionManager creates a condition manager.
func NewConditionManager(client problemclient.Client, clockInUse clock.WithTicker, heartbeatPeriod time.Duration) ConditionManager {
func NewConditionManager(client problemclient.Client, clock clock.Clock, heartbeatPeriod time.Duration) ConditionManager {
return &conditionManager{
client: client,
clock: clockInUse,
clock: clock,
updates: make(map[string]types.Condition),
conditions: make(map[string]types.Condition),
heartbeatPeriod: heartbeatPeriod,
}
}

func (c *conditionManager) Start(ctx context.Context) {
go c.syncLoop(ctx)
func (c *conditionManager) Start() {
go c.syncLoop()
}

func (c *conditionManager) UpdateCondition(condition types.Condition) {
@@ -111,17 +110,15 @@ func (c *conditionManager) GetConditions() []types.Condition {
return conditions
}

func (c *conditionManager) syncLoop(ctx context.Context) {
func (c *conditionManager) syncLoop() {
ticker := c.clock.NewTicker(updatePeriod)
defer ticker.Stop()
for {
select {
case <-ticker.C():
if c.needUpdates() || c.needResync() || c.needHeartbeat() {
c.sync(ctx)
c.sync()
}
case <-ctx.Done():
break
}
}
}
@@ -153,16 +150,16 @@ func (c *conditionManager) needHeartbeat() bool {
}

// sync synchronizes node conditions with the apiserver.
func (c *conditionManager) sync(ctx context.Context) {
func (c *conditionManager) sync() {
c.latestTry = c.clock.Now()
c.resyncNeeded = false
conditions := []v1.NodeCondition{}
for i := range c.conditions {
conditions = append(conditions, problemutil.ConvertToAPICondition(c.conditions[i]))
}
if err := c.client.SetConditions(ctx, conditions); err != nil {
if err := c.client.SetConditions(conditions); err != nil {
// The conditions will be updated again in future sync
klog.Errorf("failed to update node conditions: %v", err)
glog.Errorf("failed to update node conditions: %v", err)
c.resyncNeeded = true
return
}

@@ -17,7 +17,6 @@ limitations under the License.
package condition

import (
"context"
"fmt"
"testing"
"time"
@@ -29,14 +28,14 @@ import (
problemutil "k8s.io/node-problem-detector/pkg/util"

v1 "k8s.io/api/core/v1"
testclock "k8s.io/utils/clock/testing"
"k8s.io/apimachinery/pkg/util/clock"
)

const heartbeatPeriod = 1 * time.Minute

func newTestManager() (*conditionManager, *problemclient.FakeProblemClient, *testclock.FakeClock) {
func newTestManager() (*conditionManager, *problemclient.FakeProblemClient, *clock.FakeClock) {
fakeClient := problemclient.NewFakeProblemClient()
fakeClock := testclock.NewFakeClock(time.Now())
fakeClock := clock.NewFakeClock(time.Now())
manager := NewConditionManager(fakeClient, fakeClock, heartbeatPeriod)
return manager.(*conditionManager), fakeClient, fakeClock
}
@@ -110,7 +109,7 @@ func TestResync(t *testing.T) {
m, fakeClient, fakeClock := newTestManager()
condition := newTestCondition("TestCondition")
m.conditions = map[string]types.Condition{condition.Type: condition}
m.sync(context.Background())
m.sync()
expected := []v1.NodeCondition{problemutil.ConvertToAPICondition(condition)}
assert.Nil(t, fakeClient.AssertConditions(expected), "Condition should be updated via client")

@@ -119,7 +118,7 @@ func TestResync(t *testing.T) {
assert.False(t, m.needResync(), "Should not resync after resync period without resync needed")

fakeClient.InjectError("SetConditions", fmt.Errorf("injected error"))
m.sync(context.Background())
m.sync()

assert.False(t, m.needResync(), "Should not resync before resync period")
fakeClock.Step(resyncPeriod)
@@ -130,7 +129,7 @@ func TestHeartbeat(t *testing.T) {
m, fakeClient, fakeClock := newTestManager()
condition := newTestCondition("TestCondition")
m.conditions = map[string]types.Condition{condition.Type: condition}
m.sync(context.Background())
m.sync()
expected := []v1.NodeCondition{problemutil.ConvertToAPICondition(condition)}
assert.Nil(t, fakeClient.AssertConditions(expected), "Condition should be updated via client")

@@ -17,16 +17,15 @@ limitations under the License.
package k8sexporter

import (
"context"
"net"
"net/http"
"net/http/pprof"
_ "net/http/pprof"
"strconv"

"k8s.io/klog/v2"
"github.com/golang/glog"

"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/utils/clock"

"k8s.io/node-problem-detector/cmd/options"
"k8s.io/node-problem-detector/pkg/exporters/k8sexporter/condition"
@@ -38,8 +37,6 @@ import (
type k8sExporter struct {
client problemclient.Client
conditionManager condition.ConditionManager
writeEvents bool
updateConditions bool
}

// NewExporterOrDie creates a exporter for Kubernetes apiserver exporting,
@@ -47,41 +44,35 @@ type k8sExporter struct {
//
// Note that this function may be blocked (until a timeout occurs) before
// kube-apiserver becomes ready.
func NewExporterOrDie(ctx context.Context, npdo *options.NodeProblemDetectorOptions) types.Exporter {
func NewExporterOrDie(npdo *options.NodeProblemDetectorOptions) types.Exporter {
if !npdo.EnableK8sExporter {
return nil
}

c := problemclient.NewClientOrDie(npdo)

klog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout)
if err := waitForAPIServerReadyWithTimeout(ctx, c, npdo); err != nil {
klog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err)
glog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout)
if err := waitForAPIServerReadyWithTimeout(c, npdo); err != nil {
glog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err)
}

ke := k8sExporter{
client: c,
conditionManager: condition.NewConditionManager(c, clock.RealClock{}, npdo.K8sExporterHeartbeatPeriod),
writeEvents: npdo.K8sExporterWriteEvents,
updateConditions: npdo.K8sExporterUpdateNodeConditions,
}

ke.startHTTPReporting(npdo)
ke.conditionManager.Start(ctx)
ke.conditionManager.Start()

return &ke
}

func (ke *k8sExporter) ExportProblems(status *types.Status) {
if ke.writeEvents {
for _, event := range status.Events {
ke.client.Eventf(util.ConvertToAPIEventType(event.Severity), status.Source, event.Reason, event.Message)
}
for _, event := range status.Events {
ke.client.Eventf(util.ConvertToAPIEventType(event.Severity), status.Source, event.Reason, event.Message)
}
if ke.updateConditions {
for _, cdt := range status.Conditions {
ke.conditionManager.UpdateCondition(cdt)
}
for _, cdt := range status.Conditions {
ke.conditionManager.UpdateCondition(cdt)
}
}

@@ -103,30 +94,22 @@ func (ke *k8sExporter) startHTTPReporting(npdo *options.NodeProblemDetectorOptio
util.ReturnHTTPJson(w, ke.conditionManager.GetConditions())
})

// register pprof
mux.HandleFunc("/debug/pprof/", pprof.Index)
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)

addr := net.JoinHostPort(npdo.ServerAddress, strconv.Itoa(npdo.ServerPort))
go func() {
err := http.ListenAndServe(addr, mux)
if err != nil {
klog.Fatalf("Failed to start server: %v", err)
glog.Fatalf("Failed to start server: %v", err)
}
}()
}

func waitForAPIServerReadyWithTimeout(ctx context.Context, c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error {
return wait.PollUntilContextTimeout(ctx, npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, true, func(ctx context.Context) (done bool, err error) {
func waitForAPIServerReadyWithTimeout(c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error {
return wait.PollImmediate(npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, func() (done bool, err error) {
// If NPD can get the node object from kube-apiserver, the server is
// ready and the RBAC permission is set correctly.
if _, err := c.GetNode(ctx); err != nil {
klog.Errorf("Can't get node object: %v", err)
return false, err
if _, err := c.GetNode(); err == nil {
return true, nil
}
return true, nil
return false, nil
})
}

@@ -17,7 +17,6 @@ limitations under the License.
package problemclient

import (
"context"
"fmt"
"reflect"
"sync"
@@ -61,7 +60,7 @@ func (f *FakeProblemClient) AssertConditions(expected []v1.NodeCondition) error
}

// SetConditions is a fake mimic of SetConditions, it only update the internal condition cache.
func (f *FakeProblemClient) SetConditions(ctx context.Context, conditions []v1.NodeCondition) error {
func (f *FakeProblemClient) SetConditions(conditions []v1.NodeCondition) error {
f.Lock()
defer f.Unlock()
if err, ok := f.errors["SetConditions"]; ok {
@@ -74,7 +73,7 @@ func (f *FakeProblemClient) SetConditions(ctx context.Context, conditions []v1.N
}

// GetConditions is a fake mimic of GetConditions, it returns the conditions cached internally.
func (f *FakeProblemClient) GetConditions(ctx context.Context, types []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
func (f *FakeProblemClient) GetConditions(types []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
f.Lock()
defer f.Unlock()
if err, ok := f.errors["GetConditions"]; ok {
@@ -94,6 +93,6 @@ func (f *FakeProblemClient) GetConditions(ctx context.Context, types []v1.NodeCo
func (f *FakeProblemClient) Eventf(eventType string, source, reason, messageFmt string, args ...interface{}) {
}

func (f *FakeProblemClient) GetNode(ctx context.Context) (*v1.Node, error) {
func (f *FakeProblemClient) GetNode() (*v1.Node, error) {
return nil, fmt.Errorf("GetNode() not implemented")
}

@@ -17,24 +17,24 @@ limitations under the License.
package problemclient

import (
"context"
"encoding/json"
"fmt"
"net/url"
"os"
"path/filepath"

typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/kubernetes/pkg/api/legacyscheme"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/clock"
clientset "k8s.io/client-go/kubernetes"
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/retry"
"k8s.io/klog/v2"
"k8s.io/utils/clock"

"github.com/golang/glog"
"k8s.io/heapster/common/kubernetes"
"k8s.io/node-problem-detector/cmd/options"
"k8s.io/node-problem-detector/pkg/version"
)
@@ -42,14 +42,14 @@ import (
// Client is the interface of problem client
type Client interface {
// GetConditions get all specific conditions of current node.
GetConditions(ctx context.Context, conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error)
GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error)
// SetConditions set or update conditions of current node.
SetConditions(ctx context.Context, conditionTypes []v1.NodeCondition) error
SetConditions(conditions []v1.NodeCondition) error
// Eventf reports the event.
Eventf(eventType string, source, reason, messageFmt string, args ...interface{})
// GetNode returns the Node object of the node on which the
// node-problem-detector runs.
GetNode(ctx context.Context) (*v1.Node, error)
GetNode() (*v1.Node, error)
}

type nodeProblemClient struct {
@@ -68,14 +68,13 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client {
// we have checked it is a valid URI after command line argument is parsed.:)
uri, _ := url.Parse(npdo.ApiServerOverride)

cfg, err := getKubeClientConfig(uri)
cfg, err := kubernetes.GetKubeClientConfig(uri)
if err != nil {
panic(err)
}

cfg.UserAgent = fmt.Sprintf("%s/%s", filepath.Base(os.Args[0]), version.Version())
cfg.QPS = npdo.QPS
cfg.Burst = npdo.Burst
// TODO(random-liu): Set QPS Limit
c.client = clientset.NewForConfigOrDie(cfg).CoreV1()
c.nodeName = npdo.NodeName
c.eventNamespace = npdo.EventNamespace
@@ -84,8 +83,8 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client {
return c
}

func (c *nodeProblemClient) GetConditions(ctx context.Context, conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
node, err := c.GetNode(ctx)
func (c *nodeProblemClient) GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
node, err := c.GetNode()
if err != nil {
return nil, err
}
@@ -100,7 +99,7 @@ func (c *nodeProblemClient) GetConditions(ctx context.Context, conditionTypes []
return conditions, nil
}

func (c *nodeProblemClient) SetConditions(ctx context.Context, newConditions []v1.NodeCondition) error {
func (c *nodeProblemClient) SetConditions(newConditions []v1.NodeCondition) error {
for i := range newConditions {
// Each time we update the conditions, we update the heart beat time
newConditions[i].LastHeartbeatTime = metav1.NewTime(c.clock.Now())
@@ -109,15 +108,7 @@ func (c *nodeProblemClient) SetConditions(ctx context.Context, newConditions []v
if err != nil {
return err
}
return retry.OnError(retry.DefaultRetry,
func(error) bool {
return true
},
func() error {
_, err := c.client.Nodes().PatchStatus(ctx, c.nodeName, patch)
return err
},
)
return c.client.RESTClient().Patch(types.StrategicMergePatchType).Resource("nodes").Name(c.nodeName).SubResource("status").Body(patch).Do().Error()
}

func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string, args ...interface{}) {
@@ -130,10 +121,8 @@ func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string,
recorder.Eventf(c.nodeRef, eventType, reason, messageFmt, args...)
}

func (c *nodeProblemClient) GetNode(ctx context.Context) (*v1.Node, error) {
// To reduce the load on APIServer & etcd, we are serving GET operations from
// apiserver cache (the data might be slightly delayed).
return c.client.Nodes().Get(ctx, c.nodeName, metav1.GetOptions{ResourceVersion: "0"})
func (c *nodeProblemClient) GetNode() (*v1.Node, error) {
return c.client.Nodes().Get(c.nodeName, metav1.GetOptions{})
}

// generatePatch generates condition patch
@@ -148,8 +137,8 @@ func generatePatch(conditions []v1.NodeCondition) ([]byte, error) {
// getEventRecorder generates a recorder for specific node name and source.
func getEventRecorder(c typedcorev1.CoreV1Interface, namespace, nodeName, source string) record.EventRecorder {
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartLogging(klog.V(4).Infof)
recorder := eventBroadcaster.NewRecorder(runtime.NewScheme(), v1.EventSource{Component: source, Host: nodeName})
eventBroadcaster.StartLogging(glog.V(4).Infof)
recorder := eventBroadcaster.NewRecorder(legacyscheme.Scheme, v1.EventSource{Component: source, Host: nodeName})
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: c.Events(namespace)})
return recorder
}

@@ -22,10 +22,10 @@ import (
"testing"
"time"

v1 "k8s.io/api/core/v1"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/client-go/tools/record"
testclock "k8s.io/utils/clock/testing"

"github.com/stretchr/testify/assert"
)
@@ -40,7 +40,7 @@ func newFakeProblemClient() *nodeProblemClient {
nodeName: testNode,
// There is no proper fake for *client.Client for now
// TODO(random-liu): Add test for SetConditions when we have good fake for *client.Client
clock: testclock.NewFakeClock(time.Now()),
clock: &clock.FakeClock{},
recorders: make(map[string]record.EventRecorder),
nodeRef: getNodeRef("", testNode),
}

@@ -22,8 +22,8 @@ import (
"strconv"

"contrib.go.opencensus.io/exporter/prometheus"
"github.com/golang/glog"
"go.opencensus.io/stats/view"
"k8s.io/klog/v2"

"k8s.io/node-problem-detector/cmd/options"
"k8s.io/node-problem-detector/pkg/types"
@@ -40,13 +40,13 @@ func NewExporterOrDie(npdo *options.NodeProblemDetectorOptions) types.Exporter {
addr := net.JoinHostPort(npdo.PrometheusServerAddress, strconv.Itoa(npdo.PrometheusServerPort))
pe, err := prometheus.NewExporter(prometheus.Options{})
if err != nil {
klog.Fatalf("Failed to create Prometheus exporter: %v", err)
glog.Fatalf("Failed to create Prometheus exporter: %v", err)
}
go func() {
mux := http.NewServeMux()
mux.Handle("/metrics", pe)
if err := http.ListenAndServe(addr, mux); err != nil {
klog.Fatalf("Failed to start Prometheus scrape endpoint: %v", err)
glog.Fatalf("Failed to start Prometheus scrape endpoint: %v", err)
}
}()
view.RegisterExporter(pe)

@@ -18,7 +18,7 @@ package gce

import (
"cloud.google.com/go/compute/metadata"
"k8s.io/klog/v2"
"github.com/golang/glog"
)

type Metadata struct {
@@ -37,7 +37,7 @@ func (md *Metadata) HasMissingField() bool {

func (md *Metadata) PopulateFromGCE() error {
var err error
klog.Info("Fetching GCE metadata from metadata server")
glog.Info("Fetching GCE metadata from metadata server")
if md.ProjectID == "" {
md.ProjectID, err = metadata.ProjectID()
if err != nil {

@@ -18,19 +18,19 @@ package stackdriverexporter

import (
"encoding/json"
"os"
"io/ioutil"
"path/filepath"
"reflect"
"time"

"contrib.go.opencensus.io/exporter/stackdriver"
monitoredres "contrib.go.opencensus.io/exporter/stackdriver/monitoredresource"
"github.com/golang/glog"
"github.com/spf13/pflag"
"go.opencensus.io/stats/view"
"google.golang.org/api/option"
"k8s.io/klog/v2"

"github.com/avast/retry-go/v4"
"github.com/avast/retry-go"
"k8s.io/node-problem-detector/pkg/exporters"
seconfig "k8s.io/node-problem-detector/pkg/exporters/stackdriver/config"
"k8s.io/node-problem-detector/pkg/types"
@@ -54,7 +54,6 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
metrics.CPULoad15m: "compute.googleapis.com/guest/cpu/load_15m",
metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length",
metrics.DiskBytesUsedID: "compute.googleapis.com/guest/disk/bytes_used",
metrics.DiskPercentUsedID: "compute.googleapis.com/guest/disk/percent_used",
metrics.DiskIOTimeID: "compute.googleapis.com/guest/disk/io_time",
metrics.DiskMergedOpsCountID: "compute.googleapis.com/guest/disk/merged_operation_count",
metrics.DiskOpsBytesID: "compute.googleapis.com/guest/disk/operation_bytes_count",
@@ -67,7 +66,6 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
metrics.MemoryDirtyUsedID: "compute.googleapis.com/guest/memory/dirty_used",
metrics.MemoryPageCacheUsedID: "compute.googleapis.com/guest/memory/page_cache_used",
metrics.MemoryUnevictableUsedID: "compute.googleapis.com/guest/memory/unevictable_used",
metrics.MemoryPercentUsedID: "compute.googleapis.com/guest/memory/percent_used",
metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count",
metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state",
metrics.OSFeatureID: "compute.googleapis.com/guest/system/os_feature_enabled",
@@ -139,12 +137,12 @@ func (se *stackdriverExporter) setupOpenCensusViewExporterOrDie() {
DefaultMonitoringLabels: &globalLabels,
})
if err != nil {
klog.Fatalf("Failed to create Stackdriver OpenCensus view exporter: %v", err)
glog.Fatalf("Failed to create Stackdriver OpenCensus view exporter: %v", err)
}

exportPeriod, err := time.ParseDuration(se.config.ExportPeriod)
if err != nil {
klog.Fatalf("Failed to parse ExportPeriod %q: %v", se.config.ExportPeriod, err)
glog.Fatalf("Failed to parse ExportPeriod %q: %v", se.config.ExportPeriod, err)
}

view.SetReportingPeriod(exportPeriod)
@@ -153,33 +151,33 @@ func (se *stackdriverExporter) setupOpenCensusViewExporterOrDie() {

func (se *stackdriverExporter) populateMetadataOrDie() {
if !se.config.GCEMetadata.HasMissingField() {
klog.Infof("Using GCE metadata specified in the config file: %+v", se.config.GCEMetadata)
glog.Infof("Using GCE metadata specified in the config file: %+v", se.config.GCEMetadata)
return
}

metadataFetchTimeout, err := time.ParseDuration(se.config.MetadataFetchTimeout)
if err != nil {
klog.Fatalf("Failed to parse MetadataFetchTimeout %q: %v", se.config.MetadataFetchTimeout, err)
glog.Fatalf("Failed to parse MetadataFetchTimeout %q: %v", se.config.MetadataFetchTimeout, err)
}

metadataFetchInterval, err := time.ParseDuration(se.config.MetadataFetchInterval)
if err != nil {
klog.Fatalf("Failed to parse MetadataFetchInterval %q: %v", se.config.MetadataFetchInterval, err)
glog.Fatalf("Failed to parse MetadataFetchInterval %q: %v", se.config.MetadataFetchInterval, err)
}

klog.Infof("Populating GCE metadata by querying GCE metadata server.")
glog.Infof("Populating GCE metadata by querying GCE metadata server.")
err = retry.Do(se.config.GCEMetadata.PopulateFromGCE,
retry.Delay(metadataFetchInterval),
retry.Attempts(uint(metadataFetchTimeout/metadataFetchInterval)),
retry.DelayType(retry.FixedDelay))
if err == nil {
klog.Infof("Using GCE metadata: %+v", se.config.GCEMetadata)
glog.Infof("Using GCE metadata: %+v", se.config.GCEMetadata)
return
}
if se.config.PanicOnMetadataFetchFailure {
klog.Fatalf("Failed to populate GCE metadata: %v", err)
glog.Fatalf("Failed to populate GCE metadata: %v", err)
} else {
klog.Errorf("Failed to populate GCE metadata: %v", err)
glog.Errorf("Failed to populate GCE metadata: %v", err)
}
}

@@ -202,7 +200,7 @@ func (clo *commandLineOptions) SetFlags(fs *pflag.FlagSet) {
func NewExporterOrDie(clo types.CommandLineOptions) types.Exporter {
options, ok := clo.(*commandLineOptions)
if !ok {
klog.Fatalf("Wrong type for the command line options of Stackdriver Exporter: %s.", reflect.TypeOf(clo))
glog.Fatalf("Wrong type for the command line options of Stackdriver Exporter: %s.", reflect.TypeOf(clo))
}
if options.configPath == "" {
return nil
@@ -211,17 +209,17 @@ func NewExporterOrDie(clo types.CommandLineOptions) types.Exporter {
se := stackdriverExporter{}

// Apply configurations.
f, err := os.ReadFile(options.configPath)
f, err := ioutil.ReadFile(options.configPath)
if err != nil {
klog.Fatalf("Failed to read configuration file %q: %v", options.configPath, err)
glog.Fatalf("Failed to read configuration file %q: %v", options.configPath, err)
}
err = json.Unmarshal(f, &se.config)
if err != nil {
klog.Fatalf("Failed to unmarshal configuration file %q: %v", options.configPath, err)
glog.Fatalf("Failed to unmarshal configuration file %q: %v", options.configPath, err)
}
se.config.ApplyConfiguration()

klog.Infof("Starting Stackdriver exporter %s", options.configPath)
glog.Infof("Starting Stackdriver exporter %s", options.configPath)

se.populateMetadataOrDie()
se.setupOpenCensusViewExporterOrDie()

@@ -1,4 +1,3 @@
//go:build !disable_stackdriver_exporter
// +build !disable_stackdriver_exporter

/*

@@ -17,13 +17,9 @@ limitations under the License.
package healthchecker

import (
"context"
"net/http"
"os/exec"
"strings"
"time"

"k8s.io/klog/v2"
"github.com/golang/glog"
"k8s.io/node-problem-detector/cmd/healthchecker/options"
"k8s.io/node-problem-detector/pkg/healthchecker/types"
)
@@ -40,7 +36,6 @@ type healthChecker struct {
crictlPath string
healthCheckTimeout time.Duration
coolDownTime time.Duration
loopBackTime time.Duration
logPatternsToCheck map[string]int
}

@@ -53,7 +48,6 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
healthCheckTimeout: hco.HealthCheckTimeout,
coolDownTime: hco.CoolDownTime,
service: hco.Service,
loopBackTime: hco.LoopBackTime,
logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
}
hc.healthCheckFunc = getHealthCheckFunc(hco)
@@ -69,26 +63,24 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
if err != nil {
return healthy, err
}
logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.loopBackTime, hc.logPatternsToCheck)
logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.logPatternsToCheck)
if err != nil {
return logPatternHealthy, err
}
if healthy && logPatternHealthy {
return true, nil
}

// The service is unhealthy.
// Attempt repair based on flag.
if hc.enableRepair {
// repair if the service has been up for the cool down period.
uptime, err := hc.uptimeFunc()
if err != nil {
klog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
return false, nil
glog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
}
klog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
glog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
if uptime > hc.coolDownTime {
klog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
glog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
hc.repairFunc()
}
}
@@ -97,21 +89,18 @@ func (hc *healthChecker) CheckHealth() (bool, error) {

// logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
// Returns true if the pattern is empty or does not exist logThresholdCount times since start of service, false otherwise.
func logPatternHealthCheck(service string, loopBackTime time.Duration, logPatternsToCheck map[string]int) (bool, error) {
func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
if len(logPatternsToCheck) == 0 {
return true, nil
}
uptimeFunc := getUptimeFunc(service)
klog.Infof("Getting uptime for service: %v\n", service)
uptime, err := uptimeFunc()
if err != nil {
klog.Warningf("Failed to get the uptime: %+v", err)
return true, err
}

logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
if loopBackTime > 0 && uptime > loopBackTime {
logStartTime = time.Now().Add(-loopBackTime).Format(types.LogParsingTimeLayout)
if err != nil {
return true, err
}
for pattern, count := range logPatternsToCheck {
healthy, err := checkForPattern(service, logStartTime, pattern, count)
@@ -121,65 +110,3 @@ func logPatternHealthCheck(service string, loopBackTime time.Duration, logPatter
}
return true, nil
}

// healthCheckEndpointOKFunc returns a function to check the status of an http endpoint
func healthCheckEndpointOKFunc(endpoint string, timeout time.Duration) func() (bool, error) {
return func() (bool, error) {
httpClient := http.Client{Timeout: timeout}
response, err := httpClient.Get(endpoint)
if err != nil || response.StatusCode != http.StatusOK {
return false, nil
}
return true, nil
}
}

// getHealthCheckFunc returns the health check function based on the component.
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
switch hco.Component {
case types.KubeletComponent:
return healthCheckEndpointOKFunc(types.KubeletHealthCheckEndpoint(), hco.HealthCheckTimeout)
case types.KubeProxyComponent:
return healthCheckEndpointOKFunc(types.KubeProxyHealthCheckEndpoint(), hco.HealthCheckTimeout)
case types.DockerComponent:
return func() (bool, error) {
if _, err := execCommand(hco.HealthCheckTimeout, getDockerPath(), "ps"); err != nil {
return false, nil
}
return true, nil
}
case types.CRIComponent:
return func() (bool, error) {
_, err := execCommand(
hco.HealthCheckTimeout,
hco.CriCtlPath,
"--timeout="+hco.CriTimeout.String(),
"--runtime-endpoint="+hco.CriSocketPath,
"pods",
"--latest",
)
if err != nil {
return false, nil
}
return true, nil
}
default:
klog.Warningf("Unsupported component: %v", hco.Component)
}

return nil
}

// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs.
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
cmd := exec.CommandContext(ctx, command, args...)
out, err := cmd.CombinedOutput()
if err != nil {
klog.Infof("command %v failed: %v, %s\n", cmd, err, string(out))
return "", err
}

return strings.TrimSuffix(string(out), "\n"), nil
}

@@ -1,49 +0,0 @@
/*
Copyright 2023 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package healthchecker

import (
"runtime"
"time"

"k8s.io/klog/v2"
"k8s.io/node-problem-detector/cmd/healthchecker/options"
)

// getUptimeFunc returns the time for which the given service has been running.
func getUptimeFunc(service string) func() (time.Duration, error) {
klog.Fatalf("getUptimeFunc is not supported in %s", runtime.GOOS)
return func() (time.Duration, error) { return time.Second, nil }
}

// getRepairFunc returns the repair function based on the component.
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
klog.Fatalf("getRepairFunc is not supported in %s", runtime.GOOS)
return func() {}
}

// checkForPattern returns (true, nil) if logPattern occurs less than logCountThreshold number of times since last
// service restart. (false, nil) otherwise.
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
klog.Fatalf("checkForPattern is not supported in %s", runtime.GOOS)
return false, nil
}

func getDockerPath() string {
klog.Fatalf("getDockerPath is not supported in %s", runtime.GOOS)
return ""
}

@@ -17,12 +17,15 @@ limitations under the License.
package healthchecker

import (
"context"
"errors"
"net/http"
"os/exec"
"strconv"
"strings"
"time"

"k8s.io/klog/v2"
"github.com/golang/glog"

"k8s.io/node-problem-detector/cmd/healthchecker/options"
"k8s.io/node-problem-detector/pkg/healthchecker/types"
@@ -56,11 +59,6 @@ func getUptimeFunc(service string) func() (time.Duration, error) {

// getRepairFunc returns the repair function based on the component.
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
// Use `systemctl kill` instead of `systemctl restart` for the repair function.
// We start to rely on the kernel message difference for the two commands to
// indicate if the component restart is due to an administrative plan (restart)
// or a system issue that needs repair (kill).
// See https://github.com/kubernetes/node-problem-detector/issues/847.
switch hco.Component {
case types.DockerComponent:
// Use "docker ps" for docker health check. Not using crictl for docker to remove
@@ -77,6 +75,49 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
}
}

// getHealthCheckFunc returns the health check function based on the component.
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
switch hco.Component {
case types.KubeletComponent:
return func() (bool, error) {
httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
if err != nil || response.StatusCode != http.StatusOK {
return false, nil
}
return true, nil
}
case types.DockerComponent:
return func() (bool, error) {
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
return false, nil
}
return true, nil
}
case types.CRIComponent:
return func() (bool, error) {
if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
return false, nil
}
return true, nil
}
}
return nil
}

// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs.
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
cmd := exec.CommandContext(ctx, command, args...)
out, err := cmd.Output()
if err != nil {
glog.Infof("command %v failed: %v, %v\n", cmd, err, out)
return "", err
}
return strings.TrimSuffix(string(out), "\n"), nil
}

// checkForPattern returns (true, nil) if logPattern occurs less than logCountThreshold number of times since last
// service restart. (false, nil) otherwise.
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
@@ -95,12 +136,8 @@ func checkForPattern(service, logStartTime, logPattern string, logCountThreshold
return true, err
}
if occurrences >= logCountThreshold {
klog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
return false, nil
}
return true, nil
}

func getDockerPath() string {
return "docker"
}

@@ -20,7 +20,6 @@ import (
"testing"
"time"

"k8s.io/node-problem-detector/cmd/healthchecker/options"
"k8s.io/node-problem-detector/pkg/healthchecker/types"
)

@@ -120,38 +119,3 @@ func TestHealthCheck(t *testing.T) {
})
}
}

func TestComponentsSupported(t *testing.T) {
for _, tc := range []struct {
description string
component string
}{
{
description: "Kube Proxy should be supported",
component: types.KubeProxyComponent,
},
{
description: "Kubelet should be supported",
component: types.KubeletComponent,
},
{
description: "Docker should be supported",
component: types.DockerComponent,
},
{
description: "CRI should be supported",
component: types.CRIComponent,
},
} {
t.Run(tc.description, func(t *testing.T) {
checkFunc := getHealthCheckFunc(&options.HealthCheckerOptions{
Component: tc.component,
})
if checkFunc == nil {
t.Errorf("component %v should be supported", tc.component)
}

})
}

}

@@ -18,12 +18,13 @@ package healthchecker

import (
	"fmt"
	"net/http"
	"os/exec"
	"strconv"
	"strings"
	"time"

	"k8s.io/klog/v2"
	"github.com/golang/glog"

	"k8s.io/node-problem-detector/cmd/healthchecker/options"
	"k8s.io/node-problem-detector/pkg/healthchecker/types"
@@ -33,19 +34,12 @@ import (
// getUptimeFunc returns the time for which the given service has been running.
func getUptimeFunc(service string) func() (time.Duration, error) {
	return func() (time.Duration, error) {
		// To calculate uptime more efficiently, we first try to grab the process id and read its start time.
		// If the process id does not exist (meaning the service is not running for some reason), we resort to
		// using the WinEvent Log Objects to find the time when the service log last entered the running state.
		// In addition to filtering by logname=system we also filter on event id=7036 to reduce the number of
		// entries the next command Where-Object has to look through; id 7036 messages indicate a stopped or running service.
		// Using the WinEvent Log Objects to find the Service logs' time when the Service last entered running state.
		// The powershell command formats the TimeCreated of the event log in RFC1123Pattern.
		// However, because the time library parser does not recognize the ',' in this RFC1123Pattern format,
		// it is manually removed before parsing it using the UptimeTimeLayout.
		getTimeCreatedCmd := `$ProcessId = (Get-WMIObject -Class Win32_Service -Filter "Name='` + service + `'" | Select-Object -ExpandProperty ProcessId);` +
			`if ([string]::IsNullOrEmpty($ProcessId) -or $ProcessId -eq 0) { (Get-WinEvent -FilterHashtable @{logname='system';id=7036} ` +
			`| Where-Object {$_.Message -match '.*(` + service + `).*(running).*'} | Select-Object -Property TimeCreated -First 1 | ` +
			`foreach {$_.TimeCreated.ToUniversalTime().ToString('R')} | Out-String).Trim() } else { (Get-Process -Id $ProcessId | Select starttime | ` +
			`foreach {$_.starttime.ToUniversalTime().ToString('R')} | Out-String).Trim() }`
		getTimeCreatedCmd := "(Get-WinEvent -Logname System | Where-Object {$_.Message -Match '.*(" + service +
			").*(running).*'} | Select-Object -Property TimeCreated -First 1 | foreach {$_.TimeCreated.ToString('R')} | Out-String).Trim()"
		out, err := powershell(getTimeCreatedCmd)
		if err != nil {
			return time.Duration(0), err
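As a side note, here is a minimal sketch of the comma-stripping parse described in the comment above, assuming the Windows UptimeTimeLayout of "Mon 02 Jan 2006 15:04:05 MST" defined elsewhere in this diff and a sample PowerShell 'R'-formatted timestamp:

```go
// Parse a PowerShell 'R' (RFC1123) timestamp by removing the ',' that
// Go's reference layout below does not contain.
package main

import (
	"fmt"
	"strings"
	"time"
)

const uptimeTimeLayout = "Mon 02 Jan 2006 15:04:05 MST" // Windows UptimeTimeLayout from this diff

func main() {
	out := "Mon, 02 Jan 2006 15:04:05 GMT" // sample output of ToString('R')
	created, err := time.Parse(uptimeTimeLayout, strings.Replace(out, ",", "", 1))
	if err != nil {
		fmt.Println("parse failed:", err)
		return
	}
	fmt.Println("service running since:", created, "uptime:", time.Since(created))
}
```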

@@ -70,6 +64,49 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
	}
}

// getHealthCheckFunc returns the health check function based on the component.
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
	switch hco.Component {
	case types.KubeletComponent:
		return healthCheckEndpointOKFunc(types.KubeletHealthCheckEndpoint, hco.HealthCheckTimeout)
	case types.KubeProxyComponent:
		return healthCheckEndpointOKFunc(types.KubeProxyHealthCheckEndpoint, hco.HealthCheckTimeout)
	case types.DockerComponent:
		return func() (bool, error) {
			if _, err := execCommand("docker.exe", "ps"); err != nil {
				return false, nil
			}
			return true, nil
		}
	case types.CRIComponent:
		return func() (bool, error) {
			if _, err := execCommand(hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
				return false, nil
			}
			return true, nil
		}
	}
	return nil
}

// healthCheckEndpointOKFunc returns a function to check the status of an http endpoint
func healthCheckEndpointOKFunc(endpoint string, timeout time.Duration) func() (bool, error) {
	return func() (bool, error) {
		httpClient := http.Client{Timeout: timeout}
		response, err := httpClient.Get(endpoint)
		if err != nil || response.StatusCode != http.StatusOK {
			return false, nil
		}
		return true, nil
	}
}

// execCommand creates a new process, executes the command, and returns the (output, error) from command.
func execCommand(command string, args ...string) (string, error) {
	cmd := util.Exec(command, args...)
	return extractCommandOutput(cmd)
}

// powershell executes the arguments in powershell process and returns (output, error) from command.
func powershell(args ...string) (string, error) {
	cmd := util.Powershell(args...)

@@ -80,7 +117,7 @@ func powershell(args ...string) (string, error) {
func extractCommandOutput(cmd *exec.Cmd) (string, error) {
	out, err := cmd.Output()
	if err != nil {
		klog.Infof("command %v failed: %v, %v\n", cmd, err, out)
		glog.Infof("command %v failed: %v, %v\n", cmd, err, out)
		return "", err
	}
	return strings.TrimSuffix(string(out), "\r\n"), nil

@@ -101,12 +138,8 @@ func checkForPattern(service, logStartTime, logPattern string, logCountThreshold
		return true, err
	}
	if occurrences >= logCountThreshold {
		klog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
		glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
		return false, nil
	}
	return true, nil
}

func getDockerPath() string {
	return "docker.exe"
}
@@ -18,8 +18,6 @@ package types

import (
	"fmt"
	"net"
	"os"
	"sort"
	"strconv"
	"strings"

@@ -27,8 +25,6 @@ import (
)

const (
	DefaultLoopBackTime       = 0 * time.Minute
	DefaultCriTimeout         = 2 * time.Second
	DefaultCoolDownTime       = 2 * time.Minute
	DefaultHealthCheckTimeout = 10 * time.Second
	CmdTimeout                = 10 * time.Second

@@ -40,57 +36,12 @@ const (
	ContainerdService  = "containerd"
	KubeProxyComponent = "kube-proxy"

	KubeletHealthCheckEndpoint   = "http://127.0.0.1:10248/healthz"
	KubeProxyHealthCheckEndpoint = "http://127.0.0.1:10256/healthz"

	LogPatternFlagSeparator = ":"
	hostAddressKey          = "HOST_ADDRESS"
	kubeletPortKey          = "KUBELET_PORT"
	kubeProxyPortKey        = "KUBEPROXY_PORT"

	defaultHostAddress   = "localhost"
	defaultKubeletPort   = "10248"
	defaultKubeproxyPort = "10256"
)

var (
	kubeletHealthCheckEndpoint   string
	kubeProxyHealthCheckEndpoint string
)

func init() {
	setKubeEndpoints()
}

func setKubeEndpoints() {
	var o string

	hostAddress := defaultHostAddress
	kubeletPort := defaultKubeletPort
	kubeProxyPort := defaultKubeproxyPort

	o = os.Getenv(hostAddressKey)
	if o != "" {
		hostAddress = o
	}
	o = os.Getenv(kubeletPortKey)
	if o != "" {
		kubeletPort = o
	}
	o = os.Getenv(kubeProxyPortKey)
	if o != "" {
		kubeProxyPort = o
	}

	kubeletHealthCheckEndpoint = fmt.Sprintf("http://%s/healthz", net.JoinHostPort(hostAddress, kubeletPort))
	kubeProxyHealthCheckEndpoint = fmt.Sprintf("http://%s/healthz", net.JoinHostPort(hostAddress, kubeProxyPort))
}

func KubeProxyHealthCheckEndpoint() string {
	return kubeProxyHealthCheckEndpoint
}

func KubeletHealthCheckEndpoint() string {
	return kubeletHealthCheckEndpoint
}
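One detail worth calling out in setKubeEndpoints above: net.JoinHostPort brackets IPv6 literals automatically, which is what the IPv6 test case later in this diff relies on. A standalone sketch:

```go
// net.JoinHostPort adds [] around IPv6 hosts, so the endpoint format
// string needs no special-casing per address family.
package main

import (
	"fmt"
	"net"
)

func main() {
	for _, host := range []string{"localhost", "10.0.5.4", "80:f4:16::1"} {
		fmt.Printf("http://%s/healthz\n", net.JoinHostPort(host, "10248"))
	}
	// Output:
	// http://localhost:10248/healthz
	// http://10.0.5.4:10248/healthz
	// http://[80:f4:16::1]:10248/healthz
}
```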

type HealthChecker interface {
	CheckHealth() (bool, error)
}
@@ -0,0 +1,23 @@
/*
Copyright 2021 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package types

const (
	DefaultCriCtl        = "/usr/bin/crictl"
	DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
	UptimeTimeLayout     = "Mon 2006-01-02 15:04:05 MST"
)
@@ -98,101 +98,3 @@ func TestLogPatternFlag(t *testing.T) {
		})
	}
}

func TestKubeEndpointConfiguration(t *testing.T) {
	testCases := []struct {
		name                      string
		envConfig                 map[string]string
		expectedKubeletEndpoint   string
		expectedKubeProxyEndpoint string
	}{
		{
			name:                      "no overrides supplied",
			envConfig:                 map[string]string{},
			expectedKubeletEndpoint:   "http://localhost:10248/healthz",
			expectedKubeProxyEndpoint: "http://localhost:10256/healthz",
		},
		{
			name: "HOST_ADDRESS override supplied",
			envConfig: map[string]string{
				"HOST_ADDRESS": "samplehost.testdomain.com",
			},
			expectedKubeletEndpoint:   "http://samplehost.testdomain.com:10248/healthz",
			expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:10256/healthz",
		},
		{
			name: "HOST_ADDRESS override supplied with IPv4",
			envConfig: map[string]string{
				"HOST_ADDRESS": "10.0.5.4",
			},
			expectedKubeletEndpoint:   "http://10.0.5.4:10248/healthz",
			expectedKubeProxyEndpoint: "http://10.0.5.4:10256/healthz",
		},
		{
			name: "HOST_ADDRESS override supplied with IPv6",
			envConfig: map[string]string{
				"HOST_ADDRESS": "80:f4:16::1",
			},
			expectedKubeletEndpoint:   "http://[80:f4:16::1]:10248/healthz",
			expectedKubeProxyEndpoint: "http://[80:f4:16::1]:10256/healthz",
		},
		{
			name: "KUBELET_PORT override supplied",
			envConfig: map[string]string{
				"KUBELET_PORT": "12345",
			},
			expectedKubeletEndpoint:   "http://localhost:12345/healthz",
			expectedKubeProxyEndpoint: "http://localhost:10256/healthz",
		},
		{
			name: "KUBEPROXY_PORT override supplied",
			envConfig: map[string]string{
				"KUBEPROXY_PORT": "12345",
			},
			expectedKubeletEndpoint:   "http://localhost:10248/healthz",
			expectedKubeProxyEndpoint: "http://localhost:12345/healthz",
		},
		{
			name: "HOST_ADDRESS and KUBELET_PORT override supplied",
			envConfig: map[string]string{
				"HOST_ADDRESS": "samplehost.testdomain.com",
				"KUBELET_PORT": "12345",
			},
			expectedKubeletEndpoint:   "http://samplehost.testdomain.com:12345/healthz",
			expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:10256/healthz",
		},
		{
			name: "HOST_ADDRESS and KUBEPROXY_PORT override supplied",
			envConfig: map[string]string{
				"HOST_ADDRESS":   "samplehost.testdomain.com",
				"KUBEPROXY_PORT": "12345",
			},
			expectedKubeletEndpoint:   "http://samplehost.testdomain.com:10248/healthz",
			expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:12345/healthz",
		},
		{
			name: "HOST_ADDRESS, KUBELET_PORT and KUBEPROXY_PORT override supplied",
			envConfig: map[string]string{
				"HOST_ADDRESS":   "10.0.10.1",
				"KUBELET_PORT":   "12345",
				"KUBEPROXY_PORT": "12346",
			},
			expectedKubeletEndpoint:   "http://10.0.10.1:12345/healthz",
			expectedKubeProxyEndpoint: "http://10.0.10.1:12346/healthz",
		},
	}
	for _, test := range testCases {
		t.Run(test.name, func(t *testing.T) {
			for key, val := range test.envConfig {
				t.Setenv(key, val)
			}
			setKubeEndpoints()

			kubeProxyHCEndpoint := KubeProxyHealthCheckEndpoint()
			kubeletHCEndpoint := KubeletHealthCheckEndpoint()

			assert.Equal(t, test.expectedKubeProxyEndpoint, kubeProxyHCEndpoint)
			assert.Equal(t, test.expectedKubeletEndpoint, kubeletHCEndpoint)
		})
	}
}
@@ -1,25 +0,0 @@
//go:build unix

/*
Copyright 2021 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package types

const (
	DefaultCriCtl        = "/usr/bin/crictl"
	DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
	UptimeTimeLayout     = "Mon 2006-01-02 15:04:05 MST"
)
@@ -17,7 +17,7 @@ limitations under the License.
package types

const (
	DefaultCriCtl        = "C:/etc/kubernetes/node/bin/crictl.exe"
	DefaultCriCtl        = "C:/node/crictl.exe"
	DefaultCriSocketPath = "npipe:////./pipe/containerd-containerd"
	UptimeTimeLayout     = "Mon 02 Jan 2006 15:04:05 MST"
	LogParsingTimeFormat = "yyyy-MM-dd HH:mm:ss"
@@ -1,4 +1,3 @@
//go:build journald
// +build journald

/*

@@ -23,7 +22,7 @@ import (
	"fmt"
	"time"

	"k8s.io/utils/clock"
	"k8s.io/apimachinery/pkg/util/clock"

	"k8s.io/node-problem-detector/cmd/logcounter/options"
	"k8s.io/node-problem-detector/pkg/logcounter/types"

@@ -40,11 +39,10 @@ const (
)

type logCounter struct {
	logCh         <-chan *systemtypes.Log
	buffer        systemlogmonitor.LogBuffer
	pattern       string
	revertPattern string
	clock         clock.Clock
	logCh   <-chan *systemtypes.Log
	buffer  systemlogmonitor.LogBuffer
	pattern string
	clock   clock.Clock
}

func NewJournaldLogCounter(options *options.LogCounterOptions) (types.LogCounter, error) {

@@ -60,11 +58,10 @@ func NewJournaldLogCounter(options *options.LogCounterOptions) (types.LogCounter
		return nil, fmt.Errorf("error watching journald: %v", err)
	}
	return &logCounter{
		logCh:         logCh,
		buffer:        systemlogmonitor.NewLogBuffer(bufferSize),
		pattern:       options.Pattern,
		revertPattern: options.RevertPattern,
		clock:         clock.RealClock{},
		logCh:   logCh,
		buffer:  systemlogmonitor.NewLogBuffer(bufferSize),
		pattern: options.Pattern,
		clock:   clock.RealClock{},
	}, nil
}

@@ -86,9 +83,6 @@ func (e *logCounter) Count() (count int, err error) {
		if len(e.buffer.Match(e.pattern)) != 0 {
			count++
		}
		if e.revertPattern != "" && len(e.buffer.Match(e.revertPattern)) != 0 {
			count--
		}
	case <-e.clock.After(timeout):
		// Don't block forever if we do not get any new messages
		return
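The Count logic above nets out occurrences: each pattern match increments the counter and, on the master side, each revertPattern match decrements it. A minimal sketch of that counting rule over a plain string slice; the helper, strings.Contains matching, and sample logs are illustrative simplifications of the regexp-over-ring-buffer matching the real code uses:

```go
// Net occurrence counting with a pattern and a revert pattern, as in
// logCounter.Count on master.
package main

import (
	"fmt"
	"strings"
)

func netCount(logs []string, pattern, revertPattern string) int {
	count := 0
	for _, line := range logs {
		if strings.Contains(line, pattern) {
			count++
		}
		if revertPattern != "" && strings.Contains(line, revertPattern) {
			count--
		}
	}
	return count
}

func main() {
	logs := []string{"service failed", "service failed", "service recovered"}
	// Two failures minus one recovery -> net count of 1.
	fmt.Println(netCount(logs, "failed", "recovered"))
}
```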
@@ -1,4 +1,3 @@
//go:build journald
// +build journald

/*

@@ -23,16 +22,16 @@ import (
	"testing"
	"time"

	testclock "k8s.io/utils/clock/testing"
	"k8s.io/apimachinery/pkg/util/clock"

	"k8s.io/node-problem-detector/pkg/logcounter/types"
	"k8s.io/node-problem-detector/pkg/systemlogmonitor"
	systemtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
)

func NewTestLogCounter(pattern string, startTime time.Time) (types.LogCounter, *testclock.FakeClock, chan *systemtypes.Log) {
func NewTestLogCounter(pattern string, startTime time.Time) (types.LogCounter, *clock.FakeClock, chan *systemtypes.Log) {
	logCh := make(chan *systemtypes.Log)
	clock := testclock.NewFakeClock(startTime)
	clock := clock.NewFakeClock(startTime)
	return &logCounter{
		logCh:  logCh,
		buffer: systemlogmonitor.NewLogBuffer(bufferSize),
@@ -19,7 +19,7 @@ package problemdaemon
import (
	"fmt"

	"k8s.io/klog/v2"
	"github.com/golang/glog"

	"k8s.io/node-problem-detector/pkg/types"
)

@@ -58,7 +58,7 @@ func NewProblemDaemons(monitorConfigPaths types.ProblemDaemonConfigPathMap) []ty
	for _, config := range *configs {
		if _, ok := problemDaemonMap[config]; ok {
			// Skip the config if it's duplicated.
			klog.Warningf("Duplicated problem daemon configuration %q", config)
			glog.Warningf("Duplicated problem daemon configuration %q", config)
			continue
		}
		problemDaemonMap[config] = handlers[problemDaemonType].CreateProblemDaemonOrDie(config)
@@ -17,17 +17,16 @@ limitations under the License.
package problemdetector

import (
	"context"
	"fmt"

	"k8s.io/klog/v2"
	"github.com/golang/glog"

	"k8s.io/node-problem-detector/pkg/types"
)

// ProblemDetector collects statuses from all problem daemons, updates node conditions, and sends node events.
type ProblemDetector interface {
	Run(context.Context) error
	Run(termCh <-chan error) error
}

type problemDetector struct {

@@ -45,7 +44,7 @@ func NewProblemDetector(monitors []types.Monitor, exporters []types.Exporter) Pr
}

// Run starts the problem detector.
func (p *problemDetector) Run(ctx context.Context) error {
func (p *problemDetector) Run(termCh <-chan error) error {
	// Start the log monitors one by one.
	var chans []<-chan *types.Status
	failureCount := 0

@@ -53,7 +52,7 @@ func (p *problemDetector) Run(ctx context.Context) error {
		ch, err := m.Start()
		if err != nil {
			// Do not return error and keep on trying the following config files.
			klog.Errorf("Failed to start problem daemon %v: %v", m, err)
			glog.Errorf("Failed to start problem daemon %v: %v", m, err)
			failureCount++
			continue
		}

@@ -74,11 +73,11 @@ func (p *problemDetector) Run(ctx context.Context) error {
	}()

	ch := groupChannel(chans)
	klog.Info("Problem detector started")
	glog.Info("Problem detector started")

	for {
		select {
		case <-ctx.Done():
		case <-termCh:
			return nil
		case status := <-ch:
			for _, exporter := range p.exporters {
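For illustration, the master-side shutdown path boils down to a select loop on ctx.Done(); a minimal self-contained sketch of that pattern, with placeholder channel contents rather than real problem-detector types:

```go
// A worker loop that drains a status channel until its context is
// cancelled, mirroring the shape of the Run loop's select on master.
package main

import (
	"context"
	"fmt"
	"time"
)

func run(ctx context.Context, ch <-chan string) {
	for {
		select {
		case <-ctx.Done():
			fmt.Println("stopping:", ctx.Err())
			return
		case status := <-ch:
			fmt.Println("exporting status:", status)
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()
	ch := make(chan string)
	go func() { ch <- "NodeReady" }()
	run(ctx, ch)
}
```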
@@ -17,7 +17,6 @@ limitations under the License.
package problemdetector

import (
	"context"
	"testing"

	"k8s.io/node-problem-detector/pkg/types"

@@ -25,7 +24,7 @@ import (

func TestEmpty(t *testing.T) {
	pd := NewProblemDetector([]types.Monitor{}, []types.Exporter{})
	if err := pd.Run(context.Background()); err == nil {
	if err := pd.Run(nil); err == nil {
		t.Error("expected error when running an empty problem detector")
	}
}
@@ -21,7 +21,7 @@ import (
	"fmt"
	"sync"

	"k8s.io/klog/v2"
	"github.com/golang/glog"

	"k8s.io/node-problem-detector/pkg/util/metrics"
)

@@ -56,7 +56,7 @@ func NewProblemMetricsManagerOrDie() *ProblemMetricsManager {
		metrics.Sum,
		[]string{"reason"})
	if err != nil {
		klog.Fatalf("Failed to create problem_counter metric: %v", err)
		glog.Fatalf("Failed to create problem_counter metric: %v", err)
	}

	pmm.problemGauge, err = metrics.NewInt64Metric(

@@ -67,7 +67,7 @@ func NewProblemMetricsManagerOrDie() *ProblemMetricsManager {
		metrics.LastValue,
		[]string{"type", "reason"})
	if err != nil {
		klog.Fatalf("Failed to create problem_gauge metric: %v", err)
		glog.Fatalf("Failed to create problem_gauge metric: %v", err)
	}

	pmm.problemTypeToReason = make(map[string]string)
@@ -37,8 +37,7 @@ with new rule definition:
  "type": "temporary/permanent",
  "condition": "NodeConditionOfPermanentIssue",
  "reason": "CamelCaseShortReason",
  "pattern": "regexp matching the issue in the log",
  "patternGeneratedMessageSuffix": "Please check the network connectivity and ensure that all required services are running. For more details, see our documentation at https://example.com/docs/troubleshooting."
  "message": "regexp matching the issue in the log"
}
```
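To make the master-side patternGeneratedMessageSuffix field above concrete: generateMessage (later in this diff) joins the matched log lines with newlines and, when a suffix is configured, appends it after a "; " separator. A minimal sketch of that formatting, using strings.Join as a stand-in for the repository's concatLogs helper:

```go
// Message formatting as used by generateMessage on master: matched log
// lines joined by "\n", then "; <suffix>" if a suffix is configured.
package main

import (
	"fmt"
	"strings"
)

func generateMessage(messages []string, suffix string) string {
	logMessage := strings.Join(messages, "\n")
	if suffix != "" {
		return fmt.Sprintf("%s; %s", logMessage, suffix)
	}
	return logMessage
}

func main() {
	msgs := []string{"First log message", "Second log message"}
	fmt.Println(generateMessage(msgs, "refer www.foo.com/docs for playbook on how to fix the issue"))
	// Prints the two lines followed by "; refer www.foo.com/docs ..."
}
```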
@@ -46,7 +46,7 @@ type MonitorConfig struct {
	EnableMetricsReporting *bool `json:"metricsReporting,omitempty"`
}

// ApplyDefaultConfiguration applies default configurations.
// ApplyConfiguration applies default configurations.
func (mc *MonitorConfig) ApplyDefaultConfiguration() {
	if mc.BufferSize == 0 {
		mc.BufferSize = defaultBufferSize
@@ -18,16 +18,16 @@ package systemlogmonitor

import (
	"encoding/json"
	"fmt"
	"os"
	"io/ioutil"
	"time"

	"k8s.io/klog/v2"
	"github.com/golang/glog"

	"k8s.io/node-problem-detector/pkg/problemdaemon"
	"k8s.io/node-problem-detector/pkg/problemmetrics"
	"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers"
	watchertypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types"
	logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
	systemlogtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
	"k8s.io/node-problem-detector/pkg/types"
	"k8s.io/node-problem-detector/pkg/util"
@@ -50,7 +50,7 @@ type logMonitor struct {
	buffer     LogBuffer
	config     MonitorConfig
	conditions []types.Condition
	logCh      <-chan *systemlogtypes.Log
	logCh      <-chan *logtypes.Log
	output     chan *types.Status
	tomb       *tomb.Tomb
}

@@ -62,21 +62,21 @@ func NewLogMonitorOrDie(configPath string) types.Monitor {
		tomb: tomb.NewTomb(),
	}

	f, err := os.ReadFile(configPath)
	f, err := ioutil.ReadFile(configPath)
	if err != nil {
		klog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
		glog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
	}
	err = json.Unmarshal(f, &l.config)
	if err != nil {
		klog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
		glog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
	}
	// Apply default configurations
	(&l.config).ApplyDefaultConfiguration()
	err = l.config.ValidateRules()
	if err != nil {
		klog.Fatalf("Failed to validate %s matching rules %+v: %v", l.configPath, l.config.Rules, err)
		glog.Fatalf("Failed to validate %s matching rules %+v: %v", l.configPath, l.config.Rules, err)
	}
	klog.Infof("Finish parsing log monitor config file %s: %+v", l.configPath, l.config)
	glog.Infof("Finish parsing log monitor config file %s: %+v", l.configPath, l.config)

	l.watcher = logwatchers.GetLogWatcherOrDie(l.config.WatcherConfig)
	l.buffer = NewLogBuffer(l.config.BufferSize)
@@ -96,19 +96,19 @@ func initializeProblemMetricsOrDie(rules []systemlogtypes.Rule) {
		if rule.Type == types.Perm {
			err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, false)
			if err != nil {
				klog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
				glog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
					rule.Condition, rule.Reason, err)
			}
		}
		err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 0)
		if err != nil {
			klog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
			glog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
		}
	}
}

func (l *logMonitor) Start() (<-chan *types.Status, error) {
	klog.Infof("Start log monitor %s", l.configPath)
	glog.Infof("Start log monitor %s", l.configPath)
	var err error
	l.logCh, err = l.watcher.Watch()
	if err != nil {

@@ -119,7 +119,7 @@ func (l *logMonitor) Start() (<-chan *types.Status, error) {
}

func (l *logMonitor) Stop() {
	klog.Infof("Stop log monitor %s", l.configPath)
	glog.Infof("Stop log monitor %s", l.configPath)
	l.tomb.Stop()
}

@@ -134,20 +134,20 @@ func (l *logMonitor) monitorLoop() {
		select {
		case log, ok := <-l.logCh:
			if !ok {
				klog.Errorf("Log channel closed: %s", l.configPath)
				glog.Errorf("Log channel closed: %s", l.configPath)
				return
			}
			l.parseLog(log)
		case <-l.tomb.Stopping():
			l.watcher.Stop()
			klog.Infof("Log monitor stopped: %s", l.configPath)
			glog.Infof("Log monitor stopped: %s", l.configPath)
			return
		}
	}
}
// parseLog parses one log line.
func (l *logMonitor) parseLog(log *systemlogtypes.Log) {
func (l *logMonitor) parseLog(log *logtypes.Log) {
	// Once there is new log, log monitor will push it into the log buffer and try
	// to match each rule. If any rule is matched, log monitor will report a status.
	l.buffer.Push(log)

@@ -157,16 +157,16 @@ func (l *logMonitor) parseLog(log *systemlogtypes.Log) {
			continue
		}
		status := l.generateStatus(matched, rule)
		klog.Infof("New status generated: %+v", status)
		glog.Infof("New status generated: %+v", status)
		l.output <- status
	}
}

// generateStatus generates status from the logs.
func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogtypes.Rule) *types.Status {
func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Rule) *types.Status {
	// We use the timestamp of the first log line as the timestamp of the status.
	timestamp := logs[0].Timestamp
	message := generateMessage(logs, rule.PatternGeneratedMessageSuffix)
	message := generateMessage(logs)
	var events []types.Event
	var changedConditions []*types.Condition
	if rule.Type == types.Temp {

@@ -192,7 +192,6 @@ func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogty
				condition.Type,
				types.True,
				rule.Reason,
				message,
				timestamp,
			))
		}
@@ -208,14 +207,14 @@ func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogty
	for _, event := range events {
		err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(event.Reason, 1)
		if err != nil {
			klog.Errorf("Failed to update problem counter metrics for %q: %v", event.Reason, err)
			glog.Errorf("Failed to update problem counter metrics for %q: %v", event.Reason, err)
		}
	}
	for _, condition := range changedConditions {
		err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(
			condition.Type, condition.Reason, condition.Status == types.True)
		if err != nil {
			klog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
			glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
				condition.Type, condition.Reason, err)
		}
	}
@@ -233,7 +232,7 @@ func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogty
func (l *logMonitor) initializeStatus() {
	// Initialize the default node conditions
	l.conditions = initialConditions(l.config.DefaultConditions)
	klog.Infof("Initialize condition generated: %+v", l.conditions)
	glog.Infof("Initialize condition generated: %+v", l.conditions)
	// Update the initial status
	l.output <- &types.Status{
		Source: l.config.Source,

@@ -251,14 +250,10 @@ func initialConditions(defaults []types.Condition) []types.Condition {
	return conditions
}

func generateMessage(logs []*systemlogtypes.Log, patternGeneratedMessageSuffix string) string {
func generateMessage(logs []*logtypes.Log) string {
	messages := []string{}
	for _, log := range logs {
		messages = append(messages, log.Message)
	}
	logMessage := concatLogs(messages)
	if patternGeneratedMessageSuffix != "" {
		return fmt.Sprintf("%s; %s", logMessage, patternGeneratedMessageSuffix)
	}
	return logMessage
	return concatLogs(messages)
}
@@ -26,7 +26,6 @@ import (
	"k8s.io/node-problem-detector/pkg/problemdaemon"
	"k8s.io/node-problem-detector/pkg/problemmetrics"
	logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
	systemlogtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
	"k8s.io/node-problem-detector/pkg/types"
	"k8s.io/node-problem-detector/pkg/util"
	"k8s.io/node-problem-detector/pkg/util/metrics"

@@ -85,7 +84,6 @@ func TestGenerateStatusForConditions(t *testing.T) {
				testConditionA,
				types.True,
				"test reason",
				"test message 1\ntest message 2",
				time.Unix(1000, 1000),
			)},
			Conditions: []types.Condition{

@@ -700,40 +698,3 @@ func TestInitializeProblemMetricsOrDie(t *testing.T) {
		})
	}
}

func TestGenerateMessage(t *testing.T) {
	tests := []struct {
		name                          string
		logs                          []*systemlogtypes.Log
		patternGeneratedMessageSuffix string
		want                          string
	}{
		{
			name: "No rule message",
			logs: []*systemlogtypes.Log{
				{Message: "First log message"},
				{Message: "Second log message"},
			},
			patternGeneratedMessageSuffix: "",
			want:                          "First log message\nSecond log message",
		},
		{
			name: "With rule message",
			logs: []*systemlogtypes.Log{
				{Message: "First log message"},
				{Message: "Second log message"},
			},
			patternGeneratedMessageSuffix: "refer www.foo.com/docs for playbook on how to fix the issue",
			want:                          "First log message\nSecond log message; refer www.foo.com/docs for playbook on how to fix the issue",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := generateMessage(tt.logs, tt.patternGeneratedMessageSuffix)
			if got != tt.want {
				t.Errorf("generateMessage() = %v, want %v", got, tt.want)
			}
		})
	}
}
@@ -23,7 +23,8 @@ import (
	"strings"
	"time"

	"k8s.io/klog/v2"
	utilclock "code.cloudfoundry.org/clock"
	"github.com/golang/glog"

	"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types"
	logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"

@@ -39,6 +40,7 @@ type filelogWatcher struct {
	logCh     chan *logtypes.Log
	startTime time.Time
	tomb      *tomb.Tomb
	clock     utilclock.Clock
}

// NewSyslogWatcherOrDie creates a new log watcher. The function panics
@@ -46,11 +48,11 @@ type filelogWatcher struct {
func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher {
	uptime, err := util.GetUptimeDuration()
	if err != nil {
		klog.Fatalf("failed to get uptime: %v", err)
		glog.Fatalf("failed to get uptime: %v", err)
	}
	startTime, err := util.GetStartTime(time.Now(), uptime, cfg.Lookback, cfg.Delay)
	if err != nil {
		klog.Fatalf("failed to get start time: %v", err)
		glog.Fatalf("failed to get start time: %v", err)
	}

	return &filelogWatcher{

@@ -60,6 +62,7 @@ func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher {
		tomb: tomb.NewTomb(),
		// A capacity 1000 buffer should be enough
		logCh: make(chan *logtypes.Log, 1000),
		clock: utilclock.NewClock(),
	}
}

@@ -74,7 +77,7 @@ func (s *filelogWatcher) Watch() (<-chan *logtypes.Log, error) {
	}
	s.reader = bufio.NewReader(r)
	s.closer = r
	klog.Info("Start watching filelog")
	glog.Info("Start watching filelog")
	go s.watchLoop()
	return s.logCh, nil
}
@@ -99,14 +102,14 @@ func (s *filelogWatcher) watchLoop() {
	for {
		select {
		case <-s.tomb.Stopping():
			klog.Infof("Stop watching filelog")
			glog.Infof("Stop watching filelog")
			return
		default:
		}

		line, err := s.reader.ReadString('\n')
		if err != nil && err != io.EOF {
			klog.Errorf("Exiting filelog watch with error: %v", err)
			glog.Errorf("Exiting filelog watch with error: %v", err)
			return
		}
		buffer.WriteString(line)

@@ -116,28 +119,16 @@ func (s *filelogWatcher) watchLoop() {
		}
		line = buffer.String()
		buffer.Reset()
		if s.filterSkipList(line) {
			continue
		}
		log, err := s.translator.translate(strings.TrimSuffix(line, "\n"))
		if err != nil {
			klog.Warningf("Unable to parse line: %q, %v", line, err)
			glog.Warningf("Unable to parse line: %q, %v", line, err)
			continue
		}
		// Discard messages before start time.
		if log.Timestamp.Before(s.startTime) {
			klog.V(5).Infof("Throwing away msg %q before start time: %v < %v", log.Message, log.Timestamp, s.startTime)
			glog.V(5).Infof("Throwing away msg %q before start time: %v < %v", log.Message, log.Timestamp, s.startTime)
			continue
		}
		s.logCh <- log
	}
}

func (s *filelogWatcher) filterSkipList(line string) bool {
	for _, skipItem := range s.cfg.SkipList {
		if strings.Contains(line, skipItem) {
			return true
		}
	}
	return false
}
@@ -1,29 +0,0 @@
/*
Copyright 2023 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package filelog

import (
	"io"

	"github.com/hpcloud/tail"
)

// getLogReader returns log reader for filelog log. Note that getLogReader doesn't look back
// to the rolled out logs.
func getLogReader(path string) (io.ReadCloser, error) {
	return tail.OpenFile(path)
}
@@ -19,8 +19,9 @@ package filelog
import (
	"fmt"
	"io"
	"k8s.io/node-problem-detector/third_party/forked/cadvisor/tail"
	"os"

	"github.com/google/cadvisor/utils/tail"
)

// getLogReader returns log reader for filelog log. Note that getLogReader doesn't look back
@@ -17,6 +17,7 @@ limitations under the License.
package filelog

import (
	"io/ioutil"
	"os"
	"testing"
	"time"

@@ -25,8 +26,8 @@ import (
	logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
	"k8s.io/node-problem-detector/pkg/util"

	"code.cloudfoundry.org/clock/fakeclock"
	"github.com/stretchr/testify/assert"
	testclock "k8s.io/utils/clock/testing"
)

// getTestPluginConfig returns a plugin config for test. Use configuration for

@@ -42,7 +43,7 @@ func getTestPluginConfig() map[string]string {
func TestWatch(t *testing.T) {
	// now is a fake time
	now := time.Date(time.Now().Year(), time.January, 2, 3, 4, 5, 0, time.Local)
	fakeClock := testclock.NewFakeClock(now)
	fakeClock := fakeclock.NewFakeClock(now)
	testCases := []struct {
		uptime   time.Duration
		lookback string
@@ -138,7 +139,7 @@ Jan 2 03:04:05 kernel: [2.000000] 3
	}
	for c, test := range testCases {
		t.Logf("TestCase #%d: %#v", c+1, test)
		f, err := os.CreateTemp("", "log_watcher_test")
		f, err := ioutil.TempFile("", "log_watcher_test")
		assert.NoError(t, err)
		defer func() {
			f.Close()

@@ -155,6 +156,8 @@ Jan 2 03:04:05 kernel: [2.000000] 3
		})
		// Set the startTime.
		w.(*filelogWatcher).startTime, _ = util.GetStartTime(fakeClock.Now(), test.uptime, test.lookback, test.delay)
		// Set the fake clock.
		w.(*filelogWatcher).clock = fakeClock
		logCh, err := w.Watch()
		assert.NoError(t, err)
		defer w.Stop()

@@ -167,7 +170,7 @@ Jan 2 03:04:05 kernel: [2.000000] 3
			}
		}
		// The log channel should have already been drained
		// There could still be future messages sent into the channel, but the chance is really slim.
		// There could stil be future messages sent into the channel, but the chance is really slim.
		timeout := time.After(100 * time.Millisecond)
		select {
		case log := <-logCh:

@@ -176,36 +179,3 @@ Jan 2 03:04:05 kernel: [2.000000] 3
		}
	}
}

func TestFilterSkipList(t *testing.T) {
	s := &filelogWatcher{
		cfg: types.WatcherConfig{
			SkipList: []string{
				" audit:", " kubelet:",
			},
		},
	}
	testcase := []struct {
		log    string
		expect bool
	}{
		{
			log:    `Jan 2 03:04:03 kernel: [0.000000] 1`,
			expect: false,
		},
		{
			log:    `Jan 2 03:04:04 audit: [1.000000] 2`,
			expect: true,
		},
		{
			log:    `Jan 2 03:04:05 kubelet: [2.000000] 3`,
			expect: true,
		},
	}
	for i, test := range testcase {
		if s.filterSkipList(test.log) != test.expect {
			t.Errorf("test case %d: expect %v but got %v", i, test.expect, s.filterSkipList(test.log))
		}
	}
}
@@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,

@@ -22,7 +22,7 @@ import (

	logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"

	"k8s.io/klog/v2"
	"github.com/golang/glog"
)

// translator translates log line into internal log type based on user defined

@@ -46,7 +46,7 @@ const (

func newTranslatorOrDie(pluginConfig map[string]string) *translator {
	if err := validatePluginConfig(pluginConfig); err != nil {
		klog.Errorf("Failed to validate plugin configuration %+v: %v", pluginConfig, err)
		glog.Errorf("Failed to validate plugin configuration %+v: %v", pluginConfig, err)
	}
	return &translator{
		timestampRegexp: regexp.MustCompile(pluginConfig[timestampKey]),
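A rough sketch of the translate step this constructor feeds: apply the configured timestamp regexp to a raw line, then parse the captured text into a time.Time. The regexp, layout, and sample line below are illustrative values in the kernel-log style used by the tests above, not the plugin's actual defaults:

```go
// Regexp-based timestamp extraction followed by time.Parse, the shape of
// work done by translator.translate.
package main

import (
	"fmt"
	"regexp"
	"time"
)

func main() {
	timestampRegexp := regexp.MustCompile(`^\w{3} \d{1,2} \d{2}:\d{2}:\d{2}`) // assumed pattern
	line := "Jan 2 03:04:05 kernel: [2.000000] oops"

	raw := timestampRegexp.FindString(line)
	ts, err := time.Parse("Jan 2 15:04:05", raw) // assumed layout; year defaults to 0
	if err != nil {
		fmt.Println("unable to parse line:", err)
		return
	}
	message := line[len(raw):]
	fmt.Printf("timestamp=%v message=%q\n", ts, message)
}
```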
Some files were not shown because too many files have changed in this diff.