Compare commits


No commits in common. "master" and "v0.8.8" have entirely different histories.

5,479 changed files with 606,527 additions and 1,005,507 deletions

View File

@@ -1,38 +0,0 @@
version: 2
updates:
- package-ecosystem: github-actions
directory: /
schedule:
interval: weekly
groups:
actions-all:
patterns:
- "*"
labels:
- "ok-to-test"
- package-ecosystem: docker
directory: /
schedule:
interval: weekly
labels:
- "ok-to-test"
- package-ecosystem: gomod
directories:
- /
- /test
schedule:
interval: weekly
ignore:
- dependency-name: "*"
update-types:
- "version-update:semver-major"
- "version-update:semver-minor"
groups:
k8s:
patterns:
- "k8s.io/*"
- "sigs.k8s.io/*"
labels:
- "ok-to-test"

View File

@@ -1,78 +0,0 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"
on:
push:
branches: ["master"]
pull_request:
# The branches below must be a subset of the branches above
branches: ["master"]
schedule:
- cron: "0 0 * * 1"
permissions:
contents: read
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
permissions:
actions: read
contents: read
security-events: write
strategy:
fail-fast: false
matrix:
language: ["go"]
# CodeQL supports [ $supported-codeql-languages ]
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
steps:
- name: Harden Runner
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
with:
egress-policy: audit
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
# Command-line programs to run using the OS shell.
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
# If the Autobuild fails above, remove it, uncomment the following three lines,
# and modify them (or add more) to build your code if your project uses a compiled language. Please refer to the EXAMPLE below for guidance.
# - run: |
# echo "Run, Build Application using script"
# ./location_of_script_within_repo/buildscript.sh
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
with:
category: "/language:${{matrix.language}}"

View File

@@ -1,27 +0,0 @@
# Dependency Review Action
#
# This Action will scan dependency manifest files that change as part of a Pull Request,
# surfacing known-vulnerable versions of the packages declared or updated in the PR.
# Once installed, if the workflow run is marked as required,
# PRs introducing known-vulnerable packages will be blocked from merging.
#
# Source repository: https://github.com/actions/dependency-review-action
name: 'Dependency Review'
on: [pull_request]
permissions:
contents: read
jobs:
dependency-review:
runs-on: ubuntu-latest
steps:
- name: Harden Runner
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
with:
egress-policy: audit
- name: 'Checkout Repository'
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: 'Dependency Review'
uses: actions/dependency-review-action@da24556b548a50705dd671f47852072ea4c105d9 # v4.7.1

View File

@@ -1,76 +0,0 @@
# This workflow uses actions that are not certified by GitHub. They are provided
# by a third-party and are governed by separate terms of service, privacy
# policy, and support documentation.
name: Scorecard supply-chain security
on:
# For Branch-Protection check. Only the default branch is supported. See
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
branch_protection_rule:
# To guarantee Maintained check is occasionally updated. See
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
schedule:
- cron: '20 7 * * 2'
push:
branches: ["master"]
# Declare default permissions as read only.
permissions: read-all
jobs:
analysis:
name: Scorecard analysis
runs-on: ubuntu-latest
permissions:
# Needed to upload the results to code-scanning dashboard.
security-events: write
# Needed to publish results and get a badge (see publish_results below).
id-token: write
contents: read
actions: read
steps:
- name: Harden Runner
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
with:
egress-policy: audit
- name: "Checkout code"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
persist-credentials: false
- name: "Run analysis"
uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2
with:
results_file: results.sarif
results_format: sarif
# (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
# - you want to enable the Branch-Protection check on a *public* repository, or
# - you are installing Scorecards on a *private* repository
# To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
# repo_token: ${{ secrets.SCORECARD_TOKEN }}
# Public repositories:
# - Publish results to OpenSSF REST API for easy access by consumers
# - Allows the repository to include the Scorecard badge.
# - See https://github.com/ossf/scorecard-action#publishing-results.
# For private repositories:
# - `publish_results` will always be set to `false`, regardless
# of the value entered here.
publish_results: true
# Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
# format to the repository Actions tab.
- name: "Upload artifact"
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: SARIF file
path: results.sarif
retention-days: 5
# Upload the results to GitHub's code scanning dashboard.
- name: "Upload to code-scanning"
uses: github/codeql-action/upload-sarif@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
with:
sarif_file: results.sarif

View File

@@ -1,33 +0,0 @@
name: tag-release
on:
push:
branches:
- master
paths:
- version.txt
permissions:
contents: read
jobs:
tag:
if: ${{ github.repository == 'kubernetes/node-problem-detector' }}
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Harden Runner
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
with:
egress-policy: audit
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- run: /usr/bin/git config --global user.email actions@github.com
- run: /usr/bin/git config --global user.name 'GitHub Actions Release Tagger'
- run: hack/tag-release.sh
id: tag_release
outputs:
release_tag: ${{ steps.tag_release.outputs.release_tag }}
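For the job-level `release_tag` output above to be populated, `hack/tag-release.sh` has to write a `release_tag` key to the standard `$GITHUB_OUTPUT` file. A hypothetical sketch of that step (the real script lives in the repo and may differ):

```
#!/usr/bin/env bash
set -euo pipefail
# Hypothetical sketch: derive the tag from version.txt, push it, and expose it
# to later steps via the GitHub Actions output file.
tag="v$(cat version.txt)"            # assumption: version.txt holds e.g. "0.8.21"
git tag -a "${tag}" -m "${tag}"
git push origin "${tag}"
echo "release_tag=${tag}" >> "${GITHUB_OUTPUT}"
```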

.gitignore (vendored): 2 changed lines
View File

@@ -6,5 +6,3 @@ pr.env
junit*.xml
debug.test
/output/
coverage.out
.idea/

View File

@@ -1,18 +0,0 @@
repos:
- repo: https://github.com/gitleaks/gitleaks
rev: v8.16.3
hooks:
- id: gitleaks
- repo: https://github.com/golangci/golangci-lint
rev: v1.52.2
hooks:
- id: golangci-lint
- repo: https://github.com/jumanjihouse/pre-commit-hooks
rev: 3.0.0
hooks:
- id: shellcheck
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: end-of-file-fixer
- id: trailing-whitespace
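Assuming the `pre-commit` tool itself is installed, the hooks above are activated and exercised with:

```
pre-commit install            # run the hooks automatically on every git commit
pre-commit run --all-files    # one-off run across the whole tree
```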

.travis.yml (new file): 33 changed lines
View File

@@ -0,0 +1,33 @@
os:
- linux
sudo: required
dist: xenial
language: go
go:
- "1.16"
- master
env:
- GO111MODULE=on
services:
- docker
before_install:
- sudo apt-get -qq update
- sudo apt-get install -y libsystemd-dev
install:
- mkdir -p $HOME/gopath/src/k8s.io
- mv $TRAVIS_BUILD_DIR $HOME/gopath/src/k8s.io/node-problem-detector
- cd $HOME/gopath/src/k8s.io/node-problem-detector
script:
- make
- make test
- make clean && BUILD_TAGS="disable_custom_plugin_monitor" make
- BUILD_TAGS="disable_custom_plugin_monitor" make test
- make clean && BUILD_TAGS="disable_system_log_monitor" make
- BUILD_TAGS="disable_system_log_monitor" make test
- make clean && BUILD_TAGS="disable_system_stats_monitor" make
- BUILD_TAGS="disable_system_stats_monitor" make test
- make clean && BUILD_TAGS="disable_stackdriver_exporter" make
- BUILD_TAGS="disable_stackdriver_exporter" make test
- make clean && ENABLE_JOURNALD=0 make
- ENABLE_JOURNALD=0 make test
- ENABLE_JOURNALD=0 make build-binaries

View File

@@ -29,7 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- Windows build now supported.
- Added metrics to retrieve stats such as `procs_running` and `procs_blocked`.
- Added metrics to retrieve network stats.
- Added metric to retrieve guest OS features such as unknown modules, ktd,
- Added metric to retrieve guest OS features such as unknwon modules, ktd,
and kernel integrity.
### Changed
@@ -158,7 +158,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- Empty LogPath will now use journald's default path.
- Systemd monitor now looks back 5 minutes.
- Bumped base image to `registry.k8s.io/debian-base-amd64:1.0.0`.
- Bumped base image to `k8s.gcr.io/debian-base-amd64:1.0.0`.
- Updated the detection method for docker overlay2 issues.
- Moved NPD into the kube-system namespace.
@@ -237,7 +237,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- Added resource limites to NPD deployment.
- Added log-counter to dockerfile.
- Added `enable_message_change_based_condition_update` option to enable
condition update when messages change for custom plugin.
condition update when messages cahnge for custom plugin.
### Fixed
@@ -248,7 +248,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
### Changed
- Bumped base image to `registry.k8s.io/debian-base-amd64:0.4.0`.
- Bumped base image to `k8s.gcr.io/debian-base-amd64:0.4.0`.
## [0.6.0] - 2018-11-27
@@ -277,7 +277,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- Changed default port from 10256 to 20256 to avoid conflict with kube-proxy.
- Bumped golang version from 1.8 to 1.9.
- Bumped base image to `registry.k8s.io/debian-base-amd64:0.3`.
- Bumped base image to `k8s.gcr.io/debian-base-amd64:0.3`.
### Fixed

View File

@@ -14,7 +14,7 @@ If your repo has certain guidelines for contribution, put them here ahead of the
- [Contributor License Agreement](https://git.k8s.io/community/CLA.md) Kubernetes projects require that you sign a Contributor License Agreement (CLA) before we can accept your pull requests
- [Kubernetes Contributor Guide](http://git.k8s.io/community/contributors/guide) - Main contributor documentation, or you can just jump directly to the [contributing section](http://git.k8s.io/community/contributors/guide#contributing)
- [Contributor Cheat Sheet](https://git.k8s.io/community/contributors/guide/contributor-cheatsheet/README.md) - Common resources for existing developers
- [Contributor Cheat Sheet](https://git.k8s.io/community/contributors/guide/contributor-cheatsheet.md) - Common resources for existing developers
## Mentorship
@@ -28,4 +28,4 @@ Custom Information - if you're copying this template for the first time you can
- [Slack channel](https://kubernetes.slack.com/messages/kubernetes-users) - Replace `kubernetes-users` with your slack channel string, this will send users directly to your channel.
- [Mailing list](URL)
-->
-->

View File

@@ -12,42 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# "builder-base" can be overriden using dockerb buildx's --build-context flag,
# by users who want to use a different images for the builder. E.g. if you need to use an older OS
# to avoid dependencies on very recent glibc versions.
# E.g. of the param: --build-context builder-base=docker-image://golang:<something>@sha256:<something>
# Must override builder-base, not builder, since the latter is referred to later in the file and so must not be
# directly replaced. See here, and note that "stage" parameter mentioned there has been renamed to
# "build-context": https://github.com/docker/buildx/pull/904#issuecomment-1005871838
FROM golang:1.24-bookworm@sha256:00eccd446e023d3cd9566c25a6e6a02b90db3e1e0bbe26a48fc29cd96e800901 as builder-base
FROM builder-base as builder
LABEL maintainer="Andy Xie <andy.xning@gmail.com>"
ARG BASEIMAGE
FROM ${BASEIMAGE}
ARG TARGETARCH
MAINTAINER Random Liu <lantaol@google.com>
ENV GOPATH /gopath/
ENV PATH $GOPATH/bin:$PATH
RUN apt-get update --fix-missing && apt-get --yes install libsystemd-dev gcc-aarch64-linux-gnu
RUN go version
COPY . /gopath/src/k8s.io/node-problem-detector/
WORKDIR /gopath/src/k8s.io/node-problem-detector
RUN GOARCH=${TARGETARCH} make bin/node-problem-detector bin/health-checker bin/log-counter
FROM --platform=${TARGETPLATFORM} registry.k8s.io/build-image/debian-base:bookworm-v1.0.4@sha256:0a17678966f63e82e9c5e246d9e654836a33e13650a698adefede61bb5ca099e as base
LABEL maintainer="Random Liu <lantaol@google.com>"
RUN clean-install util-linux bash libsystemd-dev
RUN clean-install util-linux libsystemd0 bash
# Avoid symlink of /etc/localtime.
RUN test -h /etc/localtime && rm -f /etc/localtime && cp /usr/share/zoneinfo/UTC /etc/localtime || true
COPY --from=builder /gopath/src/k8s.io/node-problem-detector/bin/node-problem-detector /node-problem-detector
COPY ./bin/node-problem-detector /node-problem-detector
ARG LOGCOUNTER
COPY --from=builder /gopath/src/k8s.io/node-problem-detector/bin/health-checker /gopath/src/k8s.io/node-problem-detector/${LOGCOUNTER} /home/kubernetes/bin/
COPY ./bin/health-checker ${LOGCOUNTER} /home/kubernetes/bin/
COPY --from=builder /gopath/src/k8s.io/node-problem-detector/config/ /config
ENTRYPOINT ["/node-problem-detector", "--config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json"]
COPY config /config
ENTRYPOINT ["/node-problem-detector", "--config.system-log-monitor=/config/kernel-monitor.json"]
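The `--build-context` override described in the comment at the top of this file would be used roughly like this (image tag purely illustrative; the platform list matches the Makefile's DOCKER_PLATFORMS below):

```
docker buildx build \
  --build-context builder-base=docker-image://golang:1.24-bullseye \
  --platform linux/amd64,linux/arm64 \
  -t node-problem-detector:dev .
```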

Makefile: 140 changed lines
View File

@@ -17,16 +17,12 @@
.PHONY: all \
vet fmt version test e2e-test \
build-binaries build-container build-tar build \
docker-builder build-in-docker \
push-container push-tar push release clean depup \
print-tar-sha-md5
docker-builder build-in-docker push-container push-tar push clean
all: build
# PLATFORMS is the set of OS_ARCH that NPD can build against.
LINUX_PLATFORMS=linux_amd64 linux_arm64
DOCKER_PLATFORMS=linux/amd64,linux/arm64
PLATFORMS=$(LINUX_PLATFORMS) windows_amd64
PLATFORMS=linux_amd64 windows_amd64
# VERSION is the version of the binary.
VERSION?=$(shell if [ -d .git ]; then echo `git describe --tags --dirty`; else echo "UNKNOWN"; fi)
@@ -67,24 +63,21 @@ IMAGE:=$(REGISTRY)/node-problem-detector:$(TAG)
# support needs libsystemd-dev or libsystemd-journal-dev.
ENABLE_JOURNALD?=1
ifeq ($(shell go env GOHOSTOS), darwin)
ifeq ($(go env GOHOSTOS), darwin)
ENABLE_JOURNALD=0
else ifeq ($(shell go env GOHOSTOS), windows)
else ifeq ($(go env GOHOSTOS), windows)
ENABLE_JOURNALD=0
endif
# TODO(random-liu): Support different architectures.
# The debian-base:v1.0.0 image built from kubernetes repository is based on
# Debian Stretch. It includes systemd 232 with support for both +XZ and +LZ4
# compression. +LZ4 is needed on some os distros such as COS.
BASEIMAGE:=k8s.gcr.io/debian-base-amd64:v1.0.0
# Disable cgo by default to make the binary statically linked.
CGO_ENABLED:=0
ifeq ($(GOARCH), arm64)
CC:=aarch64-linux-gnu-gcc
else
CC:=x86_64-linux-gnu-gcc
endif
# Set default Go architecture to AMD64.
GOARCH ?= amd64
# Construct the "-tags" parameter used by "go build".
BUILD_TAGS?=
@@ -108,15 +101,15 @@ ifeq ($(ENABLE_JOURNALD), 1)
CGO_ENABLED:=1
LOGCOUNTER=./bin/log-counter
else
# Hack: Don't copy over log-counter, use a wildcard path that shouldn't match
# Hack: Don't copy over log-counter, use a wildcard path that shouldnt match
# anything in COPY command.
LOGCOUNTER=*dont-include-log-counter
endif
vet:
go list -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./... | \
GO111MODULE=on go list -mod vendor -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./... | \
grep -v "./vendor/*" | \
xargs go vet -tags "$(HOST_PLATFORM_BUILD_TAGS)"
GO111MODULE=on xargs go vet -mod vendor -tags "$(HOST_PLATFORM_BUILD_TAGS)"
fmt:
find . -type f -name "*.go" | grep -v "./vendor/*" | xargs gofmt -s -w -l
@@ -130,13 +123,12 @@ ifeq ($(ENABLE_JOURNALD), 1)
BINARIES_LINUX_ONLY += bin/log-counter
endif
ALL_BINARIES = $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), ./$(binary)) \
$(foreach platform, $(LINUX_PLATFORMS), $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), output/$(platform)/$(binary))) \
$(foreach binary, $(BINARIES), output/windows_amd64/$(binary).exe)
ALL_BINARIES = $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), ./$(binary)) $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), output/linux_amd64/$(binary)) $(foreach binary, $(BINARIES), output/windows_amd64/$(binary).exe)
ALL_TARBALLS = $(foreach platform, $(PLATFORMS), $(NPD_NAME_VERSION)-$(platform).tar.gz)
output/windows_amd64/bin/%.exe: $(PKG_SOURCES)
GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) go build \
GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
-mod vendor \
-o $@ \
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
-tags "$(WINDOWS_BUILD_TAGS)" \
@@ -144,15 +136,15 @@ output/windows_amd64/bin/%.exe: $(PKG_SOURCES)
touch $@
output/windows_amd64/test/bin/%.exe: $(PKG_SOURCES)
cd test && \
GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) go build \
-o ../$@ \
GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
-mod vendor \
-o $@ \
-tags "$(WINDOWS_BUILD_TAGS)" \
./e2e/$(subst -,,$*)
./test/e2e/$(subst -,,$*)
output/linux_amd64/bin/%: $(PKG_SOURCES)
GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) \
CC=x86_64-linux-gnu-gcc go build \
GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
-mod vendor \
-o $@ \
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
-tags "$(LINUX_BUILD_TAGS)" \
@@ -160,34 +152,17 @@ output/linux_amd64/bin/%: $(PKG_SOURCES)
touch $@
output/linux_amd64/test/bin/%: $(PKG_SOURCES)
cd test && \
GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) \
CC=x86_64-linux-gnu-gcc go build \
-o ../$@ \
-tags "$(LINUX_BUILD_TAGS)" \
./e2e/$(subst -,,$*)
output/linux_arm64/bin/%: $(PKG_SOURCES)
GOOS=linux GOARCH=arm64 CGO_ENABLED=$(CGO_ENABLED) \
CC=aarch64-linux-gnu-gcc go build \
GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
-mod vendor \
-o $@ \
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
-tags "$(LINUX_BUILD_TAGS)" \
./cmd/$(subst -,,$*)
touch $@
output/linux_arm64/test/bin/%: $(PKG_SOURCES)
cd test && \
GOOS=linux GOARCH=arm64 CGO_ENABLED=$(CGO_ENABLED) \
CC=aarch64-linux-gnu-gcc go build \
-o ../$@ \
-tags "$(LINUX_BUILD_TAGS)" \
./e2e/$(subst -,,$*)
./test/e2e/$(subst -,,$*)
# In the future these targets should be deprecated.
./bin/log-counter: $(PKG_SOURCES)
ifeq ($(ENABLE_JOURNALD), 1)
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
-mod vendor \
-o bin/log-counter \
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
-tags "$(LINUX_BUILD_TAGS)" \
@@ -197,37 +172,38 @@ else
endif
./bin/node-problem-detector: $(PKG_SOURCES)
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
-mod vendor \
-o bin/node-problem-detector \
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
-tags "$(LINUX_BUILD_TAGS)" \
./cmd/nodeproblemdetector
./test/bin/problem-maker: $(PKG_SOURCES)
cd test && \
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
-o bin/problem-maker \
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
-mod vendor \
-o test/bin/problem-maker \
-tags "$(LINUX_BUILD_TAGS)" \
./e2e/problemmaker/problem_maker.go
./test/e2e/problemmaker/problem_maker.go
./bin/health-checker: $(PKG_SOURCES)
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
-mod vendor \
-o bin/health-checker \
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
-tags "$(LINUX_BUILD_TAGS)" \
cmd/healthchecker/health_checker.go
test: vet fmt
go test -timeout=1m -v -race -short -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./...
GO111MODULE=on go test -mod vendor -timeout=1m -v -race -short -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./...
e2e-test: vet fmt build-tar
cd test && \
go run github.com/onsi/ginkgo/ginkgo -nodes=$(PARALLEL) -timeout=10m -v -tags "$(HOST_PLATFORM_BUILD_TAGS)" -stream \
./e2e/metriconly/... -- \
GO111MODULE=on ginkgo -nodes=$(PARALLEL) -mod vendor -timeout=10m -v -tags "$(HOST_PLATFORM_BUILD_TAGS)" -stream \
./test/e2e/metriconly/... -- \
-project=$(PROJECT) -zone=$(ZONE) \
-image=$(VM_IMAGE) -image-family=$(IMAGE_FAMILY) -image-project=$(IMAGE_PROJECT) \
-ssh-user=$(SSH_USER) -ssh-key=$(SSH_KEY) \
-npd-build-tar=`pwd`/../$(TARBALL) \
-npd-build-tar=`pwd`/$(TARBALL) \
-boskos-project-type=$(BOSKOS_PROJECT_TYPE) -job-name=$(JOB_NAME) \
-artifacts-dir=$(ARTIFACTS)
@@ -240,9 +216,8 @@ $(NPD_NAME_VERSION)-%.tar.gz: $(ALL_BINARIES) test/e2e-install.sh
build-binaries: $(ALL_BINARIES)
build-container: clean Dockerfile
docker buildx create --platform $(DOCKER_PLATFORMS) --use
docker buildx build --platform $(DOCKER_PLATFORMS) -t $(IMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .
build-container: build-binaries Dockerfile
docker build -t $(IMAGE) --build-arg BASEIMAGE=$(BASEIMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .
$(TARBALL): ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker
tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker
@@ -254,7 +229,7 @@ build-tar: $(TARBALL) $(ALL_TARBALLS)
build: build-container build-tar
docker-builder:
docker build -t npd-builder . --target=builder
docker build -t npd-builder ./builder
build-in-docker: clean docker-builder
docker run \
@@ -262,46 +237,17 @@ build-in-docker: clean docker-builder
-c 'cd /gopath/src/k8s.io/node-problem-detector/ && make build-binaries'
push-container: build-container
# So we can push to docker hub by setting REGISTRY
ifneq (,$(findstring gcr.io,$(REGISTRY)))
gcloud auth configure-docker
endif
# Build should be cached from build-container
docker buildx build --push --platform $(DOCKER_PLATFORMS) -t $(IMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .
docker push $(IMAGE)
push-tar: build-tar
gsutil cp $(TARBALL) $(UPLOAD_PATH)/node-problem-detector/
gsutil cp node-problem-detector-$(VERSION)-*.tar.gz* $(UPLOAD_PATH)/node-problem-detector/
# `make push` is used by presubmit and CI jobs.
push: push-container push-tar
# `make release` is used when releasing a new NPD version.
release: push-container build-tar print-tar-sha-md5
print-tar-sha-md5: build-tar
./hack/print-tar-sha-md5.sh $(VERSION)
coverage.out:
rm -f coverage.out
go test -coverprofile=coverage.out -timeout=1m -v -short ./...
clean:
rm -rf bin/
rm -rf test/bin/
rm -f node-problem-detector-*.tar.gz*
rm -rf output/
rm -f coverage.out
.PHONY: gomod
gomod:
go mod tidy
go mod vendor
cd test; go mod tidy
.PHONY: goget
goget:
go get $(shell go list -f '{{if not (or .Main .Indirect)}}{{.Path}}{{end}}' -mod=mod -m all)
.PHONY: depup
depup: goget gomod
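As a quick reference, the switches defined above combine like this (these invocations mirror the Travis script shown earlier; the build tags are the ones listed in the README):

```
make                                            # full build: binaries, image, tarball
ENABLE_JOURNALD=0 make build-binaries           # build without journald (no libsystemd needed)
ENABLE_JOURNALD=0 make test
BUILD_TAGS="disable_system_stats_monitor" make  # compile out an optional problem daemon
```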

OWNERS: 10 changed lines
View File

@@ -1,14 +1,12 @@
reviewers:
- sig-node-reviewers
- Random-Liu
- dchen1107
- andyxning
- wangzhen127
- xueweiz
- vteratipally
- mmiranda96
- hakman
approvers:
- sig-node-approvers
- Random-Liu
- dchen1107
- andyxning
- wangzhen127
- xueweiz
- vteratipally

View File

@@ -1,19 +0,0 @@
aliases:
sig-node-approvers:
- Random-Liu
- dchen1107
- derekwaynecarr
- yujuhong
- sjenning
- mrunalp
- klueska
- SergeyKanzhelev
- tallclair
sig-node-reviewers:
- Random-Liu
- dchen1107
- derekwaynecarr
- yujuhong
- sjenning
- mrunalp
- klueska

View File

@@ -7,11 +7,11 @@ layers in the cluster management stack.
It is a daemon that runs on each node, detects node
problems and reports them to apiserver.
node-problem-detector can either run as a
[DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) or run standalone.
[DaemonSet](http://kubernetes.io/docs/admin/daemons/) or run standalone.
Now it is running as a
[Kubernetes Addon](https://github.com/kubernetes/kubernetes/tree/master/cluster/addons)
enabled by default in the GKE cluster. It is also enabled by default in AKS as part of the
[AKS Linux Extension](https://learn.microsoft.com/en-us/azure/aks/faq#what-is-the-purpose-of-the-aks-linux-extension-i-see-installed-on-my-linux-vmss-instances).
enabled by default in the GCE cluster.
# Background
There are tons of node problems that could possibly affect the pods running on the
@@ -41,8 +41,8 @@ should be reported as `Event`.
# Problem Daemon
A problem daemon is a sub-daemon of node-problem-detector. It monitors specific
kinds of node problems and reports them to node-problem-detector.
A problem daemon is a sub-daemon of node-problem-detector. It monitors a specific
kind of node problems and reports them to node-problem-detector.
A problem daemon could be:
* A tiny daemon designed for dedicated Kubernetes use-cases.
@@ -62,9 +62,9 @@ List of supported problem daemons types:
| Problem Daemon Types | NodeCondition | Description | Configs | Disabling Build Tag |
|----------------|:---------------:|:------------|:--------|:--------------------|
| [SystemLogMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemlogmonitor) | KernelDeadlock ReadonlyFilesystem FrequentKubeletRestart FrequentDockerRestart FrequentContainerdRestart | A system log monitor monitors system log and reports problems and metrics according to predefined rules. | [filelog](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor-filelog.json), [kmsg](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json), [kernel](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor-counter.json) [abrt](https://github.com/kubernetes/node-problem-detector/blob/master/config/abrt-adaptor.json) [systemd](https://github.com/kubernetes/node-problem-detector/blob/master/config/systemd-monitor-counter.json) | disable_system_log_monitor
| [SystemStatsMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor) | None(Could be added in the future) | A system stats monitor for node-problem-detector to collect various health-related system stats as metrics. See the proposal [here](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit). | [system-stats-monitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json) | disable_system_stats_monitor
| [SystemStatsMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor) | None(Could be added in the future) | A system stats monitor for node-problem-detector to collect various health-related system stats as metrics. See the proposal [here](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit). | | disable_system_stats_monitor
| [CustomPluginMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/custompluginmonitor) | On-demand (according to user configuration), existing example: NTPProblem | A custom plugin monitor for node-problem-detector to invoke and check various node problems with user-defined check scripts. See the proposal [here](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#). | [example](https://github.com/kubernetes/node-problem-detector/blob/4ad49bbd84b8ced45ac825eac01ec93d9235935e/config/custom-plugin-monitor.json) | disable_custom_plugin_monitor
| [HealthChecker](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/healthchecker) | KubeletUnhealthy ContainerRuntimeUnhealthy| A health checker for node-problem-detector to check kubelet and container runtime health. | [kubelet](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-kubelet.json) [docker](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-docker.json) [containerd](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-containerd.json) |
| [HealthChecker](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/healthchecker) | KubeletUnhealthy ContainerRuntimeUnhealthy| A health checker for node-problem-detector to check kubelet and container runtime health. | [kubelet](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-kubelet.json) [docker](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-docker.json) |
# Exporter
@@ -102,14 +102,9 @@ certain backends. Some of them can be disabled at compile-time using a build tag
* `--config.custom-plugin-monitor`: List of paths to custom plugin monitor config files, comma-separated, e.g.
[config/custom-plugin-monitor.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json).
Node problem detector will start a separate custom plugin monitor for each configuration. You can
Node problem detector will start a separate custom plugin monitor for each configuration. You can
use different custom plugin monitors to monitor different node problems.
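For illustration, a minimal check script for a custom plugin monitor might look like the sketch below. It assumes the exit-code contract NPD documents for custom plugins (0 = OK, 1 = NonOK, 2 = Unknown; the first line of stdout becomes the status message); the threshold is made up:

```
#!/bin/bash
# Hypothetical check: report NonOK when root disk usage crosses a threshold.
# Assumed contract: exit 0 = OK, 1 = NonOK, 2 = Unknown; stdout is the message.
readonly THRESHOLD=90   # percent, example value only

usage=$(df --output=pcent / 2>/dev/null | tail -1 | tr -dc '0-9')
if [[ -z "${usage}" ]]; then
  echo "could not determine root disk usage"
  exit 2
fi
if (( usage >= THRESHOLD )); then
  echo "root filesystem is ${usage}% full"
  exit 1
fi
echo "root filesystem usage (${usage}%) is below ${THRESHOLD}%"
exit 0
```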
#### For Health Checkers
Health checkers are configured as custom plugins, using the config/health-checker-*.json config files.
#### For Kubernetes exporter
* `--enable-k8s-exporter`: Enables reporting to Kubernetes API server, default to `true`.
@@ -142,12 +137,12 @@ For example, to run without auth, use the following config:
## Build Image
* Install development dependencies for `libsystemd` and the ARM GCC toolchain
* Debian/Ubuntu: `apt install libsystemd-dev gcc-aarch64-linux-gnu`
* `go get` or `git clone` node-problem-detector repo into `$GOPATH/src/k8s.io` or `$GOROOT/src/k8s.io`
with one of the below directions:
* `cd $GOPATH/src/k8s.io && git clone git@github.com:kubernetes/node-problem-detector.git`
* `cd $GOPATH/src/k8s.io && go get k8s.io/node-problem-detector`
* `git clone git@github.com:kubernetes/node-problem-detector.git`
* Run `make` in the top directory. It will:
* run `make` in the top directory. It will:
* Build the binary.
* Build the docker image. The binary and `config/` are copied into the docker image.
@@ -163,6 +158,11 @@ and [System Stats Monitor](https://github.com/kubernetes/node-problem-detector/t
Check out the [Problem Daemon](https://github.com/kubernetes/node-problem-detector#problem-daemon) section
to see how to disable each problem daemon during compilation time.
**Note**:
By default, node-problem-detector is built with systemd support by the `make` command. This requires the systemd development files.
Install the systemd development files first: on Ubuntu, the `libsystemd-journal-dev` package; on Debian, the `libsystemd-dev` package.
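On Debian or Ubuntu, for example, the prerequisite install plus build is just (package names as given above):

```
sudo apt-get update
sudo apt-get install -y libsystemd-dev   # on older Ubuntu: libsystemd-journal-dev
make
```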
## Push Image
`make push` uploads the docker image to a registry. By default, the image will be uploaded to
@@ -175,7 +175,7 @@ The easiest way to install node-problem-detector into your cluster is to use the
```
helm repo add deliveryhero https://charts.deliveryhero.io/
helm install --generate-name deliveryhero/node-problem-detector
helm install deliveryhero/node-problem-detector
```
Alternatively, to install node-problem-detector manually:
@@ -184,13 +184,9 @@ Alternatively, to install node-problem-detector manually:
2. Edit [node-problem-detector-config.yaml](deployment/node-problem-detector-config.yaml) to configure node-problem-detector.
3. Edit [rbac.yaml](deployment/rbac.yaml) to fit your environment.
3. Create the ConfigMap with `kubectl create -f node-problem-detector-config.yaml`.
4. Create the ServiceAccount and ClusterRoleBinding with `kubectl create -f rbac.yaml`.
4. Create the ConfigMap with `kubectl create -f node-problem-detector-config.yaml`.
5. Create the DaemonSet with `kubectl create -f node-problem-detector.yaml`.
3. Create the DaemonSet with `kubectl create -f node-problem-detector.yaml`.
## Start Standalone
@@ -218,7 +214,7 @@ To develop NPD on Windows you'll need to setup your Windows machine for Go devel
* [Go](https://golang.org/)
* [Visual Studio Code](https://code.visualstudio.com/)
* [Make](http://gnuwin32.sourceforge.net/packages/make.htm)
* [mingw-64 WinBuilds](http://mingw-w64.org/downloads)
* [mingw-64 WinBuilds](http://mingw-w64.org/doku.php/download/win-builds)
* Tested with x86-64 Windows Native mode.
* Add the `$InstallDir\bin` to [Windows `PATH` variable](https://answers.microsoft.com/en-us/windows/forum/windows_10-other_settings-winpc/adding-path-variable/97300613-20cb-4d85-8d0e-cc9d3549ba23).
@@ -226,16 +222,16 @@ To develop NPD on Windows you'll need to setup your Windows machine for Go devel
# Run these commands in the node-problem-detector directory.
# Build in MINGW64 Window
make clean ENABLE_JOURNALD=0 build-binaries
make clean windows-binaries
# Test in MINGW64 Window
make test
# Run with containerd log monitoring enabled in Command Prompt. (Assumes containerd is installed.)
%CD%\output\windows_amd64\bin\node-problem-detector.exe --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=%CD%\config\windows-containerd-monitor-filelog.json --config.system-stats-monitor=config\windows-system-stats-monitor.json
%CD%\output\windows_amd64\node-problem-detector.exe --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=%CD%\config\windows-containerd-monitor-filelog.json --config.system-stats-monitor=config\windows-system-stats-monitor.json
# Configure NPD to run as a Windows Service
sc.exe create NodeProblemDetector binpath= "%CD%\node-problem-detector.exe [FLAGS]" start= demand
sc.exe create NodeProblemDetector binpath= "%CD%\node-problem-detector.exe [FLAGS]" start= demand
sc.exe failure NodeProblemDetector reset= 0 actions= restart/10000
sc.exe start NodeProblemDetector
```
@@ -268,9 +264,9 @@ For example, to test [KernelMonitor](https://github.com/kubernetes/node-problem-
node-problem-detector uses [go modules](https://github.com/golang/go/wiki/Modules)
to manage dependencies. Therefore, building node-problem-detector requires
golang 1.11+. It still uses vendoring. See the
[Kubernetes go modules KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-architecture/917-go-modules#alternatives-to-vendoring-using-go-modules)
[Kubernetes go modules KEP](https://github.com/kubernetes/enhancements/blob/master/keps/sig-architecture/2019-03-19-go-modules.md#alternatives-to-vendoring-using-go-modules)
for the design decisions. To add a new dependency, update [go.mod](go.mod) and
run `go mod vendor`.
run `GO111MODULE=on go mod vendor`.
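Concretely, refreshing the vendor tree after a go.mod change looks like this (mirroring the `gomod` target in the master-side Makefile; the `GO111MODULE=on` prefix only matters on the older toolchains v0.8.8 targeted):

```
go mod tidy
go mod vendor
cd test && go mod tidy   # the e2e tests keep their own module
```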
# Remedy Systems
@@ -279,26 +275,30 @@ detected by the node-problem-detector. Remedy systems observe events and/or node
conditions emitted by the node-problem-detector and take action to return the
Kubernetes cluster to a healthy state. The following remedy systems exist:
* [**Draino**](https://github.com/planetlabs/draino) automatically drains Kubernetes
nodes based on labels and node conditions. Nodes that match _all_ of the supplied
labels and _any_ of the supplied node conditions will be prevented from accepting
new pods (aka 'cordoned') immediately, and
[drained](https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/)
after a configurable time. Draino can be used in conjunction with the
[Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler)
to automatically terminate drained nodes. Refer to
[this issue](https://github.com/kubernetes/node-problem-detector/issues/199)
for an example production use case for Draino.
* [**Descheduler**](https://github.com/kubernetes-sigs/descheduler) strategy RemovePodsViolatingNodeTaints
evicts pods violating NoSchedule taints on nodes. The k8s scheduler's TaintNodesByCondition feature must
be enabled. The [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler)
can be used to automatically terminate drained nodes.
* [**mediK8S**](https://github.com/medik8s) is an umbrella project for an automatic remediation
system built on the [Node Health Check Operator (NHC)](https://github.com/medik8s/node-healthcheck-operator), which monitors
node conditions and delegates remediation to external remediators using the Remediation API. [Poison-Pill](https://github.com/medik8s/poison-pill)
is a remediator that will reboot the node and make sure all stateful workloads are rescheduled. NHC supports conditionally remediating only if the cluster
has enough healthy capacity, or manually pausing any action to minimize cluster disruption.
* [**MachineHealthCheck**](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-health-check) of [Cluster API](https://cluster-api.sigs.k8s.io/) is responsible for remediating unhealthy Machines.
# Testing
NPD is tested via unit tests, [NPD e2e tests](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/README.md), Kubernetes e2e tests and Kubernetes nodes e2e tests. Prow handles the [pre-submit tests](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/node-problem-detector/node-problem-detector-presubmits.yaml) and [CI tests](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/node-problem-detector/node-problem-detector-ci.yaml).
CI test results can be found below:
1. [Unit tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-test)
2. [NPD e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-test)
3. [Kubernetes e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-kubernetes-gce-gci)
4. [Kubernetes nodes e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-node)
1. [Unit tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-test)
2. [NPD e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-test)
3. [Kubernetes e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-kubernetes-gce-gci)
4. [Kubernetes nodes e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-node)
## Running tests
@@ -310,10 +310,6 @@ See [NPD e2e test documentation](https://github.com/kubernetes/node-problem-dete
[Problem maker](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/problemmaker/README.md) is a program used in NPD e2e tests to generate/simulate node problems. It is ONLY intended to be used by NPD e2e tests. Please do NOT run it on your workstation, as it could cause real node problems.
# Compatibility
Node problem detector's architecture has been fairly stable. Recent versions (v0.8.13+) should be able to work with any supported kubernetes versions.
# Docs
* [Custom plugin monitor](docs/custom_plugin_monitor.md)
@@ -324,4 +320,4 @@ Node problem detector's architecture has been fairly stable. Recent versions (v0
* [Slides](https://docs.google.com/presentation/d/1bkJibjwWXy8YnB5fna6p-Ltiy-N5p01zUsA22wCNkXA/edit?usp=sharing)
* [Plugin Interface Proposal](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#)
* [Addon Manifest](https://github.com/kubernetes/kubernetes/tree/master/cluster/addons/node-problem-detector)
* [Metrics Mode Proposal](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit)
* [Metrics Mode Proposal](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit)

builder/Dockerfile (new file): 25 changed lines
View File

@@ -0,0 +1,25 @@
# Copyright 2018 The Kubernetes Authors. All rights reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM golang:1.11.0
LABEL maintainer="Andy Xie <andy.xning@gmail.com>"
ENV GOPATH /gopath/
ENV PATH $GOPATH/bin:$PATH
RUN apt-get update && apt-get --yes install libsystemd-dev
RUN go version
RUN go get github.com/tools/godep
RUN godep version
CMD ["/bin/bash"]

View File

@@ -1,26 +0,0 @@
# See https://cloud.google.com/cloud-build/docs/build-config
# this must be specified in seconds. If omitted, defaults to 600s (10 mins)
timeout: 3600s
options:
# job builds a multi-arch docker image for amd64 and arm64
machineType: E2_HIGHCPU_8
steps:
- name: 'gcr.io/k8s-staging-test-infra/gcb-docker-gcloud:v20230623-56e06d7c18'
entrypoint: bash
env:
- PROW_GIT_TAG=$_GIT_TAG
- PULL_BASE_REF=$_PULL_BASE_REF
- VERSION=$_PULL_BASE_REF
- DOCKER_CLI_EXPERIMENTAL=enabled
args:
- -c
- |
echo "Building/Pushing NPD containers"
apk add musl-dev gcc
make push-container
substitutions:
# _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and
# can be used as a substitution
_GIT_TAG: 'PLACE_HOLDER'
_PULL_BASE_REF: 'master'
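Assuming this file is checked in as `cloudbuild.yaml`, the same multi-arch build can be exercised by hand with Cloud Build, supplying the substitutions yourself (values hypothetical):

```
gcloud builds submit --config=cloudbuild.yaml \
  --substitutions=_GIT_TAG=v20240101-abcdef0,_PULL_BASE_REF=master .
```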

View File

@@ -23,24 +23,17 @@ import (
"github.com/spf13/pflag"
"k8s.io/klog/v2"
"k8s.io/node-problem-detector/cmd/healthchecker/options"
"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
"k8s.io/node-problem-detector/pkg/healthchecker"
)
func main() {
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
klog.InitFlags(klogFlags)
klogFlags.VisitAll(func(f *flag.Flag) {
switch f.Name {
case "v", "vmodule", "logtostderr":
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
}
})
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
pflag.CommandLine.MarkHidden("vmodule")
pflag.CommandLine.MarkHidden("logtostderr")
// Set glog flag so that it does not log to files.
if err := flag.Set("logtostderr", "true"); err != nil {
fmt.Printf("Failed to set logtostderr=true: %v", err)
os.Exit(int(types.Unknown))
}
hco := options.NewHealthCheckerOptions()
hco.AddFlags(pflag.CommandLine)

View File

@@ -39,9 +39,7 @@ type HealthCheckerOptions struct {
EnableRepair bool
CriCtlPath string
CriSocketPath string
CriTimeout time.Duration
CoolDownTime time.Duration
LoopBackTime time.Duration
HealthCheckTimeout time.Duration
LogPatterns types.LogPatternFlag
}
@@ -63,12 +61,8 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
"The path to the crictl binary. This is used to check health of cri component.")
fs.StringVar(&hco.CriSocketPath, "cri-socket-path", types.DefaultCriSocketPath,
"The path to the cri socket. Used with crictl to specify the socket path.")
fs.DurationVar(&hco.CriTimeout, "cri-timeout", types.DefaultCriTimeout,
"The duration to wait for crictl to run.")
fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
"The duration to wait for the service to be up before attempting repair.")
fs.DurationVar(&hco.LoopBackTime, "loopback-time", types.DefaultLoopBackTime,
"The duration to loop back, if it is 0, health-check will check from start time.")
fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
"The time to wait before marking the component as unhealthy.")
fs.Var(&hco.LogPatterns, "log-pattern",

View File

@@ -1,4 +1,3 @@
//go:build journald
// +build journald
/*
@@ -26,24 +25,17 @@ import (
"github.com/spf13/pflag"
"k8s.io/klog/v2"
"k8s.io/node-problem-detector/cmd/logcounter/options"
"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
"k8s.io/node-problem-detector/pkg/logcounter"
)
func main() {
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
klog.InitFlags(klogFlags)
klogFlags.VisitAll(func(f *flag.Flag) {
switch f.Name {
case "v", "vmodule", "logtostderr":
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
}
})
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
pflag.CommandLine.MarkHidden("vmodule")
pflag.CommandLine.MarkHidden("logtostderr")
// Set glog flag so that it does not log to files.
if err := flag.Set("logtostderr", "true"); err != nil {
fmt.Printf("Failed to set logtostderr=true: %v", err)
os.Exit(int(types.Unknown))
}
fedo := options.NewLogCounterOptions()
fedo.AddFlags(pflag.CommandLine)

View File

@@ -34,7 +34,6 @@ type LogCounterOptions struct {
Lookback string
Delay string
Pattern string
RevertPattern string
Count int
}
@@ -47,8 +46,6 @@ func (fedo *LogCounterOptions) AddFlags(fs *pflag.FlagSet) {
"The time duration log watcher delays after node boot time. This is useful when log watcher needs to wait for some time until the node is stable.")
fs.StringVar(&fedo.Pattern, "pattern", "",
"The regular expression to match the problem in log. The pattern must match to the end of the line.")
fs.StringVar(&fedo.RevertPattern, "revert-pattern", "",
"Similar to --pattern but conversely it decreases count value for every match. This is useful to discount a log when another log occurs.")
fs.IntVar(&fedo.Count, "count", 1,
"The number of times the pattern must be found to trigger the condition")
}
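Put together, these flags yield invocations like the following sketch (pattern and count are hypothetical; the install path matches the Dockerfile shown earlier):

```
# Hypothetical: fire once the pattern is seen 5 times within the last 20 minutes.
/home/kubernetes/bin/log-counter \
  --lookback=20m \
  --count=5 \
  --pattern="Out of memory: Kill process \\d+ .*"
```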

View File

@@ -1,4 +1,3 @@
//go:build !disable_stackdriver_exporter
// +build !disable_stackdriver_exporter
/*

View File

@@ -17,9 +17,7 @@ limitations under the License.
package main
import (
"context"
"k8s.io/klog/v2"
"github.com/golang/glog"
_ "k8s.io/node-problem-detector/cmd/nodeproblemdetector/exporterplugins"
_ "k8s.io/node-problem-detector/cmd/nodeproblemdetector/problemdaemonplugins"
@@ -33,7 +31,16 @@ import (
"k8s.io/node-problem-detector/pkg/version"
)
func npdMain(ctx context.Context, npdo *options.NodeProblemDetectorOptions) error {
func npdInteractive(npdo *options.NodeProblemDetectorOptions) {
termCh := make(chan error, 1)
defer close(termCh)
if err := npdMain(npdo, termCh); err != nil {
glog.Fatalf("Problem detector failed with error: %v", err)
}
}
func npdMain(npdo *options.NodeProblemDetectorOptions, termCh <-chan error) error {
if npdo.PrintVersion {
version.PrintVersion()
return nil
@@ -46,18 +53,18 @@ func npdMain(ctx context.Context, npdo *options.NodeProblemDetectorOptions) erro
// Initialize problem daemons.
problemDaemons := problemdaemon.NewProblemDaemons(npdo.MonitorConfigPaths)
if len(problemDaemons) == 0 {
klog.Fatalf("No problem daemon is configured")
glog.Fatalf("No problem daemon is configured")
}
// Initialize exporters.
defaultExporters := []types.Exporter{}
if ke := k8sexporter.NewExporterOrDie(ctx, npdo); ke != nil {
if ke := k8sexporter.NewExporterOrDie(npdo); ke != nil {
defaultExporters = append(defaultExporters, ke)
klog.Info("K8s exporter started.")
glog.Info("K8s exporter started.")
}
if pe := prometheusexporter.NewExporterOrDie(npdo); pe != nil {
defaultExporters = append(defaultExporters, pe)
klog.Info("Prometheus exporter started.")
glog.Info("Prometheus exporter started.")
}
plugableExporters := exporters.NewExporters()
@@ -67,10 +74,10 @@ func npdMain(ctx context.Context, npdo *options.NodeProblemDetectorOptions) erro
npdExporters = append(npdExporters, plugableExporters...)
if len(npdExporters) == 0 {
klog.Fatalf("No exporter is successfully setup")
glog.Fatalf("No exporter is successfully setup")
}
// Initialize NPD core.
p := problemdetector.NewProblemDetector(problemDaemons, npdExporters)
return p.Run(ctx)
return p.Run(termCh)
}

View File

@@ -0,0 +1,30 @@
/*
Copyright 2021 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"github.com/spf13/pflag"
"k8s.io/node-problem-detector/cmd/options"
)
func main() {
npdo := options.NewNodeProblemDetectorOptions()
npdo.AddFlags(pflag.CommandLine)
pflag.Parse()
npdInteractive(npdo)
}

View File

@@ -1,4 +1,3 @@
//go:build !disable_system_log_monitor
// +build !disable_system_log_monitor
/*
@@ -20,8 +19,9 @@ limitations under the License.
package main
import (
"context"
"errors"
"fmt"
"io/ioutil"
"os"
"strings"
"testing"
@@ -81,22 +81,24 @@ func TestNPDMain(t *testing.T) {
npdo, cleanup := setupNPD(t)
defer cleanup()
ctx, cancelFunc := context.WithCancel(context.Background())
cancelFunc()
if err := npdMain(ctx, npdo); err != nil {
termCh := make(chan error, 2)
termCh <- errors.New("close")
defer close(termCh)
if err := npdMain(npdo, termCh); err != nil {
t.Errorf("termination signal should not return error got, %v", err)
}
}
func writeTempFile(t *testing.T, ext string, contents string) (string, error) {
f, err := os.CreateTemp("", "*."+ext)
f, err := ioutil.TempFile("", "*."+ext)
if err != nil {
return "", fmt.Errorf("cannot create temp file, %v", err)
}
fileName := f.Name()
if err := os.WriteFile(fileName, []byte(contents), 0644); err != nil {
if err := ioutil.WriteFile(fileName, []byte(contents), 0644); err != nil {
os.Remove(fileName)
return "", fmt.Errorf("cannot write config to temp file %s, %v", fileName, err)
}

View File

@@ -1,50 +0,0 @@
//go:build unix
/*
Copyright 2021 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"context"
"flag"
"github.com/spf13/pflag"
"k8s.io/klog/v2"
"k8s.io/node-problem-detector/cmd/options"
)
func main() {
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
klog.InitFlags(klogFlags)
klogFlags.VisitAll(func(f *flag.Flag) {
switch f.Name {
case "v", "vmodule", "logtostderr":
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
}
})
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
pflag.CommandLine.MarkHidden("vmodule")
pflag.CommandLine.MarkHidden("logtostderr")
npdo := options.NewNodeProblemDetectorOptions()
npdo.AddFlags(pflag.CommandLine)
pflag.Parse()
if err := npdMain(context.Background(), npdo); err != nil {
klog.Fatalf("Problem detector failed with error: %v", err)
}
}

View File

@@ -17,17 +17,16 @@ limitations under the License.
package main
import (
"context"
"flag"
"errors"
"fmt"
"sync"
"time"
"github.com/golang/glog"
"github.com/spf13/pflag"
"golang.org/x/sys/windows/svc"
"golang.org/x/sys/windows/svc/debug"
"golang.org/x/sys/windows/svc/eventlog"
"k8s.io/klog/v2"
"k8s.io/node-problem-detector/cmd/options"
)
@@ -44,18 +43,6 @@
)
func main() {
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
klog.InitFlags(klogFlags)
klogFlags.VisitAll(func(f *flag.Flag) {
switch f.Name {
case "v", "vmodule", "logtostderr":
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
}
})
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
pflag.CommandLine.MarkHidden("vmodule")
pflag.CommandLine.MarkHidden("logtostderr")
npdo := options.NewNodeProblemDetectorOptions()
npdo.AddFlags(pflag.CommandLine)
@@ -75,7 +62,7 @@ func main() {
func isRunningAsWindowsService() bool {
runningAsService, err := svc.IsWindowsService()
if err != nil {
klog.Errorf("cannot determine if running as Windows Service assuming standalone, %v", err)
glog.Errorf("cannot determine if running as Windows Service assuming standalone, %v", err)
return false
}
return runningAsService
@@ -115,20 +102,26 @@ type npdService struct {
}
func (s *npdService) Execute(args []string, r <-chan svc.ChangeRequest, changes chan<- svc.Status) (bool, uint32) {
appTermCh := make(chan error, 1)
svcLoopTermCh := make(chan error, 1)
defer func() {
close(appTermCh)
close(svcLoopTermCh)
}()
changes <- svc.Status{State: svc.StartPending}
changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
var appWG sync.WaitGroup
var svcWG sync.WaitGroup
options := s.options
ctx, cancelFunc := context.WithCancel(context.Background())
// NPD application goroutine.
appWG.Add(1)
go func() {
defer appWG.Done()
if err := npdMain(ctx, options); err != nil {
if err := npdMain(options, appTermCh); err != nil {
elog.Warning(windowsEventLogID, err.Error())
}
@@ -139,36 +132,16 @@ func (s *npdService) Execute(args []string, r <-chan svc.ChangeRequest, changes
svcWG.Add(1)
go func() {
defer svcWG.Done()
for {
select {
case <-ctx.Done():
return
case c := <-r:
switch c.Cmd {
case svc.Interrogate:
changes <- c.CurrentStatus
// Testing deadlock from https://code.google.com/p/winsvc/issues/detail?id=4
time.Sleep(100 * time.Millisecond)
changes <- c.CurrentStatus
case svc.Stop, svc.Shutdown:
elog.Info(windowsEventLogID, fmt.Sprintf("Stopping %s service, %v", svcName, c.Context))
cancelFunc()
case svc.Pause:
elog.Info(windowsEventLogID, "ignoring pause command from Windows service control, not supported")
changes <- svc.Status{State: svc.Paused, Accepts: svcCommandsAccepted}
case svc.Continue:
elog.Info(windowsEventLogID, "ignoring continue command from Windows service control, not supported")
changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
default:
elog.Error(windowsEventLogID, fmt.Sprintf("unexpected control request #%d", c))
}
}
}
serviceLoop(r, changes, appTermCh, svcLoopTermCh)
}()
// Wait for the application go routine to die.
appWG.Wait()
// Ensure that the service control loop is killed.
svcLoopTermCh <- nil
// Wait for the service control loop to terminate.
// Otherwise it's possible that the channel closures cause the application to panic.
svcWG.Wait()
@@ -178,3 +151,31 @@ func (s *npdService) Execute(args []string, r <-chan svc.ChangeRequest, changes
return false, uint32(0)
}
func serviceLoop(r <-chan svc.ChangeRequest, changes chan<- svc.Status, appTermCh chan error, svcLoopTermCh chan error) {
for {
select {
case <-svcLoopTermCh:
return
case c := <-r:
switch c.Cmd {
case svc.Interrogate:
changes <- c.CurrentStatus
// Testing deadlock from https://code.google.com/p/winsvc/issues/detail?id=4
time.Sleep(100 * time.Millisecond)
changes <- c.CurrentStatus
case svc.Stop, svc.Shutdown:
elog.Info(windowsEventLogID, fmt.Sprintf("Stopping %s service, %v", svcName, c.Context))
appTermCh <- errors.New("stopping service")
case svc.Pause:
elog.Info(windowsEventLogID, "ignoring pause command from Windows service control, not supported")
changes <- svc.Status{State: svc.Paused, Accepts: svcCommandsAccepted}
case svc.Continue:
elog.Info(windowsEventLogID, "ignoring continue command from Windows service control, not supported")
changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
default:
elog.Error(windowsEventLogID, fmt.Sprintf("unexpected control request #%d", c))
}
}
}
}

View File

@@ -1,4 +1,3 @@
//go:build !disable_system_log_monitor
// +build !disable_system_log_monitor
/*

View File

@@ -1,4 +1,3 @@
//go:build !disable_custom_plugin_monitor
// +build !disable_custom_plugin_monitor
/*

View File

@@ -1,4 +1,3 @@
//go:build !disable_system_log_monitor
// +build !disable_system_log_monitor
/*

View File

@@ -1,4 +1,3 @@
//go:build !disable_system_stats_monitor
// +build !disable_system_stats_monitor
/*

View File

@@ -43,10 +43,6 @@ type NodeProblemDetectorOptions struct {
ServerPort int
// ServerAddress is the address to bind the node problem detector server.
ServerAddress string
// QPS is the maximum QPS to the master from client.
QPS float32
// Burst is the maximum burst for throttle.
Burst int
// exporter options
@@ -65,10 +61,6 @@ type NodeProblemDetectorOptions struct {
APIServerWaitInterval time.Duration
// K8sExporterHeartbeatPeriod is the period at which the k8s exporter does forcibly sync with apiserver.
K8sExporterHeartbeatPeriod time.Duration
// K8sExporterWriteEvents determines whether to write Kubernetes Events for problems.
K8sExporterWriteEvents bool
// K8sExporterUpdateNodeConditions determines whether to update Kubernetes Node Conditions for problems.
K8sExporterUpdateNodeConditions bool
// prometheusExporter options
// PrometheusServerPort is the port to bind the Prometheus scrape endpoint. Use 0 to disable.
@ -121,8 +113,6 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) {
fs.DurationVar(&npdo.APIServerWaitTimeout, "apiserver-wait-timeout", time.Duration(5)*time.Minute, "The timeout on waiting for kube-apiserver to be ready. This is ignored if --enable-k8s-exporter is false.")
fs.DurationVar(&npdo.APIServerWaitInterval, "apiserver-wait-interval", time.Duration(5)*time.Second, "The interval between the checks on the readiness of kube-apiserver. This is ignored if --enable-k8s-exporter is false.")
fs.DurationVar(&npdo.K8sExporterHeartbeatPeriod, "k8s-exporter-heartbeat-period", 5*time.Minute, "The period at which k8s-exporter does forcibly sync with apiserver.")
fs.BoolVar(&npdo.K8sExporterWriteEvents, "k8s-exporter-write-events", true, "Whether to write Kubernetes Event objects with event details.")
fs.BoolVar(&npdo.K8sExporterUpdateNodeConditions, "k8s-exporter-update-node-conditions", true, "Whether to update Kubernetes Node conditions with event details.")
fs.BoolVar(&npdo.PrintVersion, "version", false, "Print version information and quit")
fs.StringVar(&npdo.HostnameOverride, "hostname-override",
"", "Custom node name used to override hostname")
@ -135,8 +125,6 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) {
20257, "The port to bind the Prometheus scrape endpoint. Prometheus exporter is enabled by default at port 20257. Use 0 to disable.")
fs.StringVar(&npdo.PrometheusServerAddress, "prometheus-address",
"127.0.0.1", "The address to bind the Prometheus scrape endpoint.")
fs.Float32Var(&npdo.QPS, "kube-api-qps", 500, "Maximum QPS to use while talking with Kubernetes API")
fs.IntVar(&npdo.Burst, "kube-api-burst", 500, "Maximum burst for throttle while talking with Kubernetes API")
for _, exporterName := range exporters.GetExporterNames() {
exporterHandler := exporters.GetExporterHandlerOrDie(exporterName)
exporterHandler.Options.SetFlags(fs)

View File

@ -31,7 +31,7 @@
},
{
"type": "temporary",
"reason": "KernelOops",
"reason": "Kerneloops",
"pattern": "System encountered a non-fatal error in \\S+"
}
]

View File

@ -1,28 +0,0 @@
{
"plugin": "filelog",
"pluginConfig": {
"timestamp": "^.{15}",
"message": "(?i)Currently unreadable.*sectors|(?i)Offline uncorrectable sectors",
"timestampFormat": "Jan _2 15:04:05"
},
"logPath": "/var/log/messages",
"lookback": "10h",
"bufferSize": 1,
"source": "disk-monitor",
"skipList": [ " audit:", " audit[" ],
"conditions": [
{
"type": "DiskBadBlock",
"reason": "DiskBadBlock",
"message": "Disk no bad block"
}
],
"rules": [
{
"type": "permanent",
"condition": "DiskBadBlock",
"reason": "DiskBadBlock",
"pattern": ".*([1-9]\\d{2,}) (Currently unreadable.*sectors|Offline uncorrectable sectors).*"
}
]
}

View File

@ -25,7 +25,6 @@
"--component=kubelet",
"--enable-repair=true",
"--cooldown-time=1m",
"--loopback-time=0",
"--health-check-timeout=10s"
],
"timeout": "3m"

View File

@ -1,20 +0,0 @@
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "86400s",
"timeout": "5s",
"max_output_length": 80,
"concurrency": 1
},
"source": "iptables-mode-monitor",
"metricsReporting": true,
"conditions": [],
"rules": [
{
"type": "temporary",
"reason": "IPTablesVersionsMismatch",
"path": "./config/plugin/iptables_mode.sh",
"timeout": "5s"
}
]
}

View File

@ -42,6 +42,12 @@
"reason": "KernelOops",
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
},
{
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "AUFSUmountHung",
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "KernelDeadlock",

View File

@ -12,14 +12,9 @@
"message": "kernel has no deadlock"
},
{
"type": "XfsShutdown",
"reason": "XfsHasNotShutDown",
"message": "XFS has not shutdown"
},
{
"type": "CperHardwareErrorFatal",
"reason": "CperHardwareHasNoFatalError",
"message": "UEFI CPER has no fatal error"
"type": "ReadonlyFilesystem",
"reason": "FilesystemIsNotReadOnly",
"message": "Filesystem is not read-only"
}
],
"rules": [
@ -63,38 +58,28 @@
"reason": "IOError",
"pattern": "Buffer I/O error .*"
},
{
"type": "permanent",
"condition": "XfsShutdown",
"reason": "XfsHasShutdown",
"pattern": "XFS .* Shutting down filesystem.?"
},
{
"type": "temporary",
"reason": "MemoryReadError",
"pattern": "CE memory read error .*"
},
{
"type": "temporary",
"reason": "CperHardwareErrorCorrected",
"pattern": ".*\\[Hardware Error\\]: event severity: corrected$"
},
{
"type": "temporary",
"reason": "CperHardwareErrorRecoverable",
"pattern": ".*\\[Hardware Error\\]: event severity: recoverable$"
},
{
"type": "permanent",
"condition": "CperHardwareErrorFatal",
"reason": "CperHardwareErrorFatal",
"pattern": ".*\\[Hardware Error\\]: event severity: fatal$"
"condition": "KernelDeadlock",
"reason": "AUFSUmountHung",
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "DockerHung",
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "ReadonlyFilesystem",
"reason": "FilesystemIsReadOnly",
"pattern": "Remounting filesystem read-only"
}
]
}

View File

@ -1,6 +1,5 @@
{
"net": {
"excludeInterfaceRegexp": "^(cali|tunl|veth)",
"metricsConfigs": {
"net/rx_bytes": {
"displayName": "net/rx_bytes"

View File

@ -20,7 +20,8 @@ if systemctl -q is-active "$SERVICE"; then
echo "$SERVICE is running"
exit $OK
else
# Does not differentiate stopped/failed service from non-existent
# Does not differenciate stopped/failed service from non-existent
echo "$SERVICE is not running"
exit $NONOK
fi

View File

@ -1,30 +0,0 @@
#!/bin/bash
# As of iptables 1.8, the iptables command line clients come in two different versions/modes: "legacy",
# which uses the kernel iptables API just like iptables 1.6 and earlier did, and "nft", which translates
# the iptables command-line API into the kernel nftables API.
# Because they connect to two different subsystems in the kernel, you cannot mix rules from different versions.
# Ref: https://github.com/kubernetes-sigs/iptables-wrappers
readonly OK=0
readonly NONOK=1
readonly UNKNOWN=2
# based on: https://github.com/kubernetes-sigs/iptables-wrappers/blob/97b01f43a8e8db07840fc4b95e833a37c0d36b12/iptables-wrapper-installer.sh
readonly num_legacy_lines=$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep -c '^-' || true)
readonly num_nft_lines=$( (timeout 5 sh -c "iptables-nft-save; ip6tables-nft-save" || true) 2>/dev/null | grep -c '^-' || true)
if [ "$num_legacy_lines" -gt 0 ] && [ "$num_nft_lines" -gt 0 ]; then
echo "Found rules from both versions, iptables-legacy: ${num_legacy_lines} iptables-nft: ${num_nft_lines}"
echo $NONOK
elif [ "$num_legacy_lines" -gt 0 ] && [ "$num_nft_lines" -eq 0 ]; then
echo "Using iptables-legacy: ${num_legacy_lines} rules"
echo $OK
elif [ "$num_legacy_lines" -eq 0 ] && [ "$num_nft_lines" -gt 0 ]; then
echo "Using iptables-nft: ${num_nft_lines} rules"
echo $OK
else
echo "No iptables rules found"
echo $UNKNOWN
fi
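For context on the exit codes above (OK=0, NONOK=1, UNKNOWN=2): a minimal Go sketch of how a caller could map a plugin's exit status to a result. The helper name and mapping illustrate the convention used by these scripts, not the NPD implementation:
```go
package main

import (
	"fmt"
	"os/exec"
)

// mapExitStatus follows the convention used by the plugin scripts above:
// exit 0 means OK, 1 means NonOK, and anything else means Unknown.
func mapExitStatus(code int) string {
	switch code {
	case 0:
		return "OK"
	case 1:
		return "NonOK"
	default:
		return "Unknown"
	}
}

func main() {
	// Run a command that exits with status 1 (a stand-in for a plugin).
	err := exec.Command("/bin/sh", "-c", "exit 1").Run()
	code := 0
	if exitErr, ok := err.(*exec.ExitError); ok {
		code = exitErr.ExitCode()
	}
	fmt.Println(mapExitStatus(code)) // NonOK
}
```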

View File

@ -1,23 +0,0 @@
{
"plugin": "kmsg",
"logPath": "/dev/kmsg",
"lookback": "5m",
"bufferSize": 10,
"source": "readonly-monitor",
"metricsReporting": true,
"conditions": [
{
"type": "ReadonlyFilesystem",
"reason": "FilesystemIsNotReadOnly",
"message": "Filesystem is not read-only"
}
],
"rules": [
{
"type": "permanent",
"condition": "ReadonlyFilesystem",
"reason": "FilesystemIsReadOnly",
"pattern": "Remounting filesystem read-only"
}
]
}

View File

@ -44,9 +44,6 @@
"disk/bytes_used": {
"displayName": "disk/bytes_used"
},
"disk/percent_used": {
"displayName": "disk/percent_used"
},
"disk/io_time": {
"displayName": "disk/io_time"
},
@ -91,9 +88,6 @@
},
"memory/unevictable_used": {
"displayName": "memory/unevictable_used"
},
"memory/percent_used": {
"displayName": "memory/percent_used"
}
}
},

View File

@ -37,8 +37,7 @@
"--lookback=20m",
"--delay=5m",
"--count=5",
"--pattern=Started (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet).",
"--revert-pattern=Stopping (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet)..."
"--pattern=Started Kubernetes kubelet."
],
"timeout": "1m"
},
@ -52,8 +51,7 @@
"--log-path=/var/log/journal",
"--lookback=20m",
"--count=5",
"--pattern=Starting (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)...",
"--revert-pattern=Stopping (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)..."
"--pattern=Starting Docker Application Container Engine..."
],
"timeout": "1m"
},
@ -67,8 +65,7 @@
"--log-path=/var/log/journal",
"--lookback=20m",
"--count=5",
"--pattern=Starting (containerd container runtime|containerd.service|containerd.service - containerd container runtime)...",
"--revert-pattern=Stopping (containerd container runtime|containerd.service|containerd.service - containerd container runtime)..."
"--pattern=Starting containerd container runtime..."
],
"timeout": "1m"
}

View File

@ -13,17 +13,17 @@
{
"type": "temporary",
"reason": "KubeletStart",
"pattern": "Started (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet)."
"pattern": "Started Kubernetes kubelet."
},
{
"type": "temporary",
"reason": "DockerStart",
"pattern": "Starting (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)..."
"pattern": "Starting Docker Application Container Engine..."
},
{
"type": "temporary",
"reason": "ContainerdStart",
"pattern": "Starting (containerd container runtime|containerd.service|containerd.service - containerd container runtime)..."
"pattern": "Starting containerd container runtime..."
}
]
}

View File

@ -8,7 +8,7 @@ Restart=always
RestartSec=10
ExecStart=/home/kubernetes/bin/node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false \
--exporter.stackdriver=/home/kubernetes/node-problem-detector/config/exporter/stackdriver-exporter.json \
--config.system-log-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor.json,/home/kubernetes/node-problem-detector/config/readonly-monitor.json,/home/kubernetes/node-problem-detector/config/docker-monitor.json,/home/kubernetes/node-problem-detector/config/systemd-monitor.json \
--config.system-log-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor.json,/home/kubernetes/node-problem-detector/config/docker-monitor.json,/home/kubernetes/node-problem-detector/config/systemd-monitor.json \
--config.custom-plugin-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor-counter.json,/home/kubernetes/node-problem-detector/config/systemd-monitor-counter.json \
--config.system-stats-monitor=/home/kubernetes/node-problem-detector/config/system-stats-monitor.json,/home/kubernetes/node-problem-detector/config/net-cgroup-system-stats-monitor.json

View File

@ -20,11 +20,6 @@
"type": "temporary",
"reason": "CorruptContainerImageLayer",
"pattern": ".*failed to pull and unpack image.*failed to extract layer.*archive/tar: invalid tar header.*"
},
{
"type": "temporary",
"reason": "HCSEmptyLayerchain",
"pattern": ".*Failed to unmarshall layerchain json - invalid character '\\x00' looking for beginning of value*"
}
]
}

View File

@ -13,7 +13,7 @@
{
"type": "temporary",
"reason": "WindowsDefenderThreatsDetected",
"path": "C:\\etc\\kubernetes\\node-problem-detector\\config\\plugin\\windows_defender_problem.ps1",
"path": "./config/plugin/windows_defender_problem.ps1",
"timeout": "3s"
}
]

View File

@ -44,9 +44,6 @@
"disk/bytes_used": {
"displayName": "disk/bytes_used"
},
"disk/percent_used": {
"displayName": "disk/percent_used"
},
"disk/io_time": {
"displayName": "disk/io_time"
},
@ -91,9 +88,6 @@
},
"memory/unevictable_used": {
"displayName": "memory/unevictable_used"
},
"memory/percent_used": {
"displayName": "memory/percent_used"
}
}
}

View File

@ -53,33 +53,15 @@ data:
{
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "DockerHung",
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
"reason": "AUFSUmountHung",
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "ReadonlyFilesystem",
"reason": "FilesystemIsReadOnly",
"pattern": "Remounting filesystem read-only"
}
]
}
readonly-monitor.json: |
{
"plugin": "kmsg",
"logPath": "/dev/kmsg",
"lookback": "5m",
"bufferSize": 10,
"source": "readonly-monitor",
"metricsReporting": true,
"conditions": [
{
"type": "ReadonlyFilesystem",
"reason": "FilesystemIsNotReadOnly",
"message": "Filesystem is not read-only"
}
],
"rules": [
"condition": "KernelDeadlock",
"reason": "DockerHung",
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "ReadonlyFilesystem",

View File

@ -1,104 +0,0 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-problem-detector
namespace: kube-system
labels:
app: node-problem-detector
spec:
selector:
matchLabels:
app: node-problem-detector
template:
metadata:
labels:
app: node-problem-detector
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/os
operator: In
values:
- linux
containers:
- name: node-problem-detector
command:
- /node-problem-detector
- --logtostderr
- --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json,/config/docker-monitor.json
- --config.custom-plugin-monitor=/config/health-checker-kubelet.json
image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19
resources:
limits:
cpu: 10m
memory: 80Mi
requests:
cpu: 10m
memory: 80Mi
imagePullPolicy: Always
securityContext:
privileged: true
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- name: log
mountPath: /var/log
readOnly: true
- name: kmsg
mountPath: /dev/kmsg
readOnly: true
# Make sure node problem detector is in the same timezone
# as the host.
- name: localtime
mountPath: /etc/localtime
readOnly: true
- name: config
mountPath: /config
readOnly: true
- mountPath: /etc/machine-id
name: machine-id
readOnly: true
- mountPath: /run/systemd/system
name: systemd
- mountPath: /var/run/dbus/
name: dbus
mountPropagation: Bidirectional
volumes:
- name: log
# Config `log` to your system log directory
hostPath:
path: /var/log/
- name: kmsg
hostPath:
path: /dev/kmsg
- name: localtime
hostPath:
path: /etc/localtime
- name: config
configMap:
name: node-problem-detector-config
items:
- key: kernel-monitor.json
path: kernel-monitor.json
- key: readonly-monitor.json
path: readonly-monitor.json
- key: docker-monitor.json
path: docker-monitor.json
- name: machine-id
hostPath:
path: /etc/machine-id
type: "File"
- name: systemd
hostPath:
path: /run/systemd/system/
type: ""
- name: dbus
hostPath:
path: /var/run/dbus/
type: ""

View File

@ -28,8 +28,8 @@ spec:
command:
- /node-problem-detector
- --logtostderr
- --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json,/config/docker-monitor.json
image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19
- --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7
resources:
limits:
cpu: 10m
@ -60,7 +60,6 @@ spec:
- name: config
mountPath: /config
readOnly: true
serviceAccountName: node-problem-detector
volumes:
- name: log
# Config `log` to your system log directory
@ -78,8 +77,6 @@ spec:
items:
- key: kernel-monitor.json
path: kernel-monitor.json
- key: readonly-monitor.json
path: readonly-monitor.json
- key: docker-monitor.json
path: docker-monitor.json
tolerations:

View File

@ -1,19 +0,0 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-problem-detector
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: npd-binding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:node-problem-detector
subjects:
- kind: ServiceAccount
name: node-problem-detector
namespace: kube-system

View File

@ -1,62 +1,9 @@
# Custom Plugin Monitor
## Configuration
### Plugin Config
* `invoke_interval`: Interval at which custom plugins will be invoked.
* `timeout`: Time after which custom plugins invocation will be terminated and considered timeout.
* `timeout`: Time after which custom plugins invokation will be terminated and considered timeout.
* `max_output_length`: The maximum length of a custom plugin's standard output that NPD will keep and use as the condition status message.
* `concurrency`: The plugin worker number, i.e., how many custom plugins will be invoked concurrently.
* `enable_message_change_based_condition_update`: Flag controls whether message change should result in a condition update.
* `skip_initial_status`: Flag controls whether condition will be emitted during plugin initialization.
### Annotated Plugin Configuration Example
```
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "30s",
"timeout": "5s",
"max_output_length": 80,
"concurrency": 3,
"enable_message_change_based_condition_update": false
},
"source": "ntp-custom-plugin-monitor",
"metricsReporting": true,
"conditions": [
{
"type": "NTPProblem",
"reason": "NTPIsUp", // This is the default reason shown when healthy
"message": "ntp service is up" // This is the default message shown when healthy
}
],
"rules": [
{
"type": "temporary", // These are not shown unless there's an
// event so they always relate to a problem.
// There are no defaults since there is nothing
// to show unless there's a problem.
"reason": "NTPIsDown", // This is the reason shown for this event
// and the message shown comes from stdout.
"path": "./config/plugin/check_ntp.sh",
"timeout": "3s"
},
{
"type": "permanent", // These are permanent and are shown in the Conditions section
// when running `kubectl describe node ...`
// They have default values shown above in the conditions section
// and also a reason for each specific trigger listed in this rules section.
// Message will come from default for healthy times
// and during unhealthy time message comes from stdout of the check.
"condition": "NTPProblem", // This is the key to connect to the corresponding condition listed above
"reason": "NTPIsDown", // and the reason shown for failures detected in this rule
// and message will be from stdout of the check.
"path": "./config/plugin/check_ntp.sh",
"timeout": "3s"
}
]
}
```
* `enable_message_change_based_condition_update`: Flag controls whether message change should result in a condition update.

View File

@ -4,12 +4,6 @@ These are notes to help follow a consistent release process. See something
important missing? Please submit a pull request to add anything else that would
be useful!
## Prerequisites
Ensure access to the container image [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector).
Add email to `k8s-infra-staging-npd` group in sig-node [groups.yaml](https://github.com/kubernetes/k8s.io/blob/main/groups/sig-node/groups.yaml).
See example https://github.com/kubernetes/k8s.io/pull/1599.
## Preparing for a release
There are a few steps that should be taken prior to creating the actual release
@ -17,100 +11,37 @@ itself.
1. Collect changes since last release. This can be done by looking directly at
merged commit messages (``git log [last_release_tag]...HEAD``), or by
viewing the changes on GitHub (example: https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...master).
viewing the changes on GitHub ([example:
https://github.com/kubernetes/node-problem-detector/compare/v0.8.6...master](https://github.com/kubernetes/node-problem-detector/compare/v0.8.6...master)).
2. Based on the changes to be included in the release, determine what the next
1. Based on the changes to be included in the release, determine what the next
release number should be. We strive to follow [SemVer](https://semver.org/)
as much as possible.
3. Update [CHANGELOG](https://github.com/kubernetes/node-problem-detector/blob/master/CHANGELOG.md)
1. Update [CHANGELOG](https://github.com/kubernetes/node-problem-detector/blob/master/CHANGELOG.md)
with all significant changes.
## Create release
### Create the new version tag
#### Option 1
```
# Use v0.8.17 as an example.
git clone git@github.com:kubernetes/node-problem-detector.git
cd node-problem-detector/
git tag v0.8.17
git push origin v0.8.17
```
#### Option 2
Update [version.txt](https://github.com/kubernetes/node-problem-detector/blob/master/version.txt)
(example https://github.com/kubernetes/node-problem-detector/pull/869).
### Build and push artifacts
This step builds the NPD into container files and tar files.
- The container file is pushed to the [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector).
You will promote the new image to registry.k8s.io later.
- The tar files are generated locally. You will upload those to github in the
release note later.
**Note: You need the access mentioned in the [prerequisites](#prerequisites)
section to perform steps in this section.**
```
# One-time setup
sudo apt-get install libsystemd-dev gcc-aarch64-linux-gnu
cd node-problem-detector
make release
# Get SHA256 of the tar files. For example
sha256sum node-problem-detector-v0.8.17-linux_amd64.tar.gz
sha256sum node-problem-detector-v0.8.17-linux_arm64.tar.gz
sha256sum node-problem-detector-v0.8.17-windows_amd64.tar.gz
# Get MD5 of the tar files. For example
md5sum node-problem-detector-v0.8.17-linux_amd64.tar.gz
md5sum node-problem-detector-v0.8.17-linux_arm64.tar.gz
md5sum node-problem-detector-v0.8.17-windows_amd64.tar.gz
# Verify container image in staging registry and get SHA256.
docker pull gcr.io/k8s-staging-npd/node-problem-detector:v0.8.17
docker image ls gcr.io/k8s-staging-npd/node-problem-detector --digests
```
### Promote new NPD image to registry.k8s.io
1. Get the SHA256 from the new NPD image from the [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector)
or previous step.
2. Promote the NPD image to registry.k8s.io ([images.yaml](https://github.com/kubernetes/k8s.io/blob/main/registry.k8s.io/images/k8s-staging-npd/images.yaml), example https://github.com/kubernetes/k8s.io/pull/6523).
3. Verify the container image.
```
docker pull registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17
docker image ls registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17
```
### Create the release note
Go to https://github.com/kubernetes/node-problem-detector/releases, draft a new
release note and publish. Make sure to include the following in the body of the
release note:
Once changes have been merged to the CHANGELOG, perform the actual release via
GitHub. When creating the release, make sure to include the following in the
body of the release:
1. For convenience, add a link to easily view the changes since the last
release (e.g.
[https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...v0.8.17](https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...v0.8.17)).
[https://github.com/kubernetes/node-problem-detector/compare/v0.8.5...v0.8.6](https://github.com/kubernetes/node-problem-detector/compare/v0.8.5...v0.8.6)).
2. There is no need to duplicate everything from the CHANGELOG, but include the
1. There is no need to duplicate everything from the CHANGELOG, but include the
most significant things so someone just viewing the release entry will have
an idea of what it includes.
3. Provide a link to the new image release (e.g. `Image:
registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17`)
4. Upload the tar files built from [pevious step](#build-and-push-artifacts),
and include the SHA and MD5.
1. Provide a link to the new image release (e.g. `Image:
k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.6`)
## Post release steps
1. Update image version in node-problem-detector repo, so anyone deploying
directly from the repo deployment file will get the newest image deployed.
Example https://github.com/kubernetes/node-problem-detector/pull/897.
1. Update image version in
[deployment/node-problem-detector.yaml](https://github.com/kubernetes/node-problem-detector/blob/422c088d623488be33aa697588655440c4e6a063/deployment/node-problem-detector.yaml#L32).
2. Update the NPD version in [kubernetes/kubernetes](https://github.com/kubernetes/kubernetes)
repo, so that kubernetes clusters use the new NPD version. Example
https://github.com/kubernetes/kubernetes/pull/123740.
Update the image version in the deployment file so anyone deploying directly
from the repo deployment file will get the newest image deployed.

go.mod
View File

@ -1,110 +1,41 @@
module k8s.io/node-problem-detector
go 1.24.2
go 1.15
require (
cloud.google.com/go/compute/metadata v0.6.0
contrib.go.opencensus.io/exporter/prometheus v0.4.2
contrib.go.opencensus.io/exporter/stackdriver v0.13.14
github.com/acobaugh/osrelease v0.1.0
github.com/avast/retry-go/v4 v4.6.1
github.com/coreos/go-systemd/v22 v22.5.0
cloud.google.com/go v0.45.1
code.cloudfoundry.org/clock v0.0.0-20180518195852-02e53af36e6c
contrib.go.opencensus.io/exporter/prometheus v0.0.0-20190427222117-f6cda26f80a3
contrib.go.opencensus.io/exporter/stackdriver v0.13.4
github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705 // indirect
github.com/avast/retry-go v2.4.1+incompatible
github.com/cobaugh/osrelease v0.0.0-20181218015638-a93a0a55a249
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e
github.com/euank/go-kmsg-parser v2.0.0+incompatible
github.com/go-ole/go-ole v1.2.4 // indirect
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
github.com/google/cadvisor v0.36.0
github.com/hpcloud/tail v1.0.0
github.com/prometheus/client_model v0.6.2
github.com/prometheus/common v0.63.0
github.com/prometheus/procfs v0.16.1
github.com/shirou/gopsutil/v3 v3.24.5
github.com/spf13/pflag v1.0.6
github.com/stretchr/testify v1.10.0
go.opencensus.io v0.24.0
golang.org/x/sys v0.32.0
google.golang.org/api v0.230.0
k8s.io/api v0.33.0
k8s.io/apimachinery v0.33.0
k8s.io/client-go v0.33.0
k8s.io/klog/v2 v2.130.1
k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e
github.com/onsi/ginkgo v1.10.3
github.com/onsi/gomega v1.7.1
github.com/pborman/uuid v1.2.0
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4
github.com/prometheus/common v0.4.1
github.com/prometheus/procfs v0.2.0
github.com/shirou/gopsutil v2.19.12+incompatible
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.6.1
github.com/tedsuo/ifrit v0.0.0-20180802180643-bea94bb476cc // indirect
go.opencensus.io v0.22.4
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45
golang.org/x/sys v0.0.0-20201211090839-8ad439b19e0f
google.golang.org/api v0.10.0
k8s.io/api v0.0.0-20190816222004-e3a6b8045b0b
k8s.io/apimachinery v0.0.0-20190816221834-a9f1d8a9c101
k8s.io/client-go v11.0.1-0.20190805182717-6502b5e7b1b5+incompatible
k8s.io/heapster v0.0.0-20180704153620-b25f8a16208f
k8s.io/kubernetes v1.14.6
k8s.io/test-infra v0.0.0-20190914015041-e1cbc3ccd91c
)
require (
cloud.google.com/go/auth v0.16.0 // indirect
cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
cloud.google.com/go/monitoring v1.20.3 // indirect
cloud.google.com/go/trace v1.10.11 // indirect
github.com/aws/aws-sdk-go v1.44.72 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-kit/log v0.2.1 // indirect
github.com/go-logfmt/logfmt v0.5.1 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.6.9 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/s2a-go v0.1.9 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect
github.com/googleapis/gax-go/v2 v2.14.1 // indirect
github.com/jmespath/go-jmespath v0.4.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
github.com/prometheus/client_golang v1.20.4 // indirect
github.com/prometheus/prometheus v0.35.0 // indirect
github.com/prometheus/statsd_exporter v0.22.7 // indirect
github.com/shoenig/go-m1cpu v0.1.6 // indirect
github.com/tklauser/go-sysconf v0.3.12 // indirect
github.com/tklauser/numcpus v0.6.1 // indirect
github.com/x448/float16 v0.8.4 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
go.opentelemetry.io/otel v1.35.0 // indirect
go.opentelemetry.io/otel/metric v1.35.0 // indirect
go.opentelemetry.io/otel/trace v1.35.0 // indirect
golang.org/x/crypto v0.37.0 // indirect
golang.org/x/net v0.39.0 // indirect
golang.org/x/oauth2 v0.29.0 // indirect
golang.org/x/sync v0.13.0 // indirect
golang.org/x/term v0.31.0 // indirect
golang.org/x/text v0.24.0 // indirect
golang.org/x/time v0.11.0 // indirect
google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250218202821-56aae31c358a // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e // indirect
google.golang.org/grpc v1.72.0 // indirect
google.golang.org/protobuf v1.36.6 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/fsnotify.v1 v1.4.7 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
)
replace git.apache.org/thrift.git => github.com/apache/thrift v0.0.0-20180902110319-2566ecd5d999

go.sum

File diff suppressed because it is too large

View File

@ -1,46 +0,0 @@
#!/bin/bash
# Copyright 2024 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -o errexit
set -o nounset
set -o pipefail
VERSION="$1"
NPD_LINUX_AMD64=node-problem-detector-${VERSION}-linux_amd64.tar.gz
NPD_LINUX_ARM64=node-problem-detector-${VERSION}-linux_arm64.tar.gz
NPD_WINDOWS_AMD64=node-problem-detector-${VERSION}-windows_amd64.tar.gz
SHA_NPD_LINUX_AMD64=$(sha256sum ${NPD_LINUX_AMD64} | cut -d' ' -f1)
SHA_NPD_LINUX_ARM64=$(sha256sum ${NPD_LINUX_ARM64} | cut -d' ' -f1)
SHA_NPD_WINDOWS_AMD64=$(sha256sum ${NPD_WINDOWS_AMD64} | cut -d' ' -f1)
MD5_NPD_LINUX_AMD64=$(md5sum ${NPD_LINUX_AMD64} | cut -d' ' -f1)
MD5_NPD_LINUX_ARM64=$(md5sum ${NPD_LINUX_ARM64} | cut -d' ' -f1)
MD5_NPD_WINDOWS_AMD64=$(md5sum ${NPD_WINDOWS_AMD64} | cut -d' ' -f1)
echo
echo "**${NPD_LINUX_AMD64}**:"
echo "**SHA**: ${SHA_NPD_LINUX_AMD64}"
echo "**MD5**: ${MD5_NPD_LINUX_AMD64}"
echo
echo "**${NPD_LINUX_ARM64}**:"
echo "**SHA**: ${SHA_NPD_LINUX_ARM64}"
echo "**MD5**: ${MD5_NPD_LINUX_ARM64}"
echo
echo "**${NPD_WINDOWS_AMD64}**:"
echo "**SHA**: ${SHA_NPD_WINDOWS_AMD64}"
echo "**MD5**: ${MD5_NPD_WINDOWS_AMD64}"

View File

@ -1,32 +0,0 @@
#!/bin/bash -xe
# Copyright 2023 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
VERSION=$(cat version.txt)
if [[ ! "${VERSION}" =~ ^v([0-9]+[.][0-9]+)[.]([0-9]+)(-(alpha|beta)[.]([0-9]+))?$ ]]; then
echo "Version ${VERSION} must be 'X.Y.Z', 'X.Y.Z-alpha.N', or 'X.Y.Z-beta.N'"
exit 1
fi
if [ "$(git tag -l "${VERSION}")" ]; then
echo "Tag ${VERSION} already exists"
exit 1
fi
git tag -a -m "Release ${VERSION}" "${VERSION}"
git push origin "${VERSION}"
echo "release_tag=refs/tags/${VERSION}" >> $GITHUB_OUTPUT

View File

@ -1,30 +0,0 @@
#!/bin/bash
# Copyright 2024 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -o errexit
set -o nounset
set -o pipefail
make gomod
changes=$(git status --porcelain go.mod go.sum vendor/ tests/e2e/go.mod tests/e2e/go.sum || true)
if [ -n "${changes}" ]; then
echo "ERROR: go modules are not up to date; please run: make gomod"
echo "changed files:"
printf "%s" "${changes}\n"
echo "git diff:"
git --no-pager diff
exit 1
fi

View File

@ -18,10 +18,10 @@ package custompluginmonitor
import (
"encoding/json"
"os"
"io/ioutil"
"time"
"k8s.io/klog/v2"
"github.com/golang/glog"
"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
@ -47,6 +47,7 @@ type customPluginMonitor struct {
config cpmtypes.CustomPluginConfig
conditions []types.Condition
plugin *plugin.Plugin
resultChan <-chan cpmtypes.Result
statusChan chan *types.Status
tomb *tomb.Tomb
}
@ -57,27 +58,27 @@ func NewCustomPluginMonitorOrDie(configPath string) types.Monitor {
configPath: configPath,
tomb: tomb.NewTomb(),
}
f, err := os.ReadFile(configPath)
f, err := ioutil.ReadFile(configPath)
if err != nil {
klog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
glog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
}
err = json.Unmarshal(f, &c.config)
if err != nil {
klog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
glog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
}
// Apply configurations
err = (&c.config).ApplyConfiguration()
if err != nil {
klog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
glog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
}
// Validate configurations
err = c.config.Validate()
if err != nil {
klog.Fatalf("Failed to validate custom plugin config %+v: %v", c.config, err)
glog.Fatalf("Failed to validate custom plugin config %+v: %v", c.config, err)
}
klog.Infof("Finish parsing custom plugin monitor config file %s: %+v", c.configPath, c.config)
glog.Infof("Finish parsing custom plugin monitor config file %s: %+v", c.configPath, c.config)
c.plugin = plugin.NewPlugin(c.config)
// A 1000 size channel should be big enough.
@ -96,39 +97,32 @@ func initializeProblemMetricsOrDie(rules []*cpmtypes.CustomRule) {
if rule.Type == types.Perm {
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, false)
if err != nil {
klog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
glog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
rule.Condition, rule.Reason, err)
}
}
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 0)
if err != nil {
klog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
glog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
}
}
}
func (c *customPluginMonitor) Start() (<-chan *types.Status, error) {
klog.Infof("Start custom plugin monitor %s", c.configPath)
glog.Infof("Start custom plugin monitor %s", c.configPath)
go c.plugin.Run()
go c.monitorLoop()
return c.statusChan, nil
}
func (c *customPluginMonitor) Stop() {
klog.Infof("Stop custom plugin monitor %s", c.configPath)
glog.Infof("Stop custom plugin monitor %s", c.configPath)
c.tomb.Stop()
}
// monitorLoop is the main loop of customPluginMonitor.
// There is one customPluginMonitor and one plugin instance per configPath.
// Each runs its rules in parallel at the pre-configured concurrency and interval.
func (c *customPluginMonitor) monitorLoop() {
c.initializeConditions()
if *c.config.PluginGlobalConfig.SkipInitialStatus {
klog.Infof("Skipping sending initial status. Using default conditions: %+v", c.conditions)
} else {
c.sendInitialStatus()
}
c.initializeStatus()
resultChan := c.plugin.GetResultChan()
@ -136,16 +130,16 @@ func (c *customPluginMonitor) monitorLoop() {
select {
case result, ok := <-resultChan:
if !ok {
klog.Errorf("Result channel closed: %s", c.configPath)
glog.Errorf("Result channel closed: %s", c.configPath)
return
}
klog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
glog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
status := c.generateStatus(result)
klog.V(3).Infof("New status generated: %+v", status)
glog.V(3).Infof("New status generated: %+v", status)
c.statusChan <- status
case <-c.tomb.Stopping():
c.plugin.Stop()
klog.Infof("Custom plugin monitor stopped: %s", c.configPath)
glog.Infof("Custom plugin monitor stopped: %s", c.configPath)
c.tomb.Done()
return
}
@ -238,7 +232,6 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
condition.Type,
status,
newReason,
newMessage,
timestamp,
)
@ -259,7 +252,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(
event.Reason, 1)
if err != nil {
klog.Errorf("Failed to update problem counter metrics for %q: %v",
glog.Errorf("Failed to update problem counter metrics for %q: %v",
event.Reason, err)
}
}
@ -267,7 +260,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(
condition.Type, condition.Reason, condition.Status == types.True)
if err != nil {
klog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
condition.Type, condition.Reason, err)
}
}
@ -280,7 +273,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
}
// Log only if condition has changed
if len(activeProblemEvents) != 0 || len(inactiveProblemEvents) != 0 {
klog.V(0).Infof("New status generated: %+v", status)
glog.V(0).Infof("New status generated: %+v", status)
}
return status
}
@ -296,9 +289,11 @@ func toConditionStatus(s cpmtypes.Status) types.ConditionStatus {
}
}
// sendInitialStatus sends the initial status to the node problem detector.
func (c *customPluginMonitor) sendInitialStatus() {
klog.Infof("Sending initial status for %s with conditions: %+v", c.config.Source, c.conditions)
// initializeStatus initializes the internal condition and also reports it to the node problem detector.
func (c *customPluginMonitor) initializeStatus() {
// Initialize the default node conditions
c.conditions = initialConditions(c.config.DefaultConditions)
glog.Infof("Initialize condition generated: %+v", c.conditions)
// Update the initial status
c.statusChan <- &types.Status{
Source: c.config.Source,
@ -306,12 +301,6 @@ func (c *customPluginMonitor) sendInitialStatus() {
}
}
// initializeConditions initializes the internal node conditions.
func (c *customPluginMonitor) initializeConditions() {
c.conditions = initialConditions(c.config.DefaultConditions)
klog.Infof("Initialized conditions for %s: %+v", c.configPath, c.conditions)
}
func initialConditions(defaults []types.Condition) []types.Condition {
conditions := make([]types.Condition, len(defaults))
copy(conditions, defaults)

View File

@ -20,13 +20,14 @@ import (
"context"
"fmt"
"io"
"io/ioutil"
"os/exec"
"strings"
"sync"
"syscall"
"time"
"k8s.io/klog/v2"
"github.com/golang/glog"
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
"k8s.io/node-problem-detector/pkg/util"
"k8s.io/node-problem-detector/pkg/util/tomb"
@ -60,7 +61,7 @@ func (p *Plugin) GetResultChan() <-chan cpmtypes.Result {
func (p *Plugin) Run() {
defer func() {
klog.Info("Stopping plugin execution")
glog.Info("Stopping plugin execution")
close(p.resultChan)
p.tomb.Done()
}()
@ -89,10 +90,9 @@ func (p *Plugin) Run() {
// run each rule in parallel and wait for them to complete
func (p *Plugin) runRules() {
klog.V(3).Info("Start to run custom plugins")
glog.V(3).Info("Start to run custom plugins")
for _, rule := range p.config.Rules {
// syncChan limits concurrent goroutines to configured PluginGlobalConfig.Concurrency value
p.syncChan <- struct{}{}
p.Add(1)
go func(rule *cpmtypes.CustomRule) {
@ -103,12 +103,8 @@ func (p *Plugin) runRules() {
start := time.Now()
exitStatus, message := p.run(*rule)
level := klog.Level(3)
if exitStatus != 0 {
level = klog.Level(2)
}
klog.V(level).Infof("Rule: %+v. Start time: %v. End time: %v. Duration: %v", rule, start, time.Now(), time.Since(start))
glog.V(3).Infof("Rule: %+v. Start time: %v. End time: %v. Duration: %v", rule, start, time.Now(), time.Since(start))
result := cpmtypes.Result{
Rule: rule,
@ -116,27 +112,26 @@ func (p *Plugin) runRules() {
Message: message,
}
// pipes result into resultChan which customPluginMonitor instance generates status from
p.resultChan <- result
// Log the result at a higher verbosity level; if the status changed, it is logged again later.
klog.V(level).Infof("Add check result %+v for rule %+v", result, rule)
glog.V(3).Infof("Add check result %+v for rule %+v", result, rule)
}(rule)
}
p.Wait()
klog.V(3).Info("Finish running custom plugins")
glog.V(3).Info("Finish running custom plugins")
}
// readFromReader reads at most maxBytes from the reader and drains the rest.
func readFromReader(reader io.ReadCloser, maxBytes int64) ([]byte, error) {
limitReader := io.LimitReader(reader, maxBytes)
data, err := io.ReadAll(limitReader)
data, err := ioutil.ReadAll(limitReader)
if err != nil {
return []byte{}, err
}
// Drain the reader
if _, err := io.Copy(io.Discard, reader); err != nil {
if _, err := io.Copy(ioutil.Discard, reader); err != nil {
return []byte{}, err
}
return data, nil
@ -157,16 +152,16 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
stdoutPipe, err := cmd.StdoutPipe()
if err != nil {
klog.Errorf("Error creating stdout pipe for plugin %q: error - %v", rule.Path, err)
glog.Errorf("Error creating stdout pipe for plugin %q: error - %v", rule.Path, err)
return cpmtypes.Unknown, "Error creating stdout pipe for plugin. Please check the error log"
}
stderrPipe, err := cmd.StderrPipe()
if err != nil {
klog.Errorf("Error creating stderr pipe for plugin %q: error - %v", rule.Path, err)
glog.Errorf("Error creating stderr pipe for plugin %q: error - %v", rule.Path, err)
return cpmtypes.Unknown, "Error creating stderr pipe for plugin. Please check the error log"
}
if err := cmd.Start(); err != nil {
klog.Errorf("Error in starting plugin %q: error - %v", rule.Path, err)
glog.Errorf("Error in starting plugin %q: error - %v", rule.Path, err)
return cpmtypes.Unknown, "Error in starting plugin. Please check the error log"
}
@ -182,9 +177,9 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
if ctx.Err() == context.Canceled {
return
}
klog.Errorf("Error in running plugin timeout %q", rule.Path)
glog.Errorf("Error in running plugin timeout %q", rule.Path)
if cmd.Process == nil || cmd.Process.Pid == 0 {
klog.Errorf("Error in cmd.Process check %q", rule.Path)
glog.Errorf("Error in cmd.Process check %q", rule.Path)
break
}
@ -194,7 +189,7 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
err := util.Kill(cmd)
if err != nil {
klog.Errorf("Error in kill process %d, %v", cmd.Process.Pid, err)
glog.Errorf("Error in kill process %d, %v", cmd.Process.Pid, err)
}
case <-waitChan:
return
@ -223,18 +218,18 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
wg.Wait()
if stdoutErr != nil {
klog.Errorf("Error reading stdout for plugin %q: error - %v", rule.Path, err)
glog.Errorf("Error reading stdout for plugin %q: error - %v", rule.Path, err)
return cpmtypes.Unknown, "Error reading stdout for plugin. Please check the error log"
}
if stderrErr != nil {
klog.Errorf("Error reading stderr for plugin %q: error - %v", rule.Path, err)
glog.Errorf("Error reading stderr for plugin %q: error - %v", rule.Path, err)
return cpmtypes.Unknown, "Error reading stderr for plugin. Please check the error log"
}
if err := cmd.Wait(); err != nil {
if _, ok := err.(*exec.ExitError); !ok {
klog.Errorf("Error in waiting for plugin %q: error - %v. output - %q", rule.Path, err, string(stdout))
glog.Errorf("Error in waiting for plugin %q: error - %v. output - %q", rule.Path, err, string(stdout))
return cpmtypes.Unknown, "Error in waiting for plugin. Please check the error log"
}
}
@ -273,12 +268,12 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
// Stop the plugin.
func (p *Plugin) Stop() {
p.tomb.Stop()
klog.Info("Stop plugin execution")
glog.Info("Stop plugin execution")
}
func logPluginStderr(rule cpmtypes.CustomRule, logs string, logLevel klog.Level) {
func logPluginStderr(rule cpmtypes.CustomRule, logs string, logLevel glog.Level) {
if len(logs) != 0 {
klog.V(logLevel).Infof("Start logs from plugin %+v \n %s", rule, logs)
klog.V(logLevel).Infof("End logs from plugin %+v", rule)
glog.V(logLevel).Infof("Start logs from plugin %+v \n %s", rule, logs)
glog.V(logLevel).Infof("End logs from plugin %+v", rule)
}
}
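The readFromReader helper above keeps at most maxBytes of a plugin's output while draining the remainder, so the child process never blocks writing to a full pipe. A standalone sketch of the same pattern with the modern io APIs (illustrative, not the NPD source):
```go
package main

import (
	"fmt"
	"io"
	"strings"
)

// readCapped keeps at most maxBytes from r, then drains the rest so the
// writer on the other end of a pipe can finish without blocking.
func readCapped(r io.Reader, maxBytes int64) ([]byte, error) {
	data, err := io.ReadAll(io.LimitReader(r, maxBytes))
	if err != nil {
		return nil, err
	}
	if _, err := io.Copy(io.Discard, r); err != nil {
		return nil, err
	}
	return data, nil
}

func main() {
	out, err := readCapped(strings.NewReader("some long plugin output"), 9)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%q\n", out) // "some long"
}
```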

View File

@ -33,7 +33,6 @@ var (
defaultConcurrency = 3
defaultMessageChangeBasedConditionUpdate = false
defaultEnableMetricsReporting = true
defaultSkipInitialStatus = false
customPluginName = "custom"
)
@ -53,11 +52,9 @@ type pluginGlobalConfig struct {
Concurrency *int `json:"concurrency,omitempty"`
// EnableMessageChangeBasedConditionUpdate indicates whether NPD should enable message change based condition update.
EnableMessageChangeBasedConditionUpdate *bool `json:"enable_message_change_based_condition_update,omitempty"`
// SkipInitialStatus prevents the first status update with default conditions
SkipInitialStatus *bool `json:"skip_initial_status,omitempty"`
}
// CustomPluginConfig is the configuration of custom plugin monitor.
// Custom plugin config is the configuration of custom plugin monitor.
type CustomPluginConfig struct {
// Plugin is the name of plugin which is currently used.
// Currently supported: custom.
@ -108,10 +105,6 @@ func (cpc *CustomPluginConfig) ApplyConfiguration() error {
cpc.PluginGlobalConfig.EnableMessageChangeBasedConditionUpdate = &defaultMessageChangeBasedConditionUpdate
}
if cpc.PluginGlobalConfig.SkipInitialStatus == nil {
cpc.PluginGlobalConfig.SkipInitialStatus = &defaultSkipInitialStatus
}
for _, rule := range cpc.Rules {
if rule.TimeoutString != nil {
timeout, err := time.ParseDuration(*rule.TimeoutString)

View File

@ -33,7 +33,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
concurrency := 2
messageChangeBasedConditionUpdate := true
disableMetricsReporting := false
disableInitialStatusUpdate := true
ruleTimeout := 1 * time.Second
ruleTimeoutString := ruleTimeout.String()
@ -63,7 +62,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
Rules: []*CustomRule{
@ -93,7 +91,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
@ -113,7 +110,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
@ -133,7 +129,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &maxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
@ -153,7 +148,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &concurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
@ -173,7 +167,6 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &messageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
@ -191,30 +184,10 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &defaultSkipInitialStatus,
},
EnableMetricsReporting: &disableMetricsReporting,
},
},
"disable status update during initialization": {
Orig: CustomPluginConfig{PluginGlobalConfig: pluginGlobalConfig{
SkipInitialStatus: &disableInitialStatusUpdate,
},
},
Wanted: CustomPluginConfig{
PluginGlobalConfig: pluginGlobalConfig{
InvokeIntervalString: &defaultInvokeIntervalString,
InvokeInterval: &defaultInvokeInterval,
TimeoutString: &defaultGlobalTimeoutString,
Timeout: &defaultGlobalTimeout,
MaxOutputLength: &defaultMaxOutputLength,
Concurrency: &defaultConcurrency,
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
SkipInitialStatus: &disableInitialStatusUpdate,
},
EnableMetricsReporting: &defaultEnableMetricsReporting,
},
},
}
for desp, utMeta := range utMetas {

View File

@ -17,9 +17,8 @@ limitations under the License.
package types
import (
"time"
"k8s.io/node-problem-detector/pkg/types"
"time"
)
type Status int

View File

@ -17,7 +17,6 @@ limitations under the License.
package condition
import (
"context"
"reflect"
"sync"
"time"
@ -26,10 +25,10 @@ import (
"k8s.io/node-problem-detector/pkg/types"
problemutil "k8s.io/node-problem-detector/pkg/util"
v1 "k8s.io/api/core/v1"
"k8s.io/utils/clock"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/klog/v2"
"github.com/golang/glog"
)
const (
@ -50,7 +49,7 @@ const (
// not. This addresses 3).
type ConditionManager interface {
// Start starts the condition manager.
Start(ctx context.Context)
Start()
// UpdateCondition updates a specific condition.
UpdateCondition(types.Condition)
// GetConditions returns all current conditions.
@ -68,7 +67,7 @@ type conditionManager struct {
// No lock is needed in `sync`, because it is in the same goroutine with the
// write operation.
sync.RWMutex
clock clock.WithTicker
clock clock.Clock
latestTry time.Time
resyncNeeded bool
client problemclient.Client
@ -79,18 +78,18 @@ type conditionManager struct {
}
// NewConditionManager creates a condition manager.
func NewConditionManager(client problemclient.Client, clockInUse clock.WithTicker, heartbeatPeriod time.Duration) ConditionManager {
func NewConditionManager(client problemclient.Client, clock clock.Clock, heartbeatPeriod time.Duration) ConditionManager {
return &conditionManager{
client: client,
clock: clockInUse,
clock: clock,
updates: make(map[string]types.Condition),
conditions: make(map[string]types.Condition),
heartbeatPeriod: heartbeatPeriod,
}
}
func (c *conditionManager) Start(ctx context.Context) {
go c.syncLoop(ctx)
func (c *conditionManager) Start() {
go c.syncLoop()
}
func (c *conditionManager) UpdateCondition(condition types.Condition) {
@ -111,17 +110,15 @@ func (c *conditionManager) GetConditions() []types.Condition {
return conditions
}
func (c *conditionManager) syncLoop(ctx context.Context) {
func (c *conditionManager) syncLoop() {
ticker := c.clock.NewTicker(updatePeriod)
defer ticker.Stop()
for {
select {
case <-ticker.C():
if c.needUpdates() || c.needResync() || c.needHeartbeat() {
c.sync(ctx)
c.sync()
}
case <-ctx.Done():
return
}
}
}
@ -153,16 +150,16 @@ func (c *conditionManager) needHeartbeat() bool {
}
// sync synchronizes node conditions with the apiserver.
func (c *conditionManager) sync(ctx context.Context) {
func (c *conditionManager) sync() {
c.latestTry = c.clock.Now()
c.resyncNeeded = false
conditions := []v1.NodeCondition{}
for i := range c.conditions {
conditions = append(conditions, problemutil.ConvertToAPICondition(c.conditions[i]))
}
if err := c.client.SetConditions(ctx, conditions); err != nil {
if err := c.client.SetConditions(conditions); err != nil {
// The conditions will be updated again in future sync
klog.Errorf("failed to update node conditions: %v", err)
glog.Errorf("failed to update node conditions: %v", err)
c.resyncNeeded = true
return
}
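A minimal sketch of the syncLoop pattern above, illustrating why the context branch must return rather than break (a bare break would only leave the select, so the loop would spin once the context is done). Illustrative code, not the NPD source:
```go
package main

import (
	"context"
	"fmt"
	"time"
)

// syncLoop does periodic work until its context is cancelled.
func syncLoop(ctx context.Context, period time.Duration) {
	ticker := time.NewTicker(period)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			fmt.Println("sync")
		case <-ctx.Done():
			return // not break: break would only exit the select
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 120*time.Millisecond)
	defer cancel()
	syncLoop(ctx, 50*time.Millisecond)
}
```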

View File

@ -17,7 +17,6 @@ limitations under the License.
package condition
import (
"context"
"fmt"
"testing"
"time"
@ -29,14 +28,14 @@ import (
problemutil "k8s.io/node-problem-detector/pkg/util"
v1 "k8s.io/api/core/v1"
testclock "k8s.io/utils/clock/testing"
"k8s.io/apimachinery/pkg/util/clock"
)
const heartbeatPeriod = 1 * time.Minute
func newTestManager() (*conditionManager, *problemclient.FakeProblemClient, *testclock.FakeClock) {
func newTestManager() (*conditionManager, *problemclient.FakeProblemClient, *clock.FakeClock) {
fakeClient := problemclient.NewFakeProblemClient()
fakeClock := testclock.NewFakeClock(time.Now())
fakeClock := clock.NewFakeClock(time.Now())
manager := NewConditionManager(fakeClient, fakeClock, heartbeatPeriod)
return manager.(*conditionManager), fakeClient, fakeClock
}
@ -110,7 +109,7 @@ func TestResync(t *testing.T) {
m, fakeClient, fakeClock := newTestManager()
condition := newTestCondition("TestCondition")
m.conditions = map[string]types.Condition{condition.Type: condition}
m.sync(context.Background())
m.sync()
expected := []v1.NodeCondition{problemutil.ConvertToAPICondition(condition)}
assert.Nil(t, fakeClient.AssertConditions(expected), "Condition should be updated via client")
@ -119,7 +118,7 @@ func TestResync(t *testing.T) {
assert.False(t, m.needResync(), "Should not resync after resync period without resync needed")
fakeClient.InjectError("SetConditions", fmt.Errorf("injected error"))
m.sync(context.Background())
m.sync()
assert.False(t, m.needResync(), "Should not resync before resync period")
fakeClock.Step(resyncPeriod)
@ -130,7 +129,7 @@ func TestHeartbeat(t *testing.T) {
m, fakeClient, fakeClock := newTestManager()
condition := newTestCondition("TestCondition")
m.conditions = map[string]types.Condition{condition.Type: condition}
m.sync(context.Background())
m.sync()
expected := []v1.NodeCondition{problemutil.ConvertToAPICondition(condition)}
assert.Nil(t, fakeClient.AssertConditions(expected), "Condition should be updated via client")
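
The test hunk above swaps the old `clock.FakeClock` for `testclock.FakeClock` from `k8s.io/utils/clock/testing`. A small sketch of the fake-clock idiom those tests rely on: `Step` advances virtual time instantly, so resync and heartbeat periods can be crossed without sleeping (values here are illustrative, not the NPD test):

```go
package main

import (
	"fmt"
	"time"

	testclock "k8s.io/utils/clock/testing"
)

func main() {
	fc := testclock.NewFakeClock(time.Now())
	deadline := fc.Now().Add(time.Minute)

	fc.Step(30 * time.Second) // advance virtual time without sleeping
	fmt.Println("past deadline?", fc.Now().After(deadline)) // false

	fc.Step(31 * time.Second)
	fmt.Println("past deadline?", fc.Now().After(deadline)) // true
}
```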

View File

@ -17,16 +17,15 @@ limitations under the License.
package k8sexporter
import (
"context"
"net"
"net/http"
"net/http/pprof"
_ "net/http/pprof"
"strconv"
"k8s.io/klog/v2"
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/utils/clock"
"k8s.io/node-problem-detector/cmd/options"
"k8s.io/node-problem-detector/pkg/exporters/k8sexporter/condition"
@ -38,8 +37,6 @@ import (
type k8sExporter struct {
client problemclient.Client
conditionManager condition.ConditionManager
writeEvents bool
updateConditions bool
}
// NewExporterOrDie creates an exporter for Kubernetes apiserver exporting,
@ -47,41 +44,35 @@ type k8sExporter struct {
//
// Note that this function may block (until a timeout occurs) before
// kube-apiserver becomes ready.
func NewExporterOrDie(ctx context.Context, npdo *options.NodeProblemDetectorOptions) types.Exporter {
func NewExporterOrDie(npdo *options.NodeProblemDetectorOptions) types.Exporter {
if !npdo.EnableK8sExporter {
return nil
}
c := problemclient.NewClientOrDie(npdo)
klog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout)
if err := waitForAPIServerReadyWithTimeout(ctx, c, npdo); err != nil {
klog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err)
glog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout)
if err := waitForAPIServerReadyWithTimeout(c, npdo); err != nil {
glog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err)
}
ke := k8sExporter{
client: c,
conditionManager: condition.NewConditionManager(c, clock.RealClock{}, npdo.K8sExporterHeartbeatPeriod),
writeEvents: npdo.K8sExporterWriteEvents,
updateConditions: npdo.K8sExporterUpdateNodeConditions,
}
ke.startHTTPReporting(npdo)
ke.conditionManager.Start(ctx)
ke.conditionManager.Start()
return &ke
}
func (ke *k8sExporter) ExportProblems(status *types.Status) {
if ke.writeEvents {
for _, event := range status.Events {
ke.client.Eventf(util.ConvertToAPIEventType(event.Severity), status.Source, event.Reason, event.Message)
}
for _, event := range status.Events {
ke.client.Eventf(util.ConvertToAPIEventType(event.Severity), status.Source, event.Reason, event.Message)
}
if ke.updateConditions {
for _, cdt := range status.Conditions {
ke.conditionManager.UpdateCondition(cdt)
}
for _, cdt := range status.Conditions {
ke.conditionManager.UpdateCondition(cdt)
}
}
@ -103,30 +94,22 @@ func (ke *k8sExporter) startHTTPReporting(npdo *options.NodeProblemDetectorOptio
util.ReturnHTTPJson(w, ke.conditionManager.GetConditions())
})
// register pprof
mux.HandleFunc("/debug/pprof/", pprof.Index)
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
addr := net.JoinHostPort(npdo.ServerAddress, strconv.Itoa(npdo.ServerPort))
go func() {
err := http.ListenAndServe(addr, mux)
if err != nil {
klog.Fatalf("Failed to start server: %v", err)
glog.Fatalf("Failed to start server: %v", err)
}
}()
}
func waitForAPIServerReadyWithTimeout(ctx context.Context, c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error {
return wait.PollUntilContextTimeout(ctx, npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, true, func(ctx context.Context) (done bool, err error) {
func waitForAPIServerReadyWithTimeout(c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error {
return wait.PollImmediate(npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, func() (done bool, err error) {
// If NPD can get the node object from kube-apiserver, the server is
// ready and the RBAC permission is set correctly.
if _, err := c.GetNode(ctx); err != nil {
klog.Errorf("Can't get node object: %v", err)
return false, err
if _, err := c.GetNode(); err == nil {
return true, nil
}
return true, nil
return false, nil
})
}
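
On the master side, `waitForAPIServerReadyWithTimeout` polls with the context-aware `wait.PollUntilContextTimeout` rather than the deprecated `wait.PollImmediate`. A hedged sketch of the polling contract, with a counter standing in for the `GetNode` call: returning `(false, nil)` keeps polling, `(true, nil)` succeeds, and a non-nil error aborts before the timeout.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	attempts := 0
	err := wait.PollUntilContextTimeout(context.Background(),
		100*time.Millisecond, // interval
		time.Second,          // timeout
		true,                 // immediate: try once before the first interval
		func(ctx context.Context) (bool, error) {
			attempts++
			return attempts >= 3, nil // pretend readiness on the third try
		})
	fmt.Println("attempts:", attempts, "err:", err) // attempts: 3 err: <nil>
}
```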

View File

@ -17,7 +17,6 @@ limitations under the License.
package problemclient
import (
"context"
"fmt"
"reflect"
"sync"
@ -61,7 +60,7 @@ func (f *FakeProblemClient) AssertConditions(expected []v1.NodeCondition) error
}
// SetConditions is a fake mimic of SetConditions; it only updates the internal condition cache.
func (f *FakeProblemClient) SetConditions(ctx context.Context, conditions []v1.NodeCondition) error {
func (f *FakeProblemClient) SetConditions(conditions []v1.NodeCondition) error {
f.Lock()
defer f.Unlock()
if err, ok := f.errors["SetConditions"]; ok {
@ -74,7 +73,7 @@ func (f *FakeProblemClient) SetConditions(ctx context.Context, conditions []v1.N
}
// GetConditions is a fake mimic of GetConditions; it returns the conditions cached internally.
func (f *FakeProblemClient) GetConditions(ctx context.Context, types []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
func (f *FakeProblemClient) GetConditions(types []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
f.Lock()
defer f.Unlock()
if err, ok := f.errors["GetConditions"]; ok {
@ -94,6 +93,6 @@ func (f *FakeProblemClient) GetConditions(ctx context.Context, types []v1.NodeCo
func (f *FakeProblemClient) Eventf(eventType string, source, reason, messageFmt string, args ...interface{}) {
}
func (f *FakeProblemClient) GetNode(ctx context.Context) (*v1.Node, error) {
func (f *FakeProblemClient) GetNode() (*v1.Node, error) {
return nil, fmt.Errorf("GetNode() not implemented")
}

View File

@ -17,24 +17,24 @@ limitations under the License.
package problemclient
import (
"context"
"encoding/json"
"fmt"
"net/url"
"os"
"path/filepath"
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/kubernetes/pkg/api/legacyscheme"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/clock"
clientset "k8s.io/client-go/kubernetes"
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/retry"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
"github.com/golang/glog"
"k8s.io/heapster/common/kubernetes"
"k8s.io/node-problem-detector/cmd/options"
"k8s.io/node-problem-detector/pkg/version"
)
@ -42,14 +42,14 @@ import (
// Client is the interface of problem client
type Client interface {
// GetConditions get all specific conditions of current node.
GetConditions(ctx context.Context, conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error)
GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error)
// SetConditions set or update conditions of current node.
SetConditions(ctx context.Context, conditionTypes []v1.NodeCondition) error
SetConditions(conditions []v1.NodeCondition) error
// Eventf reports the event.
Eventf(eventType string, source, reason, messageFmt string, args ...interface{})
// GetNode returns the Node object of the node on which the
// node-problem-detector runs.
GetNode(ctx context.Context) (*v1.Node, error)
GetNode() (*v1.Node, error)
}
type nodeProblemClient struct {
@ -68,14 +68,13 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client {
// we have checked that it is a valid URI after the command line arguments are parsed. :)
uri, _ := url.Parse(npdo.ApiServerOverride)
cfg, err := getKubeClientConfig(uri)
cfg, err := kubernetes.GetKubeClientConfig(uri)
if err != nil {
panic(err)
}
cfg.UserAgent = fmt.Sprintf("%s/%s", filepath.Base(os.Args[0]), version.Version())
cfg.QPS = npdo.QPS
cfg.Burst = npdo.Burst
// TODO(random-liu): Set QPS Limit
c.client = clientset.NewForConfigOrDie(cfg).CoreV1()
c.nodeName = npdo.NodeName
c.eventNamespace = npdo.EventNamespace
@ -84,8 +83,8 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client {
return c
}
func (c *nodeProblemClient) GetConditions(ctx context.Context, conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
node, err := c.GetNode(ctx)
func (c *nodeProblemClient) GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
node, err := c.GetNode()
if err != nil {
return nil, err
}
@ -100,7 +99,7 @@ func (c *nodeProblemClient) GetConditions(ctx context.Context, conditionTypes []
return conditions, nil
}
func (c *nodeProblemClient) SetConditions(ctx context.Context, newConditions []v1.NodeCondition) error {
func (c *nodeProblemClient) SetConditions(newConditions []v1.NodeCondition) error {
for i := range newConditions {
// Each time we update the conditions, we update the heartbeat time
newConditions[i].LastHeartbeatTime = metav1.NewTime(c.clock.Now())
@ -109,15 +108,7 @@ func (c *nodeProblemClient) SetConditions(ctx context.Context, newConditions []v
if err != nil {
return err
}
return retry.OnError(retry.DefaultRetry,
func(error) bool {
return true
},
func() error {
_, err := c.client.Nodes().PatchStatus(ctx, c.nodeName, patch)
return err
},
)
return c.client.RESTClient().Patch(types.StrategicMergePatchType).Resource("nodes").Name(c.nodeName).SubResource("status").Body(patch).Do().Error()
}
func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string, args ...interface{}) {
@ -130,10 +121,8 @@ func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string,
recorder.Eventf(c.nodeRef, eventType, reason, messageFmt, args...)
}
func (c *nodeProblemClient) GetNode(ctx context.Context) (*v1.Node, error) {
// To reduce the load on APIServer & etcd, we are serving GET operations from
// apiserver cache (the data might be slightly delayed).
return c.client.Nodes().Get(ctx, c.nodeName, metav1.GetOptions{ResourceVersion: "0"})
func (c *nodeProblemClient) GetNode() (*v1.Node, error) {
return c.client.Nodes().Get(c.nodeName, metav1.GetOptions{})
}
// generatePatch generates condition patch
@ -148,8 +137,8 @@ func generatePatch(conditions []v1.NodeCondition) ([]byte, error) {
// getEventRecorder generates a recorder for a specific node name and source.
func getEventRecorder(c typedcorev1.CoreV1Interface, namespace, nodeName, source string) record.EventRecorder {
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartLogging(klog.V(4).Infof)
recorder := eventBroadcaster.NewRecorder(runtime.NewScheme(), v1.EventSource{Component: source, Host: nodeName})
eventBroadcaster.StartLogging(glog.V(4).Infof)
recorder := eventBroadcaster.NewRecorder(legacyscheme.Scheme, v1.EventSource{Component: source, Host: nodeName})
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: c.Events(namespace)})
return recorder
}
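
The master-side `SetConditions` wraps the status patch in `retry.OnError` from `k8s.io/client-go/util/retry`. A minimal sketch of that helper, with a stubbed operation in place of the real `PatchStatus` call: the predicate decides which errors are retriable, and `retry.DefaultRetry` supplies the backoff.

```go
package main

import (
	"errors"
	"fmt"

	"k8s.io/client-go/util/retry"
)

func main() {
	calls := 0
	err := retry.OnError(retry.DefaultRetry,
		func(error) bool { return true }, // treat every error as retriable
		func() error {
			calls++
			if calls < 3 {
				return errors.New("transient apiserver error") // stand-in failure
			}
			return nil
		})
	fmt.Println("calls:", calls, "err:", err) // calls: 3 err: <nil>
}
```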

View File

@ -22,10 +22,10 @@ import (
"testing"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/client-go/tools/record"
testclock "k8s.io/utils/clock/testing"
"github.com/stretchr/testify/assert"
)
@ -40,7 +40,7 @@ func newFakeProblemClient() *nodeProblemClient {
nodeName: testNode,
// There is no proper fake for *client.Client for now
// TODO(random-liu): Add test for SetConditions when we have good fake for *client.Client
clock: testclock.NewFakeClock(time.Now()),
clock: &clock.FakeClock{},
recorders: make(map[string]record.EventRecorder),
nodeRef: getNodeRef("", testNode),
}

View File

@ -22,8 +22,8 @@ import (
"strconv"
"contrib.go.opencensus.io/exporter/prometheus"
"github.com/golang/glog"
"go.opencensus.io/stats/view"
"k8s.io/klog/v2"
"k8s.io/node-problem-detector/cmd/options"
"k8s.io/node-problem-detector/pkg/types"
@ -40,13 +40,13 @@ func NewExporterOrDie(npdo *options.NodeProblemDetectorOptions) types.Exporter {
addr := net.JoinHostPort(npdo.PrometheusServerAddress, strconv.Itoa(npdo.PrometheusServerPort))
pe, err := prometheus.NewExporter(prometheus.Options{})
if err != nil {
klog.Fatalf("Failed to create Prometheus exporter: %v", err)
glog.Fatalf("Failed to create Prometheus exporter: %v", err)
}
go func() {
mux := http.NewServeMux()
mux.Handle("/metrics", pe)
if err := http.ListenAndServe(addr, mux); err != nil {
klog.Fatalf("Failed to start Prometheus scrape endpoint: %v", err)
glog.Fatalf("Failed to start Prometheus scrape endpoint: %v", err)
}
}()
view.RegisterExporter(pe)

View File

@ -18,7 +18,7 @@ package gce
import (
"cloud.google.com/go/compute/metadata"
"k8s.io/klog/v2"
"github.com/golang/glog"
)
type Metadata struct {
@ -37,7 +37,7 @@ func (md *Metadata) HasMissingField() bool {
func (md *Metadata) PopulateFromGCE() error {
var err error
klog.Info("Fetching GCE metadata from metadata server")
glog.Info("Fetching GCE metadata from metadata server")
if md.ProjectID == "" {
md.ProjectID, err = metadata.ProjectID()
if err != nil {

View File

@ -18,19 +18,19 @@ package stackdriverexporter
import (
"encoding/json"
"os"
"io/ioutil"
"path/filepath"
"reflect"
"time"
"contrib.go.opencensus.io/exporter/stackdriver"
monitoredres "contrib.go.opencensus.io/exporter/stackdriver/monitoredresource"
"github.com/golang/glog"
"github.com/spf13/pflag"
"go.opencensus.io/stats/view"
"google.golang.org/api/option"
"k8s.io/klog/v2"
"github.com/avast/retry-go/v4"
"github.com/avast/retry-go"
"k8s.io/node-problem-detector/pkg/exporters"
seconfig "k8s.io/node-problem-detector/pkg/exporters/stackdriver/config"
"k8s.io/node-problem-detector/pkg/types"
@ -54,7 +54,6 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
metrics.CPULoad15m: "compute.googleapis.com/guest/cpu/load_15m",
metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length",
metrics.DiskBytesUsedID: "compute.googleapis.com/guest/disk/bytes_used",
metrics.DiskPercentUsedID: "compute.googleapis.com/guest/disk/percent_used",
metrics.DiskIOTimeID: "compute.googleapis.com/guest/disk/io_time",
metrics.DiskMergedOpsCountID: "compute.googleapis.com/guest/disk/merged_operation_count",
metrics.DiskOpsBytesID: "compute.googleapis.com/guest/disk/operation_bytes_count",
@ -67,7 +66,6 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
metrics.MemoryDirtyUsedID: "compute.googleapis.com/guest/memory/dirty_used",
metrics.MemoryPageCacheUsedID: "compute.googleapis.com/guest/memory/page_cache_used",
metrics.MemoryUnevictableUsedID: "compute.googleapis.com/guest/memory/unevictable_used",
metrics.MemoryPercentUsedID: "compute.googleapis.com/guest/memory/percent_used",
metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count",
metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state",
metrics.OSFeatureID: "compute.googleapis.com/guest/system/os_feature_enabled",
@ -139,12 +137,12 @@ func (se *stackdriverExporter) setupOpenCensusViewExporterOrDie() {
DefaultMonitoringLabels: &globalLabels,
})
if err != nil {
klog.Fatalf("Failed to create Stackdriver OpenCensus view exporter: %v", err)
glog.Fatalf("Failed to create Stackdriver OpenCensus view exporter: %v", err)
}
exportPeriod, err := time.ParseDuration(se.config.ExportPeriod)
if err != nil {
klog.Fatalf("Failed to parse ExportPeriod %q: %v", se.config.ExportPeriod, err)
glog.Fatalf("Failed to parse ExportPeriod %q: %v", se.config.ExportPeriod, err)
}
view.SetReportingPeriod(exportPeriod)
@ -153,33 +151,33 @@ func (se *stackdriverExporter) setupOpenCensusViewExporterOrDie() {
func (se *stackdriverExporter) populateMetadataOrDie() {
if !se.config.GCEMetadata.HasMissingField() {
klog.Infof("Using GCE metadata specified in the config file: %+v", se.config.GCEMetadata)
glog.Infof("Using GCE metadata specified in the config file: %+v", se.config.GCEMetadata)
return
}
metadataFetchTimeout, err := time.ParseDuration(se.config.MetadataFetchTimeout)
if err != nil {
klog.Fatalf("Failed to parse MetadataFetchTimeout %q: %v", se.config.MetadataFetchTimeout, err)
glog.Fatalf("Failed to parse MetadataFetchTimeout %q: %v", se.config.MetadataFetchTimeout, err)
}
metadataFetchInterval, err := time.ParseDuration(se.config.MetadataFetchInterval)
if err != nil {
klog.Fatalf("Failed to parse MetadataFetchInterval %q: %v", se.config.MetadataFetchInterval, err)
glog.Fatalf("Failed to parse MetadataFetchInterval %q: %v", se.config.MetadataFetchInterval, err)
}
klog.Infof("Populating GCE metadata by querying GCE metadata server.")
glog.Infof("Populating GCE metadata by querying GCE metadata server.")
err = retry.Do(se.config.GCEMetadata.PopulateFromGCE,
retry.Delay(metadataFetchInterval),
retry.Attempts(uint(metadataFetchTimeout/metadataFetchInterval)),
retry.DelayType(retry.FixedDelay))
if err == nil {
klog.Infof("Using GCE metadata: %+v", se.config.GCEMetadata)
glog.Infof("Using GCE metadata: %+v", se.config.GCEMetadata)
return
}
if se.config.PanicOnMetadataFetchFailure {
klog.Fatalf("Failed to populate GCE metadata: %v", err)
glog.Fatalf("Failed to populate GCE metadata: %v", err)
} else {
klog.Errorf("Failed to populate GCE metadata: %v", err)
glog.Errorf("Failed to populate GCE metadata: %v", err)
}
}
@ -202,7 +200,7 @@ func (clo *commandLineOptions) SetFlags(fs *pflag.FlagSet) {
func NewExporterOrDie(clo types.CommandLineOptions) types.Exporter {
options, ok := clo.(*commandLineOptions)
if !ok {
klog.Fatalf("Wrong type for the command line options of Stackdriver Exporter: %s.", reflect.TypeOf(clo))
glog.Fatalf("Wrong type for the command line options of Stackdriver Exporter: %s.", reflect.TypeOf(clo))
}
if options.configPath == "" {
return nil
@ -211,17 +209,17 @@ func NewExporterOrDie(clo types.CommandLineOptions) types.Exporter {
se := stackdriverExporter{}
// Apply configurations.
f, err := os.ReadFile(options.configPath)
f, err := ioutil.ReadFile(options.configPath)
if err != nil {
klog.Fatalf("Failed to read configuration file %q: %v", options.configPath, err)
glog.Fatalf("Failed to read configuration file %q: %v", options.configPath, err)
}
err = json.Unmarshal(f, &se.config)
if err != nil {
klog.Fatalf("Failed to unmarshal configuration file %q: %v", options.configPath, err)
glog.Fatalf("Failed to unmarshal configuration file %q: %v", options.configPath, err)
}
se.config.ApplyConfiguration()
klog.Infof("Starting Stackdriver exporter %s", options.configPath)
glog.Infof("Starting Stackdriver exporter %s", options.configPath)
se.populateMetadataOrDie()
se.setupOpenCensusViewExporterOrDie()
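
The metadata fetch above retries `PopulateFromGCE` at a fixed delay, budgeting the attempt count as timeout divided by interval. A self-contained sketch of that retry shape, assuming the `avast/retry-go/v4` module used on the master side (the failing function is a stand-in for the metadata call):

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/avast/retry-go/v4"
)

func main() {
	interval := 100 * time.Millisecond
	timeout := 500 * time.Millisecond
	tries := 0
	err := retry.Do(
		func() error {
			tries++
			if tries < 3 {
				return errors.New("metadata server not reachable yet")
			}
			return nil
		},
		retry.Delay(interval),
		retry.Attempts(uint(timeout/interval)), // 5 attempts in this budget
		retry.DelayType(retry.FixedDelay),
	)
	fmt.Println("tries:", tries, "err:", err)
}
```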

View File

@ -1,4 +1,3 @@
//go:build !disable_stackdriver_exporter
// +build !disable_stackdriver_exporter
/*

View File

@ -17,13 +17,9 @@ limitations under the License.
package healthchecker
import (
"context"
"net/http"
"os/exec"
"strings"
"time"
"k8s.io/klog/v2"
"github.com/golang/glog"
"k8s.io/node-problem-detector/cmd/healthchecker/options"
"k8s.io/node-problem-detector/pkg/healthchecker/types"
)
@ -40,7 +36,6 @@ type healthChecker struct {
crictlPath string
healthCheckTimeout time.Duration
coolDownTime time.Duration
loopBackTime time.Duration
logPatternsToCheck map[string]int
}
@ -53,7 +48,6 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
healthCheckTimeout: hco.HealthCheckTimeout,
coolDownTime: hco.CoolDownTime,
service: hco.Service,
loopBackTime: hco.LoopBackTime,
logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
}
hc.healthCheckFunc = getHealthCheckFunc(hco)
@ -69,26 +63,24 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
if err != nil {
return healthy, err
}
logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.loopBackTime, hc.logPatternsToCheck)
logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.logPatternsToCheck)
if err != nil {
return logPatternHealthy, err
}
if healthy && logPatternHealthy {
return true, nil
}
// The service is unhealthy.
// Attempt repair based on flag.
if hc.enableRepair {
// repair if the service has been up for the cool down period.
uptime, err := hc.uptimeFunc()
if err != nil {
klog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
return false, nil
glog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
}
klog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
glog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
if uptime > hc.coolDownTime {
klog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
glog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
hc.repairFunc()
}
}
@ -97,21 +89,18 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
// logPatternHealthCheck checks for occurrences of the provided log patterns in the service logs.
// Returns true if the pattern set is empty or no pattern occurs logThresholdCount times since the start of the service, false otherwise.
func logPatternHealthCheck(service string, loopBackTime time.Duration, logPatternsToCheck map[string]int) (bool, error) {
func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
if len(logPatternsToCheck) == 0 {
return true, nil
}
uptimeFunc := getUptimeFunc(service)
klog.Infof("Getting uptime for service: %v\n", service)
uptime, err := uptimeFunc()
if err != nil {
klog.Warningf("Failed to get the uptime: %+v", err)
return true, err
}
logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
if loopBackTime > 0 && uptime > loopBackTime {
logStartTime = time.Now().Add(-loopBackTime).Format(types.LogParsingTimeLayout)
if err != nil {
return true, err
}
for pattern, count := range logPatternsToCheck {
healthy, err := checkForPattern(service, logStartTime, pattern, count)
@ -121,65 +110,3 @@ func logPatternHealthCheck(service string, loopBackTime time.Duration, logPatter
}
return true, nil
}
// healthCheckEndpointOKFunc returns a function to check the status of an http endpoint
func healthCheckEndpointOKFunc(endpoint string, timeout time.Duration) func() (bool, error) {
return func() (bool, error) {
httpClient := http.Client{Timeout: timeout}
response, err := httpClient.Get(endpoint)
if err != nil || response.StatusCode != http.StatusOK {
return false, nil
}
return true, nil
}
}
// getHealthCheckFunc returns the health check function based on the component.
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
switch hco.Component {
case types.KubeletComponent:
return healthCheckEndpointOKFunc(types.KubeletHealthCheckEndpoint(), hco.HealthCheckTimeout)
case types.KubeProxyComponent:
return healthCheckEndpointOKFunc(types.KubeProxyHealthCheckEndpoint(), hco.HealthCheckTimeout)
case types.DockerComponent:
return func() (bool, error) {
if _, err := execCommand(hco.HealthCheckTimeout, getDockerPath(), "ps"); err != nil {
return false, nil
}
return true, nil
}
case types.CRIComponent:
return func() (bool, error) {
_, err := execCommand(
hco.HealthCheckTimeout,
hco.CriCtlPath,
"--timeout="+hco.CriTimeout.String(),
"--runtime-endpoint="+hco.CriSocketPath,
"pods",
"--latest",
)
if err != nil {
return false, nil
}
return true, nil
}
default:
klog.Warningf("Unsupported component: %v", hco.Component)
}
return nil
}
// execCommand executes the command and returns the (output, error) from the command, or an error if the timeout occurs.
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
cmd := exec.CommandContext(ctx, command, args...)
out, err := cmd.CombinedOutput()
if err != nil {
klog.Infof("command %v failed: %v, %s\n", cmd, err, string(out))
return "", err
}
return strings.TrimSuffix(string(out), "\n"), nil
}
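
For reference, a self-contained version of the timeout-bounded exec pattern in `execCommand` above: `exec.CommandContext` kills the child process once the context deadline passes. The command here is illustrative.

```go
package main

import (
	"context"
	"fmt"
	"os/exec"
	"strings"
	"time"
)

func execCommand(timeout time.Duration, name string, args ...string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel() // releases the timer; the process is killed on deadline
	out, err := exec.CommandContext(ctx, name, args...).CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("command %s failed: %w (output: %s)", name, err, out)
	}
	return strings.TrimSuffix(string(out), "\n"), nil
}

func main() {
	out, err := execCommand(2*time.Second, "echo", "ok")
	fmt.Println(out, err)
}
```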

View File

@ -1,49 +0,0 @@
/*
Copyright 2023 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package healthchecker
import (
"runtime"
"time"
"k8s.io/klog/v2"
"k8s.io/node-problem-detector/cmd/healthchecker/options"
)
// getUptimeFunc returns the time for which the given service has been running.
func getUptimeFunc(service string) func() (time.Duration, error) {
klog.Fatalf("getUptimeFunc is not supported in %s", runtime.GOOS)
return func() (time.Duration, error) { return time.Second, nil }
}
// getRepairFunc returns the repair function based on the component.
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
klog.Fatalf("getRepairFunc is not supported in %s", runtime.GOOS)
return func() {}
}
// checkForPattern returns (true, nil) if logPattern occurs fewer than logCountThreshold times since the last
// service restart, and (false, nil) otherwise.
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
klog.Fatalf("checkForPattern is not supported in %s", runtime.GOOS)
return false, nil
}
func getDockerPath() string {
klog.Fatalf("getDockerPath is not supported in %s", runtime.GOOS)
return ""
}

View File

@ -17,12 +17,15 @@ limitations under the License.
package healthchecker
import (
"context"
"errors"
"net/http"
"os/exec"
"strconv"
"strings"
"time"
"k8s.io/klog/v2"
"github.com/golang/glog"
"k8s.io/node-problem-detector/cmd/healthchecker/options"
"k8s.io/node-problem-detector/pkg/healthchecker/types"
@ -56,11 +59,6 @@ func getUptimeFunc(service string) func() (time.Duration, error) {
// getRepairFunc returns the repair function based on the component.
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
// Use `systemctl kill` instead of `systemctl restart` for the repair function.
// We rely on the difference in kernel messages between the two commands to
// indicate whether a component restart is an administrative action (restart)
// or a response to a system issue that needs repair (kill).
// See https://github.com/kubernetes/node-problem-detector/issues/847.
switch hco.Component {
case types.DockerComponent:
// Use "docker ps" for docker health check. Not using crictl for docker to remove
@ -77,6 +75,49 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
}
}
// getHealthCheckFunc returns the health check function based on the component.
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
switch hco.Component {
case types.KubeletComponent:
return func() (bool, error) {
httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
if err != nil || response.StatusCode != http.StatusOK {
return false, nil
}
return true, nil
}
case types.DockerComponent:
return func() (bool, error) {
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
return false, nil
}
return true, nil
}
case types.CRIComponent:
return func() (bool, error) {
if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
return false, nil
}
return true, nil
}
}
return nil
}
// execCommand executes the command and returns the (output, error) from the command, or an error if the timeout occurs.
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
cmd := exec.CommandContext(ctx, command, args...)
out, err := cmd.Output()
if err != nil {
glog.Infof("command %v failed: %v, %v\n", cmd, err, out)
return "", err
}
return strings.TrimSuffix(string(out), "\n"), nil
}
// checkForPattern returns (true, nil) if logPattern occurs fewer than logCountThreshold times since the last
// service restart, and (false, nil) otherwise.
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
@ -95,12 +136,8 @@ func checkForPattern(service, logStartTime, logPattern string, logCountThreshold
return true, err
}
if occurrences >= logCountThreshold {
klog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
return false, nil
}
return true, nil
}
func getDockerPath() string {
return "docker"
}
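
The contract of `checkForPattern` can be shown with a hedged, self-contained sketch: count occurrences of a pattern and report unhealthy once the count reaches the threshold. The real implementation reads the service's logs since `logStartTime` (for example via journalctl on Linux); here the log text is passed in directly for illustration.

```go
package main

import (
	"fmt"
	"regexp"
)

// checkForPattern reports (false, nil) once pattern occurs threshold or more times.
func checkForPattern(logs, pattern string, threshold int) (bool, error) {
	re, err := regexp.Compile(pattern)
	if err != nil {
		return true, err
	}
	if len(re.FindAllString(logs, -1)) >= threshold {
		return false, nil // unhealthy: pattern seen too often
	}
	return true, nil
}

func main() {
	logs := "I0601 ok\nE0601 watchdog timeout\nE0601 watchdog timeout\n"
	healthy, _ := checkForPattern(logs, "watchdog timeout", 2)
	fmt.Println("healthy:", healthy) // false
}
```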

View File

@ -20,7 +20,6 @@ import (
"testing"
"time"
"k8s.io/node-problem-detector/cmd/healthchecker/options"
"k8s.io/node-problem-detector/pkg/healthchecker/types"
)
@ -120,38 +119,3 @@ func TestHealthCheck(t *testing.T) {
})
}
}
func TestComponentsSupported(t *testing.T) {
for _, tc := range []struct {
description string
component string
}{
{
description: "Kube Proxy should be supported",
component: types.KubeProxyComponent,
},
{
description: "Kubelet should be supported",
component: types.KubeletComponent,
},
{
description: "Docker should be supported",
component: types.DockerComponent,
},
{
description: "CRI should be supported",
component: types.CRIComponent,
},
} {
t.Run(tc.description, func(t *testing.T) {
checkFunc := getHealthCheckFunc(&options.HealthCheckerOptions{
Component: tc.component,
})
if checkFunc == nil {
t.Errorf("component %v should be supported", tc.component)
}
})
}
}

View File

@ -18,12 +18,13 @@ package healthchecker
import (
"fmt"
"net/http"
"os/exec"
"strconv"
"strings"
"time"
"k8s.io/klog/v2"
"github.com/golang/glog"
"k8s.io/node-problem-detector/cmd/healthchecker/options"
"k8s.io/node-problem-detector/pkg/healthchecker/types"
@ -33,19 +34,12 @@ import (
// getUptimeFunc returns the time for which the given service has been running.
func getUptimeFunc(service string) func() (time.Duration, error) {
return func() (time.Duration, error) {
// To calculate uptime more efficiently, we first try to grab the process id and read the process start time.
// If the process id does not exist (meaning the service is not running for some reason), we fall back to
// using the WinEvent Log Objects to find the Service logs' time when the Service last entered the running state.
// In addition to filtering by logname=system, we also filter on event id=7036 to reduce the number of
// entries the next command, Where-Object, has to look through; id 7036 messages indicate a stopped or running service.
// Using the WinEvent Log Objects to find the Service logs' time when the Service last entered running state.
// The powershell command formats the TimeCreated of the event log in RFC1123Pattern.
// However, because the time library parser does not recognize the ',' in this RFC1123Pattern format,
// it is manually removed before parsing it using the UptimeTimeLayout.
getTimeCreatedCmd := `$ProcessId = (Get-WMIObject -Class Win32_Service -Filter "Name='` + service + `'" | Select-Object -ExpandProperty ProcessId);` +
`if ([string]::IsNullOrEmpty($ProcessId) -or $ProcessId -eq 0) { (Get-WinEvent -FilterHashtable @{logname='system';id=7036} ` +
`| Where-Object {$_.Message -match '.*(` + service + `).*(running).*'} | Select-Object -Property TimeCreated -First 1 | ` +
`foreach {$_.TimeCreated.ToUniversalTime().ToString('R')} | Out-String).Trim() } else { (Get-Process -Id $ProcessId | Select starttime | ` +
`foreach {$_.starttime.ToUniversalTime().ToString('R')} | Out-String).Trim() }`
getTimeCreatedCmd := "(Get-WinEvent -Logname System | Where-Object {$_.Message -Match '.*(" + service +
").*(running).*'} | Select-Object -Property TimeCreated -First 1 | foreach {$_.TimeCreated.ToString('R')} | Out-String).Trim()"
out, err := powershell(getTimeCreatedCmd)
if err != nil {
return time.Duration(0), err
@ -70,6 +64,49 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
}
}
// getHealthCheckFunc returns the health check function based on the component.
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
switch hco.Component {
case types.KubeletComponent:
return healthCheckEndpointOKFunc(types.KubeletHealthCheckEndpoint, hco.HealthCheckTimeout)
case types.KubeProxyComponent:
return healthCheckEndpointOKFunc(types.KubeProxyHealthCheckEndpoint, hco.HealthCheckTimeout)
case types.DockerComponent:
return func() (bool, error) {
if _, err := execCommand("docker.exe", "ps"); err != nil {
return false, nil
}
return true, nil
}
case types.CRIComponent:
return func() (bool, error) {
if _, err := execCommand(hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
return false, nil
}
return true, nil
}
}
return nil
}
// healthCheckEndpointOKFunc returns a function to check the status of an http endpoint
func healthCheckEndpointOKFunc(endpoint string, timeout time.Duration) func() (bool, error) {
return func() (bool, error) {
httpClient := http.Client{Timeout: timeout}
response, err := httpClient.Get(endpoint)
if err != nil || response.StatusCode != http.StatusOK {
return false, nil
}
return true, nil
}
}
// execCommand creates a new process, executes the command, and returns the (output, error) from the command.
func execCommand(command string, args ...string) (string, error) {
cmd := util.Exec(command, args...)
return extractCommandOutput(cmd)
}
// powershell executes the arguments in powershell process and returns (output, error) from command.
func powershell(args ...string) (string, error) {
cmd := util.Powershell(args...)
@ -80,7 +117,7 @@ func powershell(args ...string) (string, error) {
func extractCommandOutput(cmd *exec.Cmd) (string, error) {
out, err := cmd.Output()
if err != nil {
klog.Infof("command %v failed: %v, %v\n", cmd, err, out)
glog.Infof("command %v failed: %v, %v\n", cmd, err, out)
return "", err
}
return strings.TrimSuffix(string(out), "\r\n"), nil
@ -101,12 +138,8 @@ func checkForPattern(service, logStartTime, logPattern string, logCountThreshold
return true, err
}
if occurrences >= logCountThreshold {
klog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
return false, nil
}
return true, nil
}
func getDockerPath() string {
return "docker.exe"
}
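
The comment in `getUptimeFunc` above notes that PowerShell's `'R'` (RFC1123) timestamp contains a comma that `UptimeTimeLayout` lacks, so the comma is stripped before parsing. A small sketch of that wrinkle:

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

const UptimeTimeLayout = "Mon 02 Jan 2006 15:04:05 MST"

func main() {
	raw := "Mon, 02 Jan 2006 15:04:05 GMT" // what ToString('R') yields
	cleaned := strings.ReplaceAll(raw, ",", "")
	t, err := time.Parse(UptimeTimeLayout, cleaned)
	fmt.Println(t, err)
}
```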

View File

@ -18,8 +18,6 @@ package types
import (
"fmt"
"net"
"os"
"sort"
"strconv"
"strings"
@ -27,8 +25,6 @@ import (
)
const (
DefaultLoopBackTime = 0 * time.Minute
DefaultCriTimeout = 2 * time.Second
DefaultCoolDownTime = 2 * time.Minute
DefaultHealthCheckTimeout = 10 * time.Second
CmdTimeout = 10 * time.Second
@ -40,57 +36,12 @@ const (
ContainerdService = "containerd"
KubeProxyComponent = "kube-proxy"
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
KubeProxyHealthCheckEndpoint = "http://127.0.0.1:10256/healthz"
LogPatternFlagSeparator = ":"
hostAddressKey = "HOST_ADDRESS"
kubeletPortKey = "KUBELET_PORT"
kubeProxyPortKey = "KUBEPROXY_PORT"
defaultHostAddress = "localhost"
defaultKubeletPort = "10248"
defaultKubeproxyPort = "10256"
)
var (
kubeletHealthCheckEndpoint string
kubeProxyHealthCheckEndpoint string
)
func init() {
setKubeEndpoints()
}
func setKubeEndpoints() {
var o string
hostAddress := defaultHostAddress
kubeletPort := defaultKubeletPort
kubeProxyPort := defaultKubeproxyPort
o = os.Getenv(hostAddressKey)
if o != "" {
hostAddress = o
}
o = os.Getenv(kubeletPortKey)
if o != "" {
kubeletPort = o
}
o = os.Getenv(kubeProxyPortKey)
if o != "" {
kubeProxyPort = o
}
kubeletHealthCheckEndpoint = fmt.Sprintf("http://%s/healthz", net.JoinHostPort(hostAddress, kubeletPort))
kubeProxyHealthCheckEndpoint = fmt.Sprintf("http://%s/healthz", net.JoinHostPort(hostAddress, kubeProxyPort))
}
func KubeProxyHealthCheckEndpoint() string {
return kubeProxyHealthCheckEndpoint
}
func KubeletHealthCheckEndpoint() string {
return kubeletHealthCheckEndpoint
}
type HealthChecker interface {
CheckHealth() (bool, error)
}

View File

@ -0,0 +1,23 @@
/*
Copyright 2021 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package types
const (
DefaultCriCtl = "/usr/bin/crictl"
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 MST"
)

View File

@ -98,101 +98,3 @@ func TestLogPatternFlag(t *testing.T) {
})
}
}
func TestKubeEndpointConfiguration(t *testing.T) {
testCases := []struct {
name string
envConfig map[string]string
expectedKubeletEndpoint string
expectedKubeProxyEndpoint string
}{
{
name: "no overrides supplied",
envConfig: map[string]string{},
expectedKubeletEndpoint: "http://localhost:10248/healthz",
expectedKubeProxyEndpoint: "http://localhost:10256/healthz",
},
{
name: "HOST_ADDRESS override supplied",
envConfig: map[string]string{
"HOST_ADDRESS": "samplehost.testdomain.com",
},
expectedKubeletEndpoint: "http://samplehost.testdomain.com:10248/healthz",
expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:10256/healthz",
},
{
name: "HOST_ADDRESS override supplied with IPv4",
envConfig: map[string]string{
"HOST_ADDRESS": "10.0.5.4",
},
expectedKubeletEndpoint: "http://10.0.5.4:10248/healthz",
expectedKubeProxyEndpoint: "http://10.0.5.4:10256/healthz",
},
{
name: "HOST_ADDRESS override supplied with IPv6",
envConfig: map[string]string{
"HOST_ADDRESS": "80:f4:16::1",
},
expectedKubeletEndpoint: "http://[80:f4:16::1]:10248/healthz",
expectedKubeProxyEndpoint: "http://[80:f4:16::1]:10256/healthz",
},
{
name: "KUBELET_PORT override supplied",
envConfig: map[string]string{
"KUBELET_PORT": "12345",
},
expectedKubeletEndpoint: "http://localhost:12345/healthz",
expectedKubeProxyEndpoint: "http://localhost:10256/healthz",
},
{
name: "KUBEPROXY_PORT override supplied",
envConfig: map[string]string{
"KUBEPROXY_PORT": "12345",
},
expectedKubeletEndpoint: "http://localhost:10248/healthz",
expectedKubeProxyEndpoint: "http://localhost:12345/healthz",
},
{
name: "HOST_ADDRESS and KUBELET_PORT override supplied",
envConfig: map[string]string{
"HOST_ADDRESS": "samplehost.testdomain.com",
"KUBELET_PORT": "12345",
},
expectedKubeletEndpoint: "http://samplehost.testdomain.com:12345/healthz",
expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:10256/healthz",
},
{
name: "HOST_ADDRESS and KUBEPROXY_PORT override supplied",
envConfig: map[string]string{
"HOST_ADDRESS": "samplehost.testdomain.com",
"KUBEPROXY_PORT": "12345",
},
expectedKubeletEndpoint: "http://samplehost.testdomain.com:10248/healthz",
expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:12345/healthz",
},
{
name: "HOST_ADDRESS, KUBELET_PORT and KUBEPROXY_PORT override supplied",
envConfig: map[string]string{
"HOST_ADDRESS": "10.0.10.1",
"KUBELET_PORT": "12345",
"KUBEPROXY_PORT": "12346",
},
expectedKubeletEndpoint: "http://10.0.10.1:12345/healthz",
expectedKubeProxyEndpoint: "http://10.0.10.1:12346/healthz",
},
}
for _, test := range testCases {
t.Run(test.name, func(t *testing.T) {
for key, val := range test.envConfig {
t.Setenv(key, val)
}
setKubeEndpoints()
kubeProxyHCEndpoint := KubeProxyHealthCheckEndpoint()
kubeletHCEndpoint := KubeletHealthCheckEndpoint()
assert.Equal(t, test.expectedKubeProxyEndpoint, kubeProxyHCEndpoint)
assert.Equal(t, test.expectedKubeletEndpoint, kubeletHCEndpoint)
})
}
}

View File

@ -1,25 +0,0 @@
//go:build unix
/*
Copyright 2021 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package types
const (
DefaultCriCtl = "/usr/bin/crictl"
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 MST"
)

View File

@ -17,7 +17,7 @@ limitations under the License.
package types
const (
DefaultCriCtl = "C:/etc/kubernetes/node/bin/crictl.exe"
DefaultCriCtl = "C:/node/crictl.exe"
DefaultCriSocketPath = "npipe:////./pipe/containerd-containerd"
UptimeTimeLayout = "Mon 02 Jan 2006 15:04:05 MST"
LogParsingTimeFormat = "yyyy-MM-dd HH:mm:ss"

View File

@ -1,4 +1,3 @@
//go:build journald
// +build journald
/*
@ -23,7 +22,7 @@ import (
"fmt"
"time"
"k8s.io/utils/clock"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/node-problem-detector/cmd/logcounter/options"
"k8s.io/node-problem-detector/pkg/logcounter/types"
@ -40,11 +39,10 @@ const (
)
type logCounter struct {
logCh <-chan *systemtypes.Log
buffer systemlogmonitor.LogBuffer
pattern string
revertPattern string
clock clock.Clock
logCh <-chan *systemtypes.Log
buffer systemlogmonitor.LogBuffer
pattern string
clock clock.Clock
}
func NewJournaldLogCounter(options *options.LogCounterOptions) (types.LogCounter, error) {
@ -60,11 +58,10 @@ func NewJournaldLogCounter(options *options.LogCounterOptions) (types.LogCounter
return nil, fmt.Errorf("error watching journald: %v", err)
}
return &logCounter{
logCh: logCh,
buffer: systemlogmonitor.NewLogBuffer(bufferSize),
pattern: options.Pattern,
revertPattern: options.RevertPattern,
clock: clock.RealClock{},
logCh: logCh,
buffer: systemlogmonitor.NewLogBuffer(bufferSize),
pattern: options.Pattern,
clock: clock.RealClock{},
}, nil
}
@ -86,9 +83,6 @@ func (e *logCounter) Count() (count int, err error) {
if len(e.buffer.Match(e.pattern)) != 0 {
count++
}
if e.revertPattern != "" && len(e.buffer.Match(e.revertPattern)) != 0 {
count--
}
case <-e.clock.After(timeout):
// Don't block forever if we do not get any new messages
return
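
The master-side `Count` above increments on `pattern` matches and decrements on `revertPattern` matches, so a recovery message can cancel an earlier problem message. A hedged sketch of that counting rule, with `strings.Contains` standing in for the log buffer's pattern matching:

```go
package main

import (
	"fmt"
	"strings"
)

func countProblems(lines []string, pattern, revertPattern string) int {
	count := 0
	for _, l := range lines {
		if strings.Contains(l, pattern) {
			count++
		}
		if revertPattern != "" && strings.Contains(l, revertPattern) {
			count-- // a recovery line cancels one problem line
		}
	}
	return count
}

func main() {
	lines := []string{"task abc hung", "task abc hung", "task abc recovered"}
	fmt.Println(countProblems(lines, "hung", "recovered")) // 1
}
```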

View File

@ -1,4 +1,3 @@
//go:build journald
// +build journald
/*
@ -23,16 +22,16 @@ import (
"testing"
"time"
testclock "k8s.io/utils/clock/testing"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/node-problem-detector/pkg/logcounter/types"
"k8s.io/node-problem-detector/pkg/systemlogmonitor"
systemtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
)
func NewTestLogCounter(pattern string, startTime time.Time) (types.LogCounter, *testclock.FakeClock, chan *systemtypes.Log) {
func NewTestLogCounter(pattern string, startTime time.Time) (types.LogCounter, *clock.FakeClock, chan *systemtypes.Log) {
logCh := make(chan *systemtypes.Log)
clock := testclock.NewFakeClock(startTime)
clock := clock.NewFakeClock(startTime)
return &logCounter{
logCh: logCh,
buffer: systemlogmonitor.NewLogBuffer(bufferSize),

View File

@ -19,7 +19,7 @@ package problemdaemon
import (
"fmt"
"k8s.io/klog/v2"
"github.com/golang/glog"
"k8s.io/node-problem-detector/pkg/types"
)
@ -58,7 +58,7 @@ func NewProblemDaemons(monitorConfigPaths types.ProblemDaemonConfigPathMap) []ty
for _, config := range *configs {
if _, ok := problemDaemonMap[config]; ok {
// Skip the config if it's duplicated.
klog.Warningf("Duplicated problem daemon configuration %q", config)
glog.Warningf("Duplicated problem daemon configuration %q", config)
continue
}
problemDaemonMap[config] = handlers[problemDaemonType].CreateProblemDaemonOrDie(config)

View File

@ -17,17 +17,16 @@ limitations under the License.
package problemdetector
import (
"context"
"fmt"
"k8s.io/klog/v2"
"github.com/golang/glog"
"k8s.io/node-problem-detector/pkg/types"
)
// ProblemDetector collects statuses from all problem daemons, updates the node conditions, and sends node events.
type ProblemDetector interface {
Run(context.Context) error
Run(termCh <-chan error) error
}
type problemDetector struct {
@ -45,7 +44,7 @@ func NewProblemDetector(monitors []types.Monitor, exporters []types.Exporter) Pr
}
// Run starts the problem detector.
func (p *problemDetector) Run(ctx context.Context) error {
func (p *problemDetector) Run(termCh <-chan error) error {
// Start the log monitors one by one.
var chans []<-chan *types.Status
failureCount := 0
@ -53,7 +52,7 @@ func (p *problemDetector) Run(ctx context.Context) error {
ch, err := m.Start()
if err != nil {
// Do not return error and keep on trying the following config files.
klog.Errorf("Failed to start problem daemon %v: %v", m, err)
glog.Errorf("Failed to start problem daemon %v: %v", m, err)
failureCount++
continue
}
@ -74,11 +73,11 @@ func (p *problemDetector) Run(ctx context.Context) error {
}()
ch := groupChannel(chans)
klog.Info("Problem detector started")
glog.Info("Problem detector started")
for {
select {
case <-ctx.Done():
case <-termCh:
return nil
case status := <-ch:
for _, exporter := range p.exporters {
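
`Run` merges the per-monitor status channels through `groupChannel` before entering the exporter loop. A hedged sketch of such a fan-in, with `string` standing in for `*types.Status`; the real helper may differ in details (for instance, whether the merged channel is ever closed):

```go
package main

import (
	"fmt"
	"sync"
)

// groupChannel forwards every input channel into one merged output channel.
func groupChannel(chans []<-chan string) <-chan string {
	out := make(chan string)
	var wg sync.WaitGroup
	for _, ch := range chans {
		wg.Add(1)
		go func(c <-chan string) {
			defer wg.Done()
			for v := range c {
				out <- v
			}
		}(ch)
	}
	go func() { wg.Wait(); close(out) }()
	return out
}

func main() {
	a, b := make(chan string, 1), make(chan string, 1)
	a <- "status from monitor A"
	b <- "status from monitor B"
	close(a)
	close(b)
	for s := range groupChannel([]<-chan string{a, b}) {
		fmt.Println(s) // order is not deterministic
	}
}
```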

View File

@ -17,7 +17,6 @@ limitations under the License.
package problemdetector
import (
"context"
"testing"
"k8s.io/node-problem-detector/pkg/types"
@ -25,7 +24,7 @@ import (
func TestEmpty(t *testing.T) {
pd := NewProblemDetector([]types.Monitor{}, []types.Exporter{})
if err := pd.Run(context.Background()); err == nil {
if err := pd.Run(nil); err == nil {
t.Error("expected error when running an empty problem detector")
}
}

View File

@ -21,7 +21,7 @@ import (
"fmt"
"sync"
"k8s.io/klog/v2"
"github.com/golang/glog"
"k8s.io/node-problem-detector/pkg/util/metrics"
)
@ -56,7 +56,7 @@ func NewProblemMetricsManagerOrDie() *ProblemMetricsManager {
metrics.Sum,
[]string{"reason"})
if err != nil {
klog.Fatalf("Failed to create problem_counter metric: %v", err)
glog.Fatalf("Failed to create problem_counter metric: %v", err)
}
pmm.problemGauge, err = metrics.NewInt64Metric(
@ -67,7 +67,7 @@ func NewProblemMetricsManagerOrDie() *ProblemMetricsManager {
metrics.LastValue,
[]string{"type", "reason"})
if err != nil {
klog.Fatalf("Failed to create problem_gauge metric: %v", err)
glog.Fatalf("Failed to create problem_gauge metric: %v", err)
}
pmm.problemTypeToReason = make(map[string]string)

View File

@ -37,8 +37,7 @@ with new rule definition:
"type": "temporary/permanent",
"condition": "NodeConditionOfPermanentIssue",
"reason": "CamelCaseShortReason",
"pattern": "regexp matching the issue in the log",
"patternGeneratedMessageSuffix": "Please check the network connectivity and ensure that all required services are running. For more details, see our documentation at https://example.com/docs/troubleshooting."
"message": "regexp matching the issue in the log"
}
```

View File

@ -46,7 +46,7 @@ type MonitorConfig struct {
EnableMetricsReporting *bool `json:"metricsReporting,omitempty"`
}
// ApplyDefaultConfiguration applies default configurations.
// ApplyConfiguration applies default configurations.
func (mc *MonitorConfig) ApplyDefaultConfiguration() {
if mc.BufferSize == 0 {
mc.BufferSize = defaultBufferSize

View File

@ -18,16 +18,16 @@ package systemlogmonitor
import (
"encoding/json"
"fmt"
"os"
"io/ioutil"
"time"
"k8s.io/klog/v2"
"github.com/golang/glog"
"k8s.io/node-problem-detector/pkg/problemdaemon"
"k8s.io/node-problem-detector/pkg/problemmetrics"
"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers"
watchertypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types"
logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
systemlogtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
"k8s.io/node-problem-detector/pkg/types"
"k8s.io/node-problem-detector/pkg/util"
@ -50,7 +50,7 @@ type logMonitor struct {
buffer LogBuffer
config MonitorConfig
conditions []types.Condition
logCh <-chan *systemlogtypes.Log
logCh <-chan *logtypes.Log
output chan *types.Status
tomb *tomb.Tomb
}
@ -62,21 +62,21 @@ func NewLogMonitorOrDie(configPath string) types.Monitor {
tomb: tomb.NewTomb(),
}
f, err := os.ReadFile(configPath)
f, err := ioutil.ReadFile(configPath)
if err != nil {
klog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
glog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
}
err = json.Unmarshal(f, &l.config)
if err != nil {
klog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
glog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
}
// Apply default configurations
(&l.config).ApplyDefaultConfiguration()
err = l.config.ValidateRules()
if err != nil {
klog.Fatalf("Failed to validate %s matching rules %+v: %v", l.configPath, l.config.Rules, err)
glog.Fatalf("Failed to validate %s matching rules %+v: %v", l.configPath, l.config.Rules, err)
}
klog.Infof("Finish parsing log monitor config file %s: %+v", l.configPath, l.config)
glog.Infof("Finish parsing log monitor config file %s: %+v", l.configPath, l.config)
l.watcher = logwatchers.GetLogWatcherOrDie(l.config.WatcherConfig)
l.buffer = NewLogBuffer(l.config.BufferSize)
@ -96,19 +96,19 @@ func initializeProblemMetricsOrDie(rules []systemlogtypes.Rule) {
if rule.Type == types.Perm {
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, false)
if err != nil {
klog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
glog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
rule.Condition, rule.Reason, err)
}
}
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 0)
if err != nil {
klog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
glog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
}
}
}
func (l *logMonitor) Start() (<-chan *types.Status, error) {
klog.Infof("Start log monitor %s", l.configPath)
glog.Infof("Start log monitor %s", l.configPath)
var err error
l.logCh, err = l.watcher.Watch()
if err != nil {
@ -119,7 +119,7 @@ func (l *logMonitor) Start() (<-chan *types.Status, error) {
}
func (l *logMonitor) Stop() {
klog.Infof("Stop log monitor %s", l.configPath)
glog.Infof("Stop log monitor %s", l.configPath)
l.tomb.Stop()
}
@ -134,20 +134,20 @@ func (l *logMonitor) monitorLoop() {
select {
case log, ok := <-l.logCh:
if !ok {
klog.Errorf("Log channel closed: %s", l.configPath)
glog.Errorf("Log channel closed: %s", l.configPath)
return
}
l.parseLog(log)
case <-l.tomb.Stopping():
l.watcher.Stop()
klog.Infof("Log monitor stopped: %s", l.configPath)
glog.Infof("Log monitor stopped: %s", l.configPath)
return
}
}
}
// parseLog parses one log line.
func (l *logMonitor) parseLog(log *systemlogtypes.Log) {
func (l *logMonitor) parseLog(log *logtypes.Log) {
// Once there is a new log line, the log monitor pushes it into the log buffer and tries
// to match each rule. If any rule matches, the log monitor reports a status.
l.buffer.Push(log)
@ -157,16 +157,16 @@ func (l *logMonitor) parseLog(log *systemlogtypes.Log) {
continue
}
status := l.generateStatus(matched, rule)
klog.Infof("New status generated: %+v", status)
glog.Infof("New status generated: %+v", status)
l.output <- status
}
}
// generateStatus generates status from the logs.
func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogtypes.Rule) *types.Status {
func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Rule) *types.Status {
// We use the timestamp of the first log line as the timestamp of the status.
timestamp := logs[0].Timestamp
message := generateMessage(logs, rule.PatternGeneratedMessageSuffix)
message := generateMessage(logs)
var events []types.Event
var changedConditions []*types.Condition
if rule.Type == types.Temp {
@ -192,7 +192,6 @@ func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogty
condition.Type,
types.True,
rule.Reason,
message,
timestamp,
))
}
@ -208,14 +207,14 @@ func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogty
for _, event := range events {
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(event.Reason, 1)
if err != nil {
klog.Errorf("Failed to update problem counter metrics for %q: %v", event.Reason, err)
glog.Errorf("Failed to update problem counter metrics for %q: %v", event.Reason, err)
}
}
for _, condition := range changedConditions {
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(
condition.Type, condition.Reason, condition.Status == types.True)
if err != nil {
klog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
condition.Type, condition.Reason, err)
}
}
@ -233,7 +232,7 @@ func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogty
func (l *logMonitor) initializeStatus() {
// Initialize the default node conditions
l.conditions = initialConditions(l.config.DefaultConditions)
klog.Infof("Initialize condition generated: %+v", l.conditions)
glog.Infof("Initialize condition generated: %+v", l.conditions)
// Update the initial status
l.output <- &types.Status{
Source: l.config.Source,
@ -251,14 +250,10 @@ func initialConditions(defaults []types.Condition) []types.Condition {
return conditions
}
func generateMessage(logs []*systemlogtypes.Log, patternGeneratedMessageSuffix string) string {
func generateMessage(logs []*logtypes.Log) string {
messages := []string{}
for _, log := range logs {
messages = append(messages, log.Message)
}
logMessage := concatLogs(messages)
if patternGeneratedMessageSuffix != "" {
return fmt.Sprintf("%s; %s", logMessage, patternGeneratedMessageSuffix)
}
return logMessage
return concatLogs(messages)
}

View File

@ -26,7 +26,6 @@ import (
"k8s.io/node-problem-detector/pkg/problemdaemon"
"k8s.io/node-problem-detector/pkg/problemmetrics"
logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
systemlogtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
"k8s.io/node-problem-detector/pkg/types"
"k8s.io/node-problem-detector/pkg/util"
"k8s.io/node-problem-detector/pkg/util/metrics"
@ -85,7 +84,6 @@ func TestGenerateStatusForConditions(t *testing.T) {
testConditionA,
types.True,
"test reason",
"test message 1\ntest message 2",
time.Unix(1000, 1000),
)},
Conditions: []types.Condition{
@ -700,40 +698,3 @@ func TestInitializeProblemMetricsOrDie(t *testing.T) {
})
}
}
func TestGenerateMessage(t *testing.T) {
tests := []struct {
name string
logs []*systemlogtypes.Log
patternGeneratedMessageSuffix string
want string
}{
{
name: "No rule message",
logs: []*systemlogtypes.Log{
{Message: "First log message"},
{Message: "Second log message"},
},
patternGeneratedMessageSuffix: "",
want: "First log message\nSecond log message",
},
{
name: "With rule message",
logs: []*systemlogtypes.Log{
{Message: "First log message"},
{Message: "Second log message"},
},
patternGeneratedMessageSuffix: "refer www.foo.com/docs for playbook on how to fix the issue",
want: "First log message\nSecond log message; refer www.foo.com/docs for playbook on how to fix the issue",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := generateMessage(tt.logs, tt.patternGeneratedMessageSuffix)
if got != tt.want {
t.Errorf("generateMessage() = %v, want %v", got, tt.want)
}
})
}
}

View File

@ -23,7 +23,8 @@ import (
"strings"
"time"
"k8s.io/klog/v2"
utilclock "code.cloudfoundry.org/clock"
"github.com/golang/glog"
"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types"
logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
@ -39,6 +40,7 @@ type filelogWatcher struct {
logCh chan *logtypes.Log
startTime time.Time
tomb *tomb.Tomb
clock utilclock.Clock
}
// NewSyslogWatcherOrDie creates a new log watcher. The function panics
@ -46,11 +48,11 @@ type filelogWatcher struct {
func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher {
uptime, err := util.GetUptimeDuration()
if err != nil {
klog.Fatalf("failed to get uptime: %v", err)
glog.Fatalf("failed to get uptime: %v", err)
}
startTime, err := util.GetStartTime(time.Now(), uptime, cfg.Lookback, cfg.Delay)
if err != nil {
klog.Fatalf("failed to get start time: %v", err)
glog.Fatalf("failed to get start time: %v", err)
}
return &filelogWatcher{
@ -60,6 +62,7 @@ func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher {
tomb: tomb.NewTomb(),
// A capacity 1000 buffer should be enough
logCh: make(chan *logtypes.Log, 1000),
clock: utilclock.NewClock(),
}
}
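The v0.8.8 side injects a utilclock.Clock into the watcher so that tests can substitute a fake, as the test file further down does with fakeclock. A generic sketch of the injectable-clock pattern; the types here are illustrative, not the actual code.cloudfoundry.org/clock API:

package main

import (
	"fmt"
	"time"
)

// Clock is a minimal stand-in for an injectable clock interface.
type Clock interface{ Now() time.Time }

// realClock is the production implementation backed by time.Now.
type realClock struct{}

func (realClock) Now() time.Time { return time.Now() }

// fixedClock is what a test would inject instead of the real clock.
type fixedClock struct{ t time.Time }

func (c fixedClock) Now() time.Time { return c.t }

// watcher depends on the interface, never on time.Now directly, so a test
// can pin "now" to a deterministic instant.
type watcher struct{ clock Clock }

func main() {
	prod := watcher{clock: realClock{}}
	test := watcher{clock: fixedClock{t: time.Date(2020, time.January, 2, 3, 4, 5, 0, time.UTC)}}
	fmt.Println(prod.clock.Now(), test.clock.Now())
}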
@ -74,7 +77,7 @@ func (s *filelogWatcher) Watch() (<-chan *logtypes.Log, error) {
}
s.reader = bufio.NewReader(r)
s.closer = r
klog.Info("Start watching filelog")
glog.Info("Start watching filelog")
go s.watchLoop()
return s.logCh, nil
}
@ -99,14 +102,14 @@ func (s *filelogWatcher) watchLoop() {
for {
select {
case <-s.tomb.Stopping():
klog.Infof("Stop watching filelog")
glog.Infof("Stop watching filelog")
return
default:
}
line, err := s.reader.ReadString('\n')
if err != nil && err != io.EOF {
klog.Errorf("Exiting filelog watch with error: %v", err)
glog.Errorf("Exiting filelog watch with error: %v", err)
return
}
buffer.WriteString(line)
@ -116,28 +119,16 @@ func (s *filelogWatcher) watchLoop() {
}
line = buffer.String()
buffer.Reset()
if s.filterSkipList(line) {
continue
}
log, err := s.translator.translate(strings.TrimSuffix(line, "\n"))
if err != nil {
klog.Warningf("Unable to parse line: %q, %v", line, err)
glog.Warningf("Unable to parse line: %q, %v", line, err)
continue
}
// Discard messages before start time.
if log.Timestamp.Before(s.startTime) {
klog.V(5).Infof("Throwing away msg %q before start time: %v < %v", log.Message, log.Timestamp, s.startTime)
glog.V(5).Infof("Throwing away msg %q before start time: %v < %v", log.Message, log.Timestamp, s.startTime)
continue
}
s.logCh <- log
}
}
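watchLoop accumulates partial reads until a full newline-terminated line is available: ReadString returns io.EOF together with whatever partial content was readable, and that content is buffered and completed on a later pass. A standalone sketch of the pattern over a fixed input; the real loop's retry-and-sleep on EOF is elided by the hunk boundary above, so this is a simplified reconstruction:

package main

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"strings"
)

func main() {
	reader := bufio.NewReader(strings.NewReader("complete line\npartial"))
	var buffer bytes.Buffer
	for {
		line, err := reader.ReadString('\n')
		if err != nil && err != io.EOF {
			return
		}
		buffer.WriteString(line)
		if err == io.EOF {
			// The real watcher would wait and retry until the newline
			// arrives; here the input is fixed, so just show the state.
			fmt.Printf("buffered partial: %q\n", buffer.String())
			return
		}
		fmt.Printf("full line: %q\n", strings.TrimSuffix(buffer.String(), "\n"))
		buffer.Reset()
	}
}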
func (s *filelogWatcher) filterSkipList(line string) bool {
for _, skipItem := range s.cfg.SkipList {
if strings.Contains(line, skipItem) {
return true
}
}
return false
}
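filterSkipList is a plain substring match: a line is dropped as soon as it contains any configured SkipList entry. A standalone reimplementation using the same inputs as TestFilterSkipList further down:

package main

import (
	"fmt"
	"strings"
)

// filterSkipList reimplemented standalone for illustration: a line is
// skipped when it contains any of the configured substrings.
func filterSkipList(line string, skipList []string) bool {
	for _, skipItem := range skipList {
		if strings.Contains(line, skipItem) {
			return true
		}
	}
	return false
}

func main() {
	skip := []string{" audit:", " kubelet:"}
	fmt.Println(filterSkipList("Jan 2 03:04:04 audit: denied", skip)) // true: skipped
	fmt.Println(filterSkipList("Jan 2 03:04:03 kernel: oops", skip))  // false: kept
}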

View File

@ -1,29 +0,0 @@
/*
Copyright 2023 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package filelog
import (
"io"
"github.com/hpcloud/tail"
)
// getLogReader returns a log reader for the filelog plugin. Note that getLogReader doesn't look
// back at logs that have already been rotated out.
func getLogReader(path string) (io.ReadCloser, error) {
return tail.OpenFile(path)
}

View File

@ -19,8 +19,9 @@ package filelog
import (
"fmt"
"io"
"k8s.io/node-problem-detector/third_party/forked/cadvisor/tail"
"os"
"github.com/google/cadvisor/utils/tail"
)
// getLogReader returns a log reader for the filelog plugin. Note that getLogReader doesn't look back

View File

@ -17,6 +17,7 @@ limitations under the License.
package filelog
import (
"io/ioutil"
"os"
"testing"
"time"
@ -25,8 +26,8 @@ import (
logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
"k8s.io/node-problem-detector/pkg/util"
"code.cloudfoundry.org/clock/fakeclock"
"github.com/stretchr/testify/assert"
testclock "k8s.io/utils/clock/testing"
)
// getTestPluginConfig returns a plugin config for test. Use configuration for
@ -42,7 +43,7 @@ func getTestPluginConfig() map[string]string {
func TestWatch(t *testing.T) {
// now is a fake time
now := time.Date(time.Now().Year(), time.January, 2, 3, 4, 5, 0, time.Local)
fakeClock := testclock.NewFakeClock(now)
fakeClock := fakeclock.NewFakeClock(now)
testCases := []struct {
uptime time.Duration
lookback string
@ -138,7 +139,7 @@ Jan 2 03:04:05 kernel: [2.000000] 3
}
for c, test := range testCases {
t.Logf("TestCase #%d: %#v", c+1, test)
f, err := os.CreateTemp("", "log_watcher_test")
f, err := ioutil.TempFile("", "log_watcher_test")
assert.NoError(t, err)
defer func() {
f.Close()
@ -155,6 +156,8 @@ Jan 2 03:04:05 kernel: [2.000000] 3
})
// Set the startTime.
w.(*filelogWatcher).startTime, _ = util.GetStartTime(fakeClock.Now(), test.uptime, test.lookback, test.delay)
// Set the fake clock.
w.(*filelogWatcher).clock = fakeClock
logCh, err := w.Watch()
assert.NoError(t, err)
defer w.Stop()
@ -167,7 +170,7 @@ Jan 2 03:04:05 kernel: [2.000000] 3
}
}
// The log channel should have already been drained
// There could still be future messages sent into the channel, but the chance is really slim.
// There could stil be future messages sent into the channel, but the chance is really slim.
timeout := time.After(100 * time.Millisecond)
select {
case log := <-logCh:
@ -176,36 +179,3 @@ Jan 2 03:04:05 kernel: [2.000000] 3
}
}
}
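The drain check above relies on a short timeout: after consuming the expected logs, the test waits 100ms and fails if anything else arrives. A generic sketch of the pattern:

package main

import (
	"fmt"
	"time"
)

// Sketch of the "assert channel is drained" pattern used in TestWatch: wait a
// short grace period and flag any late message. The 100ms window trades a
// slim chance of flakiness against test runtime.
func main() {
	ch := make(chan string, 1)
	timeout := time.After(100 * time.Millisecond)
	select {
	case msg := <-ch:
		fmt.Printf("unexpected extra message: %q\n", msg)
	case <-timeout:
		fmt.Println("channel drained as expected")
	}
}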
func TestFilterSkipList(t *testing.T) {
s := &filelogWatcher{
cfg: types.WatcherConfig{
SkipList: []string{
" audit:", " kubelet:",
},
},
}
testcase := []struct {
	log    string
	expect bool
}{
{
log: `Jan 2 03:04:03 kernel: [0.000000] 1`,
expect: false,
},
{
log: `Jan 2 03:04:04 audit: [1.000000] 2`,
expect: true,
},
{
log: `Jan 2 03:04:05 kubelet: [2.000000] 3`,
expect: true,
},
}
for i, test := range testcase {
if s.filterSkipList(test.log) != test.expect {
t.Errorf("test case %d: expect %v but got %v", i, test.expect, s.filterSkipList(test.log))
}
}
}

View File

@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
@ -22,7 +22,7 @@ import (
logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
"k8s.io/klog/v2"
"github.com/golang/glog"
)
// translator translates log line into internal log type based on user defined
@ -46,7 +46,7 @@ const (
func newTranslatorOrDie(pluginConfig map[string]string) *translator {
if err := validatePluginConfig(pluginConfig); err != nil {
klog.Errorf("Failed to validate plugin configuration %+v: %v", pluginConfig, err)
glog.Errorf("Failed to validate plugin configuration %+v: %v", pluginConfig, err)
}
return &translator{
timestampRegexp: regexp.MustCompile(pluginConfig[timestampKey]),
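newTranslatorOrDie compiles its regexps straight from the plugin configuration. A sketch of how such a configuration drives parsing; the key names and patterns follow node-problem-detector's sample filelog configs (e.g. config/kernel-monitor-filelog.json) and should be treated as assumptions here:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	pluginConfig := map[string]string{
		"timestamp":       "^.{15}",
		"message":         "kernel: \\[.*\\] (.*)",
		"timestampFormat": "Jan _2 15:04:05", // would feed time parsing of the matched timestamp
	}
	timestampRegexp := regexp.MustCompile(pluginConfig["timestamp"])
	messageRegexp := regexp.MustCompile(pluginConfig["message"])

	line := "Jan  2 03:04:05 kernel: [1.000000] oops"
	fmt.Println(timestampRegexp.FindString(line))           // "Jan  2 03:04:05"
	fmt.Println(messageRegexp.FindStringSubmatch(line)[1])  // "oops"
}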

Some files were not shown because too many files have changed in this diff