Compare commits
506 Commits
Author | SHA1 | Date |
---|---|---|
|
dea6d70d46 | |
|
9d69c8e71a | |
|
6bbddb55de | |
|
cd3b7503bb | |
|
9b473a0e56 | |
|
a765aaecf7 | |
|
9e366f58cd | |
|
fc10031a7e | |
|
ca907dc101 | |
|
62223078ef | |
|
9fe113c522 | |
|
dc065c42f0 | |
|
8d4eb38a42 | |
|
0147098968 | |
|
308b7cfa4a | |
|
1721f9dbf7 | |
|
3f96666db7 | |
|
387571b357 | |
|
aede9d7e7f | |
|
0f1ee66855 | |
|
be0d387ec1 | |
|
78f51bf173 | |
|
87129900cf | |
|
59c46ad62c | |
|
4022575bf9 | |
|
f6bb4f7b55 | |
|
5562632053 | |
|
d6cfed982a | |
|
01e1cf033e | |
|
a099a5ed5c | |
|
5520e3df51 | |
|
c53e4f4308 | |
|
2a651d1f98 | |
|
e8b584ab52 | |
|
e858c3d1df | |
|
3bb752c25a | |
|
1ff64afbc9 | |
|
f69e7033e9 | |
|
f24ca57199 | |
|
32d7c72755 | |
|
2707945338 | |
|
c846b0ebaa | |
|
7039f066c7 | |
|
7ea55106c2 | |
|
3b92e70bc1 | |
|
4b9d196acd | |
|
cf267168c2 | |
|
416ec8b3c2 | |
|
7cb27449aa | |
|
186f0182b5 | |
|
628f021ffb | |
|
92597e574d | |
|
72f3041d2b | |
|
8d237a6c7c | |
|
c5f6fbc3d1 | |
|
12a8f5578c | |
|
d2cbde95e5 | |
|
66336e630a | |
|
93bc55b659 | |
|
72f1e1de7b | |
|
0a997f8116 | |
|
053539efd8 | |
|
cf0870fa12 | |
|
334a857fbe | |
|
f5433f460d | |
|
93e64ac709 | |
|
146ce4aa86 | |
|
d99fca5f0a | |
|
17d7588bff | |
|
26c77134bf | |
|
7d29a1c293 | |
|
3a8a07ad81 | |
|
cab30567cb | |
|
53f404dfed | |
|
b92aae803d | |
|
e8840b1a7d | |
|
29a98372ff | |
|
daaa07d690 | |
|
411cd7bd82 | |
|
d79c681e63 | |
|
45dde88c98 | |
|
b5ce184179 | |
|
2e15606dda | |
|
711760063a | |
|
2a3acd2669 | |
|
04173ee934 | |
|
f675d34e49 | |
|
de55c54059 | |
|
0d756b78fc | |
|
8b2ff03f5e | |
|
f4f5c479d9 | |
|
c1dd00d65c | |
|
68a97cf4cb | |
|
6c32180ce6 | |
|
7416db2236 | |
|
17dcc94418 | |
|
70e99e1e1f | |
|
53e0152f64 | |
|
16656c89f6 | |
|
0f4d8b96c5 | |
|
3a386a659e | |
|
b64f13f702 | |
|
2182ad0ddb | |
|
335e7e82ca | |
|
f392516a37 | |
|
c39f74def4 | |
|
be754653e6 | |
|
f0c5cd5d20 | |
|
ee955f9170 | |
|
3b91ca0c09 | |
|
13c44d92fd | |
|
56eb3dcb61 | |
|
e1d071ba63 | |
|
a22c0649f8 | |
|
57c97d2d47 | |
|
d83e1bcb53 | |
|
8f5c2e14fe | |
|
f1c1759ca0 | |
|
798610a11a | |
|
66fbb738fd | |
|
ae6fa3560e | |
|
35ffe05910 | |
|
ceee726210 | |
|
13a06ccad9 | |
|
68d08ac953 | |
|
dc4200d805 | |
|
a88792f4bd | |
|
e4fd02e9f1 | |
|
3e1bf74cda | |
|
a8973a8664 | |
|
3c43a0bd10 | |
|
9ed6527f0a | |
|
9694ee4354 | |
|
d6d4d93e4e | |
|
490faeace5 | |
|
f692ac3136 | |
|
c8659fb914 | |
|
200d46726c | |
|
3173ed132e | |
|
a22fe2a52f | |
|
c7befef47e | |
|
5f99c4d9b8 | |
|
ba355ee23f | |
|
ac9382a5c1 | |
|
c123dddac8 | |
|
daf4f4da3e | |
|
be9ba585dd | |
|
09c3cfe7ad | |
|
16921fe90f | |
|
289f11b28f | |
|
612199f0c6 | |
|
71a4f7a631 | |
|
1fbfdfd4f7 | |
|
5efc8884d1 | |
|
c0bccb7c76 | |
|
369020d878 | |
|
34fd4f8a8d | |
|
f0308d29b4 | |
|
4e0b9150b9 | |
|
34e60f82ec | |
|
7a48ce2e38 | |
|
69da591e38 | |
|
6c34d837ef | |
|
ecdccfb86c | |
|
132ccc8e81 | |
|
86750df7c2 | |
|
19c6f4db70 | |
|
e4ecee1976 | |
|
0dde605376 | |
|
09bbaa9c32 | |
|
f004190ea1 | |
|
7ee2a4dcda | |
|
f39c93e0f4 | |
|
8c22b69431 | |
|
030599e642 | |
|
66f9e5187f | |
|
5f59f438ac | |
|
0b89667d18 | |
|
338430f835 | |
|
a45f174cfc | |
|
273c3f5266 | |
|
b4623de861 | |
|
7d81d8e12a | |
|
da09edb63c | |
|
e4f8f268e8 | |
|
ecf4224d46 | |
|
0dd173c51f | |
|
2813b15c58 | |
|
0f60f182e8 | |
|
aed88103f1 | |
|
13b65d06e9 | |
|
098d5ba360 | |
|
ea591f5ac3 | |
|
d5346f245c | |
|
8dac51c9e7 | |
|
775a138ad6 | |
|
6c34d567d4 | |
|
4c92bd54a2 | |
|
a1bc4f865d | |
|
a78ccb3612 | |
|
1626b85f13 | |
|
9aa45e0cee | |
|
7ed9c90baf | |
|
e37dcfc3ff | |
|
c0e4778fc0 | |
|
fda3234b64 | |
|
d4aa574df2 | |
|
8cd92dbaba | |
|
325938f2d2 | |
|
10378c8b11 | |
|
629774d3ed | |
|
014cd7d6ac | |
|
bc72eff716 | |
|
ce1d2c5c53 | |
|
b48e438737 | |
|
e14c3e4ae5 | |
|
58211f19f7 | |
|
b193e6e392 | |
|
1667bae479 | |
|
953ca74ac9 | |
|
c2e0519a1f | |
|
c74bf4e01c | |
|
e8623bdba7 | |
|
e4d293eb51 | |
|
e14b3921e8 | |
|
b0ede7b09c | |
|
af3f5c5882 | |
|
9769baefb9 | |
|
855780c9c1 | |
|
74c95a2486 | |
|
31fe5c1534 | |
|
08b2255c33 | |
|
faa2923c51 | |
|
9444907a56 | |
|
7dd7c14868 | |
|
d1166d3495 | |
|
008a62bb90 | |
|
b6235fb72d | |
|
e1385935b8 | |
|
ef98b9612e | |
|
58017fd35e | |
|
d0e447d8e1 | |
|
b32c1c5bd4 | |
|
f24dbb13f7 | |
|
45c3445b2a | |
|
84eb1e338f | |
|
689a066c90 | |
|
5b031d63cc | |
|
ce82f2a81b | |
|
f262b500fd | |
|
c225435bea | |
|
1002df5e13 | |
|
18630b6c78 | |
|
0fba03ef7a | |
|
e9eddcc6d3 | |
|
3704fa72a9 | |
|
552b530e0b | |
|
30e04d41fa | |
|
bdaa44eb23 | |
|
9f639dd892 | |
|
e3c396e324 | |
|
73a120de57 | |
|
34b265af34 | |
|
d88694fbd1 | |
|
07900633cb | |
|
bf157f81f8 | |
|
e31cf7b137 | |
|
07b7a42624 | |
|
27dcab4ba5 | |
|
aec1c74025 | |
|
a5aadf719a | |
|
698c8b067c | |
|
d04bb3a5b0 | |
|
b3653a0aff | |
|
95829b8991 | |
|
fdd522a951 | |
|
5326e106f0 | |
|
ed94dff2cd | |
|
65e4aa3c5e | |
|
fb498567b4 | |
|
76bf7b7e77 | |
|
5210373640 | |
|
e43459d86d | |
|
eeab0ab06f | |
|
be3b1ad382 | |
|
0d276ac19f | |
|
e2ef1de56a | |
|
d4a00d4f20 | |
|
188340e3e9 | |
|
e56fb7de12 | |
|
2bb82faa7b | |
|
79ffff83cb | |
|
e9922b0da7 | |
|
d8e9d550dc | |
|
8283e091cd | |
|
1bcf025f67 | |
|
574b25418f | |
|
9ad24ea2c7 | |
|
f58f6cd208 | |
|
af7c925522 | |
|
c4311bd207 | |
|
ba1e0b3146 | |
|
8b33e32e3d | |
|
568fbe8437 | |
|
2077606ba3 | |
|
9ff6b0bde4 | |
|
469ba765fd | |
|
adbe770d74 | |
|
4ce2aca621 | |
|
e0fa1d2898 | |
|
608e129d8f | |
|
cae2cad3a5 | |
|
594c1b6583 | |
|
c3c53894c3 | |
|
c9da164ae6 | |
|
c3a3774cf1 | |
|
c9edf4072e | |
|
a8f7a9f270 | |
|
09b7fb8814 | |
|
5953ba1261 | |
|
ed99195ed6 | |
|
db83d7fe0b | |
|
471ab88240 | |
|
1bf525de79 | |
|
c2b2b0b3df | |
|
e14abd4ea5 | |
|
55586431bd | |
|
967fe3fbc7 | |
|
5fd18a117f | |
|
fd51f17ec1 | |
|
d605f87d6d | |
|
1ccff37f96 | |
|
d573b5d00f | |
|
6b538a5d4e | |
|
e6fbdd434a | |
|
6e30b17476 | |
|
da422bb452 | |
|
e992542b57 | |
|
6dc23ca804 | |
|
339e243472 | |
|
b5e4ef628b | |
|
c27b4beb6d | |
|
f116c9264c | |
|
75095b2573 | |
|
af2226183f | |
|
8ec3f36293 | |
|
d4aeca09f5 | |
|
b610240ce3 | |
|
aec734d822 | |
|
7fc7947bc3 | |
|
343e0f226c | |
|
9fd58e318f | |
|
7cc8ec6315 | |
|
a50e83a5c3 | |
|
c658f9717b | |
|
22157af0e5 | |
|
d229082e26 | |
|
4906ebb182 | |
|
a7adf55137 | |
|
6e57ca6e6c | |
|
948f634d8f | |
|
00fc95a16a | |
|
a5fd95c982 | |
|
6dbe19abbd | |
|
e8b55acc2b | |
|
706bf35086 | |
|
2e0ff3d14c | |
|
429777eb5d | |
|
07317328f1 | |
|
e6ab24db7f | |
|
d88e0dda02 | |
|
a117c0c056 | |
|
83e520784b | |
|
92e63b5991 | |
|
ff4af1b398 | |
|
e98f0c09ba | |
|
f601956af9 | |
|
2415e30efe | |
|
6163859ae8 | |
|
b586bd9231 | |
|
7b6805491c | |
|
8578b779e2 | |
|
a83ef25930 | |
|
2bf62c0180 | |
|
005e4e0259 | |
|
0b34230dd5 | |
|
95056202c6 | |
|
d77d8f2992 | |
|
0dc032e76f | |
|
ed3111fec1 | |
|
49fbd5cf4b | |
|
d1c8a8bfe2 | |
|
80fc2c206e | |
|
7f0a62683e | |
|
df6320d147 | |
|
0842910049 | |
|
de33c801a5 | |
|
cc6c049522 | |
|
0afa7cc6ff | |
|
169ff4f9fe | |
|
2f959a773c | |
|
7bd6e85b29 | |
|
0127a75e05 | |
|
b6d8069610 | |
|
6de3fabc9f | |
|
9b2d0be950 | |
|
5c85ab20f5 | |
|
2fceddf00e | |
|
92745daa62 | |
|
d8b2940b3c | |
|
5560df8cba | |
|
8f9c5bbabb | |
|
d00659c642 | |
|
3fba7a9e86 | |
|
6e3260c43c | |
|
9a9b06d24d | |
|
7bc362cfdc | |
|
341af62275 | |
|
2d5de8d0fa | |
|
1c9447854f | |
|
c9ffa67ec4 | |
|
6809f445eb | |
|
04e8d009d4 | |
|
72f1672634 | |
|
72ad051dd6 | |
|
1471f74d98 | |
|
56122ce0dd | |
|
b1bd8e7424 | |
|
a39a7c6e0f | |
|
011b9e6a46 | |
|
9344c938bb | |
|
51508603fe | |
|
c083db10f0 | |
|
a0abe5c667 | |
|
9c23553e0b | |
|
8603b5b98b | |
|
285516dc10 | |
|
11ddb5e6bf | |
|
78c11c4ceb | |
|
5e300846b2 | |
|
d764b1ab87 | |
|
4412a2b9a4 | |
|
f508ccea7b | |
|
68314853b8 | |
|
20c3b6f13c | |
|
bdbf6b3df9 | |
|
84259052d1 | |
|
c370cfb68a | |
|
bc89bbce56 | |
|
363d01392a | |
|
0b64594d0a | |
|
3e9834e26d | |
|
e7fe0b20dc | |
|
0761e11cc4 | |
|
a859b5f027 | |
|
36dc9081ef | |
|
760d252808 | |
|
0de6fae1f8 | |
|
e7d28a3bf1 | |
|
3344efd552 | |
|
56c592a5d7 | |
|
1123fd22cb | |
|
393a9401b1 | |
|
fd6c80b840 | |
|
3c3609b5fa | |
|
7a33650863 | |
|
a276a05765 | |
|
dd0d0d71ab | |
|
26f070bfd4 | |
|
ca95d61bf8 | |
|
f1aa82a9ae | |
|
aa5c7ec00d | |
|
203116b614 | |
|
383be3edec | |
|
50ba775915 | |
|
68bf26b08f | |
|
fe09e416bd | |
|
f9199e56c5 | |
|
870ce7ce75 | |
|
7c5e1385cf | |
|
d8ce535dc3 | |
|
49526abf27 | |
|
62a5f8888e | |
|
ebdd9038b7 | |
|
c8629cea5d | |
|
70f79831de | |
|
cbb029d905 | |
|
a0b0f9460f | |
|
220f0b00f1 | |
|
e349323507 | |
|
93badb28ac | |
|
d52844ae67 | |
|
ed97725ea1 | |
|
fae6181a54 | |
|
a14577dfa4 | |
|
7b51a90328 | |
|
94d8373a9e | |
|
1150ce519f | |
|
c266c431f5 | |
|
9ce0dbfbd0 | |
|
84a54c5447 | |
|
bd2a900a37 | |
|
f27c3a8da9 | |
|
caf2bad7b6 | |
|
b409875246 | |
|
a3b928467e |
|
@ -0,0 +1,38 @@
|
|||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: github-actions
|
||||
directory: /
|
||||
schedule:
|
||||
interval: weekly
|
||||
groups:
|
||||
actions-all:
|
||||
patterns:
|
||||
- "*"
|
||||
labels:
|
||||
- "ok-to-test"
|
||||
|
||||
- package-ecosystem: docker
|
||||
directory: /
|
||||
schedule:
|
||||
interval: weekly
|
||||
labels:
|
||||
- "ok-to-test"
|
||||
|
||||
- package-ecosystem: gomod
|
||||
directories:
|
||||
- /
|
||||
- /test
|
||||
schedule:
|
||||
interval: weekly
|
||||
ignore:
|
||||
- dependency-name: "*"
|
||||
update-types:
|
||||
- "version-update:semver-major"
|
||||
- "version-update:semver-minor"
|
||||
groups:
|
||||
k8s:
|
||||
patterns:
|
||||
- "k8s.io/*"
|
||||
- "sigs.k8s.io/*"
|
||||
labels:
|
||||
- "ok-to-test"
|
|
@ -0,0 +1,78 @@
|
|||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ["master"]
|
||||
pull_request:
|
||||
# The branches below must be a subset of the branches above
|
||||
branches: ["master"]
|
||||
schedule:
|
||||
- cron: "0 0 * * 1"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: Analyze
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: ["go"]
|
||||
# CodeQL supports [ $supported-codeql-languages ]
|
||||
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
|
||||
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
|
||||
|
||||
# If the Autobuild fails above, remove it and uncomment the following three lines.
|
||||
# Modify them (or add more) to build your code if your project requires it; please refer to the EXAMPLE below for guidance.
|
||||
|
||||
# - run: |
|
||||
# echo "Run, Build Application using script"
|
||||
# ./location_of_script_within_repo/buildscript.sh
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
|
||||
with:
|
||||
category: "/language:${{matrix.language}}"
|
|
@ -0,0 +1,27 @@
|
|||
# Dependency Review Action
|
||||
#
|
||||
# This Action will scan dependency manifest files that change as part of a Pull Request,
|
||||
# surfacing known-vulnerable versions of the packages declared or updated in the PR.
|
||||
# Once installed, if the workflow run is marked as required,
|
||||
# PRs introducing known-vulnerable packages will be blocked from merging.
|
||||
#
|
||||
# Source repository: https://github.com/actions/dependency-review-action
|
||||
name: 'Dependency Review'
|
||||
on: [pull_request]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
dependency-review:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: 'Checkout Repository'
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- name: 'Dependency Review'
|
||||
uses: actions/dependency-review-action@da24556b548a50705dd671f47852072ea4c105d9 # v4.7.1
|
|
@ -0,0 +1,76 @@
|
|||
# This workflow uses actions that are not certified by GitHub. They are provided
|
||||
# by a third-party and are governed by separate terms of service, privacy
|
||||
# policy, and support documentation.
|
||||
|
||||
name: Scorecard supply-chain security
|
||||
on:
|
||||
# For Branch-Protection check. Only the default branch is supported. See
|
||||
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
|
||||
branch_protection_rule:
|
||||
# To guarantee Maintained check is occasionally updated. See
|
||||
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
|
||||
schedule:
|
||||
- cron: '20 7 * * 2'
|
||||
push:
|
||||
branches: ["master"]
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analysis:
|
||||
name: Scorecard analysis
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed to upload the results to code-scanning dashboard.
|
||||
security-events: write
|
||||
# Needed to publish results and get a badge (see publish_results below).
|
||||
id-token: write
|
||||
contents: read
|
||||
actions: read
|
||||
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: "Checkout code"
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: "Run analysis"
|
||||
uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2
|
||||
with:
|
||||
results_file: results.sarif
|
||||
results_format: sarif
|
||||
# (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
|
||||
# - you want to enable the Branch-Protection check on a *public* repository, or
|
||||
# - you are installing Scorecards on a *private* repository
|
||||
# To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
|
||||
# repo_token: ${{ secrets.SCORECARD_TOKEN }}
|
||||
|
||||
# Public repositories:
|
||||
# - Publish results to OpenSSF REST API for easy access by consumers
|
||||
# - Allows the repository to include the Scorecard badge.
|
||||
# - See https://github.com/ossf/scorecard-action#publishing-results.
|
||||
# For private repositories:
|
||||
# - `publish_results` will always be set to `false`, regardless
|
||||
# of the value entered here.
|
||||
publish_results: true
|
||||
|
||||
# Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
|
||||
# format to the repository Actions tab.
|
||||
- name: "Upload artifact"
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: SARIF file
|
||||
path: results.sarif
|
||||
retention-days: 5
|
||||
|
||||
# Upload the results to GitHub's code scanning dashboard.
|
||||
- name: "Upload to code-scanning"
|
||||
uses: github/codeql-action/upload-sarif@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
|
||||
with:
|
||||
sarif_file: results.sarif
|
|
@ -0,0 +1,33 @@
|
|||
name: tag-release
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- version.txt
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
tag:
|
||||
if: ${{ github.repository == 'kubernetes/node-problem-detector' }}
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- run: /usr/bin/git config --global user.email actions@github.com
|
||||
- run: /usr/bin/git config --global user.name 'GitHub Actions Release Tagger'
|
||||
- run: hack/tag-release.sh
|
||||
id: tag_release
|
||||
outputs:
|
||||
release_tag: ${{ steps.tag_release.outputs.release_tag }}
|
|
@ -6,3 +6,5 @@ pr.env
|
|||
junit*.xml
|
||||
debug.test
|
||||
/output/
|
||||
coverage.out
|
||||
.idea/
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
repos:
|
||||
- repo: https://github.com/gitleaks/gitleaks
|
||||
rev: v8.16.3
|
||||
hooks:
|
||||
- id: gitleaks
|
||||
- repo: https://github.com/golangci/golangci-lint
|
||||
rev: v1.52.2
|
||||
hooks:
|
||||
- id: golangci-lint
|
||||
- repo: https://github.com/jumanjihouse/pre-commit-hooks
|
||||
rev: 3.0.0
|
||||
hooks:
|
||||
- id: shellcheck
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.4.0
|
||||
hooks:
|
||||
- id: end-of-file-fixer
|
||||
- id: trailing-whitespace
|
33
.travis.yml
33
.travis.yml
|
@ -1,33 +0,0 @@
|
|||
os:
|
||||
- linux
|
||||
sudo: required
|
||||
dist: xenial
|
||||
language: go
|
||||
go:
|
||||
- "1.16"
|
||||
- master
|
||||
env:
|
||||
- GO111MODULE=on
|
||||
services:
|
||||
- docker
|
||||
before_install:
|
||||
- sudo apt-get -qq update
|
||||
- sudo apt-get install -y libsystemd-dev
|
||||
install:
|
||||
- mkdir -p $HOME/gopath/src/k8s.io
|
||||
- mv $TRAVIS_BUILD_DIR $HOME/gopath/src/k8s.io/node-problem-detector
|
||||
- cd $HOME/gopath/src/k8s.io/node-problem-detector
|
||||
script:
|
||||
- make
|
||||
- make test
|
||||
- make clean && BUILD_TAGS="disable_custom_plugin_monitor" make
|
||||
- BUILD_TAGS="disable_custom_plugin_monitor" make test
|
||||
- make clean && BUILD_TAGS="disable_system_log_monitor" make
|
||||
- BUILD_TAGS="disable_system_log_monitor" make test
|
||||
- make clean && BUILD_TAGS="disable_system_stats_monitor" make
|
||||
- BUILD_TAGS="disable_system_stats_monitor" make test
|
||||
- make clean && BUILD_TAGS="disable_stackdriver_exporter" make
|
||||
- BUILD_TAGS="disable_stackdriver_exporter" make test
|
||||
- make clean && ENABLE_JOURNALD=0 make
|
||||
- ENABLE_JOURNALD=0 make test
|
||||
- ENABLE_JOURNALD=0 make build-binaries
|
10
CHANGELOG.md
10
CHANGELOG.md
|
@ -29,7 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
|
|||
- Windows build now supported.
|
||||
- Added metrics to retrieve stats such as `procs_running` and `procs_blocked`.
|
||||
- Added metrics to retrieve network stats.
|
||||
- Added metric to retrieve guest OS features such as unknwon modules, ktd,
|
||||
- Added metric to retrieve guest OS features such as unknown modules, ktd,
|
||||
and kernel integrity.
|
||||
|
||||
### Changed
|
||||
|
@ -158,7 +158,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
|
|||
|
||||
- Empty LogPath will now use journald's default path.
|
||||
- Systemd monitor now looks back 5 minutes.
|
||||
- Bumped base image to `k8s.gcr.io/debian-base-amd64:1.0.0`.
|
||||
- Bumped base image to `registry.k8s.io/debian-base-amd64:1.0.0`.
|
||||
- Updated the detection method for docker overlay2 issues.
|
||||
- Moved NPD into the kube-system namespace.
|
||||
|
||||
|
@ -237,7 +237,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
|
|||
- Added resource limits to NPD deployment.
|
||||
- Added log-counter to dockerfile.
|
||||
- Added `enable_message_change_based_condition_update` option to enable
|
||||
condition update when messages cahnge for custom plugin.
|
||||
condition update when messages change for custom plugin.
|
||||
|
||||
### Fixed
|
||||
|
||||
|
@ -248,7 +248,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
|
|||
|
||||
### Changed
|
||||
|
||||
- Bumped base image to `k8s.gcr.io/debian-base-amd64:0.4.0`.
|
||||
- Bumped base image to `registry.k8s.io/debian-base-amd64:0.4.0`.
|
||||
|
||||
## [0.6.0] - 2018-11-27
|
||||
|
||||
|
@ -277,7 +277,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
|
|||
|
||||
- Changed default port from 10256 to 20256 to avoid conflict with kube-proxy.
|
||||
- Bumped golang version from 1.8 to 1.9.
|
||||
- Bumped base image to `k8s.gcr.io/debian-base-amd64:0.3`.
|
||||
- Bumped base image to `registry.k8s.io/debian-base-amd64:0.3`.
|
||||
|
||||
### Fixed
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ If your repo has certain guidelines for contribution, put them here ahead of the
|
|||
|
||||
- [Contributor License Agreement](https://git.k8s.io/community/CLA.md) Kubernetes projects require that you sign a Contributor License Agreement (CLA) before we can accept your pull requests
|
||||
- [Kubernetes Contributor Guide](http://git.k8s.io/community/contributors/guide) - Main contributor documentation, or you can just jump directly to the [contributing section](http://git.k8s.io/community/contributors/guide#contributing)
|
||||
- [Contributor Cheat Sheet](https://git.k8s.io/community/contributors/guide/contributor-cheatsheet.md) - Common resources for existing developers
|
||||
- [Contributor Cheat Sheet](https://git.k8s.io/community/contributors/guide/contributor-cheatsheet/README.md) - Common resources for existing developers
|
||||
|
||||
## Mentorship
|
||||
|
||||
|
@ -28,4 +28,4 @@ Custom Information - if you're copying this template for the first time you can
|
|||
- [Slack channel](https://kubernetes.slack.com/messages/kubernetes-users) - Replace `kubernetes-users` with your slack channel string, this will send users directly to your channel.
|
||||
- [Mailing list](URL)
|
||||
|
||||
-->
|
||||
-->
|
||||
|
|
38
Dockerfile
38
Dockerfile
|
@ -12,20 +12,42 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
ARG BASEIMAGE
|
||||
FROM ${BASEIMAGE}
|
||||
# "builder-base" can be overridden using docker buildx's --build-context flag,
|
||||
# by users who want to use a different image for the builder, e.g. if you need to use an older OS
|
||||
# to avoid dependencies on very recent glibc versions.
|
||||
# E.g. of the param: --build-context builder-base=docker-image://golang:<something>@sha256:<something>
|
||||
# Must override builder-base, not builder, since the latter is referred to later in the file and so must not be
|
||||
# directly replaced. See here, and note that "stage" parameter mentioned there has been renamed to
|
||||
# "build-context": https://github.com/docker/buildx/pull/904#issuecomment-1005871838
|
||||
FROM golang:1.24-bookworm@sha256:00eccd446e023d3cd9566c25a6e6a02b90db3e1e0bbe26a48fc29cd96e800901 as builder-base
|
||||
FROM builder-base as builder
|
||||
LABEL maintainer="Andy Xie <andy.xning@gmail.com>"
|
||||
|
||||
MAINTAINER Random Liu <lantaol@google.com>
|
||||
ARG TARGETARCH
|
||||
|
||||
RUN clean-install util-linux libsystemd0 bash
|
||||
ENV GOPATH /gopath/
|
||||
ENV PATH $GOPATH/bin:$PATH
|
||||
|
||||
RUN apt-get update --fix-missing && apt-get --yes install libsystemd-dev gcc-aarch64-linux-gnu
|
||||
RUN go version
|
||||
|
||||
COPY . /gopath/src/k8s.io/node-problem-detector/
|
||||
WORKDIR /gopath/src/k8s.io/node-problem-detector
|
||||
RUN GOARCH=${TARGETARCH} make bin/node-problem-detector bin/health-checker bin/log-counter
|
||||
|
||||
FROM --platform=${TARGETPLATFORM} registry.k8s.io/build-image/debian-base:bookworm-v1.0.4@sha256:0a17678966f63e82e9c5e246d9e654836a33e13650a698adefede61bb5ca099e as base
|
||||
|
||||
LABEL maintainer="Random Liu <lantaol@google.com>"
|
||||
|
||||
RUN clean-install util-linux bash libsystemd-dev
|
||||
|
||||
# Avoid symlink of /etc/localtime.
|
||||
RUN test -h /etc/localtime && rm -f /etc/localtime && cp /usr/share/zoneinfo/UTC /etc/localtime || true
|
||||
|
||||
COPY ./bin/node-problem-detector /node-problem-detector
|
||||
COPY --from=builder /gopath/src/k8s.io/node-problem-detector/bin/node-problem-detector /node-problem-detector
|
||||
|
||||
ARG LOGCOUNTER
|
||||
COPY ./bin/health-checker ${LOGCOUNTER} /home/kubernetes/bin/
|
||||
COPY --from=builder /gopath/src/k8s.io/node-problem-detector/bin/health-checker /gopath/src/k8s.io/node-problem-detector/${LOGCOUNTER} /home/kubernetes/bin/
|
||||
|
||||
COPY config /config
|
||||
ENTRYPOINT ["/node-problem-detector", "--config.system-log-monitor=/config/kernel-monitor.json"]
|
||||
COPY --from=builder /gopath/src/k8s.io/node-problem-detector/config/ /config
|
||||
ENTRYPOINT ["/node-problem-detector", "--config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json"]
|
||||
|
|
142
Makefile
142
Makefile
|
@ -17,12 +17,16 @@
|
|||
.PHONY: all \
|
||||
vet fmt version test e2e-test \
|
||||
build-binaries build-container build-tar build \
|
||||
docker-builder build-in-docker push-container push-tar push clean
|
||||
docker-builder build-in-docker \
|
||||
push-container push-tar push release clean depup \
|
||||
print-tar-sha-md5
|
||||
|
||||
all: build
|
||||
|
||||
# PLATFORMS is the set of OS_ARCH that NPD can build against.
|
||||
PLATFORMS=linux_amd64 windows_amd64
|
||||
LINUX_PLATFORMS=linux_amd64 linux_arm64
|
||||
DOCKER_PLATFORMS=linux/amd64,linux/arm64
|
||||
PLATFORMS=$(LINUX_PLATFORMS) windows_amd64
|
||||
|
||||
# VERSION is the version of the binary.
|
||||
VERSION?=$(shell if [ -d .git ]; then echo `git describe --tags --dirty`; else echo "UNKNOWN"; fi)
|
||||
|
@ -63,21 +67,24 @@ IMAGE:=$(REGISTRY)/node-problem-detector:$(TAG)
|
|||
# support needs libsystemd-dev or libsystemd-journal-dev.
|
||||
ENABLE_JOURNALD?=1
|
||||
|
||||
ifeq ($(go env GOHOSTOS), darwin)
|
||||
ifeq ($(shell go env GOHOSTOS), darwin)
|
||||
ENABLE_JOURNALD=0
|
||||
else ifeq ($(go env GOHOSTOS), windows)
|
||||
else ifeq ($(shell go env GOHOSTOS), windows)
|
||||
ENABLE_JOURNALD=0
|
||||
endif
|
||||
|
||||
# TODO(random-liu): Support different architectures.
|
||||
# The debian-base:v1.0.0 image built from kubernetes repository is based on
|
||||
# Debian Stretch. It includes systemd 232 with support for both +XZ and +LZ4
|
||||
# compression. +LZ4 is needed on some os distros such as COS.
|
||||
BASEIMAGE:=k8s.gcr.io/debian-base-amd64:v1.0.0
|
||||
|
||||
# Disable cgo by default to make the binary statically linked.
|
||||
CGO_ENABLED:=0
|
||||
|
||||
ifeq ($(GOARCH), arm64)
|
||||
CC:=aarch64-linux-gnu-gcc
|
||||
else
|
||||
CC:=x86_64-linux-gnu-gcc
|
||||
endif
|
||||
|
||||
# Set default Go architecture to AMD64.
|
||||
GOARCH ?= amd64
|
||||
|
||||
# Construct the "-tags" parameter used by "go build".
|
||||
BUILD_TAGS?=
|
||||
|
||||
|
@ -101,15 +108,15 @@ ifeq ($(ENABLE_JOURNALD), 1)
|
|||
CGO_ENABLED:=1
|
||||
LOGCOUNTER=./bin/log-counter
|
||||
else
|
||||
# Hack: Don't copy over log-counter, use a wildcard path that shouldnt match
|
||||
# Hack: Don't copy over log-counter, use a wildcard path that shouldn't match
|
||||
# anything in COPY command.
|
||||
LOGCOUNTER=*dont-include-log-counter
|
||||
endif
|
||||
|
||||
vet:
|
||||
GO111MODULE=on go list -mod vendor -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./... | \
|
||||
go list -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./... | \
|
||||
grep -v "./vendor/*" | \
|
||||
GO111MODULE=on xargs go vet -mod vendor -tags "$(HOST_PLATFORM_BUILD_TAGS)"
|
||||
xargs go vet -tags "$(HOST_PLATFORM_BUILD_TAGS)"
|
||||
|
||||
fmt:
|
||||
find . -type f -name "*.go" | grep -v "./vendor/*" | xargs gofmt -s -w -l
|
||||
|
@ -123,12 +130,13 @@ ifeq ($(ENABLE_JOURNALD), 1)
|
|||
BINARIES_LINUX_ONLY += bin/log-counter
|
||||
endif
|
||||
|
||||
ALL_BINARIES = $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), ./$(binary)) $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), output/linux_amd64/$(binary)) $(foreach binary, $(BINARIES), output/windows_amd64/$(binary).exe)
|
||||
ALL_BINARIES = $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), ./$(binary)) \
|
||||
$(foreach platform, $(LINUX_PLATFORMS), $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), output/$(platform)/$(binary))) \
|
||||
$(foreach binary, $(BINARIES), output/windows_amd64/$(binary).exe)
|
||||
ALL_TARBALLS = $(foreach platform, $(PLATFORMS), $(NPD_NAME_VERSION)-$(platform).tar.gz)
|
||||
|
||||
output/windows_amd64/bin/%.exe: $(PKG_SOURCES)
|
||||
GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) go build \
|
||||
-o $@ \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(WINDOWS_BUILD_TAGS)" \
|
||||
|
@ -136,15 +144,15 @@ output/windows_amd64/bin/%.exe: $(PKG_SOURCES)
|
|||
touch $@
|
||||
|
||||
output/windows_amd64/test/bin/%.exe: $(PKG_SOURCES)
|
||||
GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
-o $@ \
|
||||
cd test && \
|
||||
GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) go build \
|
||||
-o ../$@ \
|
||||
-tags "$(WINDOWS_BUILD_TAGS)" \
|
||||
./test/e2e/$(subst -,,$*)
|
||||
./e2e/$(subst -,,$*)
|
||||
|
||||
output/linux_amd64/bin/%: $(PKG_SOURCES)
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) \
|
||||
CC=x86_64-linux-gnu-gcc go build \
|
||||
-o $@ \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
|
@ -152,17 +160,34 @@ output/linux_amd64/bin/%: $(PKG_SOURCES)
|
|||
touch $@
|
||||
|
||||
output/linux_amd64/test/bin/%: $(PKG_SOURCES)
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
-o $@ \
|
||||
cd test && \
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) \
|
||||
CC=x86_64-linux-gnu-gcc go build \
|
||||
-o ../$@ \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./test/e2e/$(subst -,,$*)
|
||||
./e2e/$(subst -,,$*)
|
||||
|
||||
output/linux_arm64/bin/%: $(PKG_SOURCES)
|
||||
GOOS=linux GOARCH=arm64 CGO_ENABLED=$(CGO_ENABLED) \
|
||||
CC=aarch64-linux-gnu-gcc go build \
|
||||
-o $@ \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./cmd/$(subst -,,$*)
|
||||
touch $@
|
||||
|
||||
output/linux_arm64/test/bin/%: $(PKG_SOURCES)
|
||||
cd test && \
|
||||
GOOS=linux GOARCH=arm64 CGO_ENABLED=$(CGO_ENABLED) \
|
||||
CC=aarch64-linux-gnu-gcc go build \
|
||||
-o ../$@ \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./e2e/$(subst -,,$*)
|
||||
|
||||
# In the future these targets should be deprecated.
|
||||
./bin/log-counter: $(PKG_SOURCES)
|
||||
ifeq ($(ENABLE_JOURNALD), 1)
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
|
||||
-o bin/log-counter \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
|
@ -172,38 +197,37 @@ else
|
|||
endif
|
||||
|
||||
./bin/node-problem-detector: $(PKG_SOURCES)
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
|
||||
-o bin/node-problem-detector \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./cmd/nodeproblemdetector
|
||||
|
||||
./test/bin/problem-maker: $(PKG_SOURCES)
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
-o test/bin/problem-maker \
|
||||
cd test && \
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
|
||||
-o bin/problem-maker \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./test/e2e/problemmaker/problem_maker.go
|
||||
./e2e/problemmaker/problem_maker.go
|
||||
|
||||
./bin/health-checker: $(PKG_SOURCES)
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
|
||||
-o bin/health-checker \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
cmd/healthchecker/health_checker.go
|
||||
|
||||
test: vet fmt
|
||||
GO111MODULE=on go test -mod vendor -timeout=1m -v -race -short -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./...
|
||||
go test -timeout=1m -v -race -short -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./...
|
||||
|
||||
e2e-test: vet fmt build-tar
|
||||
GO111MODULE=on ginkgo -nodes=$(PARALLEL) -mod vendor -timeout=10m -v -tags "$(HOST_PLATFORM_BUILD_TAGS)" -stream \
|
||||
./test/e2e/metriconly/... -- \
|
||||
cd test && \
|
||||
go run github.com/onsi/ginkgo/ginkgo -nodes=$(PARALLEL) -timeout=10m -v -tags "$(HOST_PLATFORM_BUILD_TAGS)" -stream \
|
||||
./e2e/metriconly/... -- \
|
||||
-project=$(PROJECT) -zone=$(ZONE) \
|
||||
-image=$(VM_IMAGE) -image-family=$(IMAGE_FAMILY) -image-project=$(IMAGE_PROJECT) \
|
||||
-ssh-user=$(SSH_USER) -ssh-key=$(SSH_KEY) \
|
||||
-npd-build-tar=`pwd`/$(TARBALL) \
|
||||
-npd-build-tar=`pwd`/../$(TARBALL) \
|
||||
-boskos-project-type=$(BOSKOS_PROJECT_TYPE) -job-name=$(JOB_NAME) \
|
||||
-artifacts-dir=$(ARTIFACTS)
|
||||
|
||||
|
@ -216,8 +240,9 @@ $(NPD_NAME_VERSION)-%.tar.gz: $(ALL_BINARIES) test/e2e-install.sh
|
|||
|
||||
build-binaries: $(ALL_BINARIES)
|
||||
|
||||
build-container: build-binaries Dockerfile
|
||||
docker build -t $(IMAGE) --build-arg BASEIMAGE=$(BASEIMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .
|
||||
build-container: clean Dockerfile
|
||||
docker buildx create --platform $(DOCKER_PLATFORMS) --use
|
||||
docker buildx build --platform $(DOCKER_PLATFORMS) -t $(IMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .
|
||||
|
||||
$(TARBALL): ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker
|
||||
tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker
|
||||
|
@ -229,7 +254,7 @@ build-tar: $(TARBALL) $(ALL_TARBALLS)
|
|||
build: build-container build-tar
|
||||
|
||||
docker-builder:
|
||||
docker build -t npd-builder ./builder
|
||||
docker build -t npd-builder . --target=builder
|
||||
|
||||
build-in-docker: clean docker-builder
|
||||
docker run \
|
||||
|
@ -237,17 +262,46 @@ build-in-docker: clean docker-builder
|
|||
-c 'cd /gopath/src/k8s.io/node-problem-detector/ && make build-binaries'
|
||||
|
||||
push-container: build-container
|
||||
# So we can push to docker hub by setting REGISTRY
|
||||
ifneq (,$(findstring gcr.io,$(REGISTRY)))
|
||||
gcloud auth configure-docker
|
||||
docker push $(IMAGE)
|
||||
endif
|
||||
# Build should be cached from build-container
|
||||
docker buildx build --push --platform $(DOCKER_PLATFORMS) -t $(IMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .
|
||||
|
||||
push-tar: build-tar
|
||||
gsutil cp $(TARBALL) $(UPLOAD_PATH)/node-problem-detector/
|
||||
gsutil cp node-problem-detector-$(VERSION)-*.tar.gz* $(UPLOAD_PATH)/node-problem-detector/
|
||||
|
||||
# `make push` is used by presubmit and CI jobs.
|
||||
push: push-container push-tar
|
||||
|
||||
# `make release` is used when releasing a new NPD version.
|
||||
release: push-container build-tar print-tar-sha-md5
|
||||
|
||||
print-tar-sha-md5: build-tar
|
||||
./hack/print-tar-sha-md5.sh $(VERSION)
|
||||
|
||||
coverage.out:
|
||||
rm -f coverage.out
|
||||
go test -coverprofile=coverage.out -timeout=1m -v -short ./...
|
||||
|
||||
clean:
|
||||
rm -rf bin/
|
||||
rm -rf test/bin/
|
||||
rm -f node-problem-detector-*.tar.gz*
|
||||
rm -rf output/
|
||||
rm -f coverage.out
|
||||
|
||||
.PHONY: gomod
|
||||
gomod:
|
||||
go mod tidy
|
||||
go mod vendor
|
||||
cd test; go mod tidy
|
||||
|
||||
.PHONY: goget
|
||||
goget:
|
||||
go get $(shell go list -f '{{if not (or .Main .Indirect)}}{{.Path}}{{end}}' -mod=mod -m all)
|
||||
|
||||
.PHONY: depup
|
||||
depup: goget gomod
|
||||
|
|
10
OWNERS
10
OWNERS
|
@ -1,12 +1,14 @@
|
|||
reviewers:
|
||||
- Random-Liu
|
||||
- dchen1107
|
||||
- sig-node-reviewers
|
||||
- andyxning
|
||||
- wangzhen127
|
||||
- xueweiz
|
||||
- vteratipally
|
||||
- mmiranda96
|
||||
- hakman
|
||||
approvers:
|
||||
- Random-Liu
|
||||
- dchen1107
|
||||
- sig-node-approvers
|
||||
- andyxning
|
||||
- wangzhen127
|
||||
- xueweiz
|
||||
- vteratipally
|
|
@ -0,0 +1,19 @@
|
|||
aliases:
|
||||
sig-node-approvers:
|
||||
- Random-Liu
|
||||
- dchen1107
|
||||
- derekwaynecarr
|
||||
- yujuhong
|
||||
- sjenning
|
||||
- mrunalp
|
||||
- klueska
|
||||
- SergeyKanzhelev
|
||||
- tallclair
|
||||
sig-node-reviewers:
|
||||
- Random-Liu
|
||||
- dchen1107
|
||||
- derekwaynecarr
|
||||
- yujuhong
|
||||
- sjenning
|
||||
- mrunalp
|
||||
- klueska
|
88
README.md
88
README.md
|
@ -7,11 +7,11 @@ layers in the cluster management stack.
|
|||
It is a daemon that runs on each node, detects node
|
||||
problems and reports them to apiserver.
|
||||
node-problem-detector can either run as a
|
||||
[DaemonSet](http://kubernetes.io/docs/admin/daemons/) or run standalone.
|
||||
[DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) or run standalone.
|
||||
Now it is running as a
|
||||
[Kubernetes Addon](https://github.com/kubernetes/kubernetes/tree/master/cluster/addons)
|
||||
enabled by default in the GCE cluster.
|
||||
|
||||
enabled by default in the GKE cluster. It is also enabled by default in AKS as part of the
|
||||
[AKS Linux Extension](https://learn.microsoft.com/en-us/azure/aks/faq#what-is-the-purpose-of-the-aks-linux-extension-i-see-installed-on-my-linux-vmss-instances).
|
||||
# Background
|
||||
|
||||
There are tons of node problems that could possibly affect the pods running on the
|
||||
|
@ -41,8 +41,8 @@ should be reported as `Event`.
|
|||
|
||||
# Problem Daemon
|
||||
|
||||
A problem daemon is a sub-daemon of node-problem-detector. It monitors a specific
|
||||
kind of node problems and reports them to node-problem-detector.
|
||||
A problem daemon is a sub-daemon of node-problem-detector. It monitors specific
|
||||
kinds of node problems and reports them to node-problem-detector.
|
||||
|
||||
A problem daemon could be:
|
||||
* A tiny daemon designed for dedicated Kubernetes use-cases.
|
||||
|
@ -62,9 +62,9 @@ List of supported problem daemons types:
|
|||
| Problem Daemon Types | NodeCondition | Description | Configs | Disabling Build Tag |
|
||||
|----------------|:---------------:|:------------|:--------|:--------------------|
|
||||
| [SystemLogMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemlogmonitor) | KernelDeadlock ReadonlyFilesystem FrequentKubeletRestart FrequentDockerRestart FrequentContainerdRestart | A system log monitor monitors system log and reports problems and metrics according to predefined rules. | [filelog](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor-filelog.json), [kmsg](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json), [kernel](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor-counter.json) [abrt](https://github.com/kubernetes/node-problem-detector/blob/master/config/abrt-adaptor.json) [systemd](https://github.com/kubernetes/node-problem-detector/blob/master/config/systemd-monitor-counter.json) | disable_system_log_monitor
|
||||
| [SystemStatsMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor) | None(Could be added in the future) | A system stats monitor for node-problem-detector to collect various health-related system stats as metrics. See the proposal [here](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit). | | disable_system_stats_monitor
|
||||
| [SystemStatsMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor) | None(Could be added in the future) | A system stats monitor for node-problem-detector to collect various health-related system stats as metrics. See the proposal [here](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit). | [system-stats-monitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json) | disable_system_stats_monitor
|
||||
| [CustomPluginMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/custompluginmonitor) | On-demand(According to users configuration), existing example: NTPProblem | A custom plugin monitor for node-problem-detector to invoke and check various node problems with user-defined check scripts. See the proposal [here](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#). | [example](https://github.com/kubernetes/node-problem-detector/blob/4ad49bbd84b8ced45ac825eac01ec93d9235935e/config/custom-plugin-monitor.json) | disable_custom_plugin_monitor
|
||||
| [HealthChecker](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/healthchecker) | KubeletUnhealthy ContainerRuntimeUnhealthy| A health checker for node-problem-detector to check kubelet and container runtime health. | [kubelet](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-kubelet.json) [docker](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-docker.json) |
|
||||
| [HealthChecker](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/healthchecker) | KubeletUnhealthy ContainerRuntimeUnhealthy| A health checker for node-problem-detector to check kubelet and container runtime health. | [kubelet](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-kubelet.json) [docker](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-docker.json) [containerd](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-containerd.json) |
|
||||
|
||||
# Exporter
|
||||
|
||||
|
@ -102,9 +102,14 @@ certain backends. Some of them can be disabled at compile-time using a build tag
|
|||
|
||||
* `--config.custom-plugin-monitor`: List of paths to custom plugin monitor config files, comma-separated, e.g.
|
||||
[config/custom-plugin-monitor.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json).
|
||||
Node problem detector will start a separate custom plugin monitor for each configuration. You can
|
||||
Node problem detector will start a separate custom plugin monitor for each configuration. You can
|
||||
use different custom plugin monitors to monitor different node problems.
|
||||
|
||||
|
||||
#### For Health Checkers
|
||||
|
||||
Health checkers are configured as custom plugins, using the config/health-checker-*.json config files.
|
||||
|
||||
#### For Kubernetes exporter
|
||||
|
||||
* `--enable-k8s-exporter`: Enables reporting to Kubernetes API server, default to `true`.
|
||||
|
@ -137,12 +142,12 @@ For example, to run without auth, use the following config:
|
|||
|
||||
## Build Image
|
||||
|
||||
* `go get` or `git clone` node-problem-detector repo into `$GOPATH/src/k8s.io` or `$GOROOT/src/k8s.io`
|
||||
with one of the below directions:
|
||||
* `cd $GOPATH/src/k8s.io && git clone git@github.com:kubernetes/node-problem-detector.git`
|
||||
* `cd $GOPATH/src/k8s.io && go get k8s.io/node-problem-detector`
|
||||
* Install development dependencies for `libsystemd` and the ARM GCC toolchain
|
||||
* Debian/Ubuntu: `apt install libsystemd-dev gcc-aarch64-linux-gnu`
|
||||
|
||||
* run `make` in the top directory. It will:
|
||||
* `git clone git@github.com:kubernetes/node-problem-detector.git`
|
||||
|
||||
* Run `make` in the top directory. It will:
|
||||
* Build the binary.
|
||||
* Build the docker image. The binary and `config/` are copied into the docker image.
|
||||
|
||||
|
@ -158,11 +163,6 @@ and [System Stats Monitor](https://github.com/kubernetes/node-problem-detector/t
|
|||
Check out the [Problem Daemon](https://github.com/kubernetes/node-problem-detector#problem-daemon) section
|
||||
to see how to disable each problem daemon during compilation time.
|
||||
|
||||
**Note**:
|
||||
By default, node-problem-detector will be built with systemd support with the `make` command. This requires systemd develop files.
|
||||
You should download the systemd develop files first. For Ubuntu, the `libsystemd-journal-dev` package should
|
||||
be installed. For Debian, the `libsystemd-dev` package should be installed.
|
||||
|
||||
## Push Image
|
||||
|
||||
`make push` uploads the docker image to a registry. By default, the image will be uploaded to
|
||||
|
@ -175,7 +175,7 @@ The easiest way to install node-problem-detector into your cluster is to use the
|
|||
|
||||
```
|
||||
helm repo add deliveryhero https://charts.deliveryhero.io/
|
||||
helm install deliveryhero/node-problem-detector
|
||||
helm install --generate-name deliveryhero/node-problem-detector
|
||||
```
|
||||
|
||||
Alternatively, to install node-problem-detector manually:
|
||||
|
@ -184,9 +184,13 @@ Alternatively, to install node-problem-detector manually:
|
|||
|
||||
2. Edit [node-problem-detector-config.yaml](deployment/node-problem-detector-config.yaml) to configure node-problem-detector.
|
||||
|
||||
3. Create the ConfigMap with `kubectl create -f node-problem-detector-config.yaml`.
|
||||
3. Edit [rbac.yaml](deployment/rbac.yaml) to fit your environment.
|
||||
|
||||
3. Create the DaemonSet with `kubectl create -f node-problem-detector.yaml`.
|
||||
4. Create the ServiceAccount and ClusterRoleBinding with `kubectl create -f rbac.yaml`.
|
||||
|
||||
4. Create the ConfigMap with `kubectl create -f node-problem-detector-config.yaml`.
|
||||
|
||||
5. Create the DaemonSet with `kubectl create -f node-problem-detector.yaml`.
|
||||
|
||||
## Start Standalone
|
||||
|
||||
|
@ -214,7 +218,7 @@ To develop NPD on Windows you'll need to setup your Windows machine for Go devel
|
|||
* [Go](https://golang.org/)
|
||||
* [Visual Studio Code](https://code.visualstudio.com/)
|
||||
* [Make](http://gnuwin32.sourceforge.net/packages/make.htm)
|
||||
* [mingw-64 WinBuilds](http://mingw-w64.org/doku.php/download/win-builds)
|
||||
* [mingw-64 WinBuilds](http://mingw-w64.org/downloads)
|
||||
* Tested with x86-64 Windows Native mode.
|
||||
* Add the `$InstallDir\bin` to [Windows `PATH` variable](https://answers.microsoft.com/en-us/windows/forum/windows_10-other_settings-winpc/adding-path-variable/97300613-20cb-4d85-8d0e-cc9d3549ba23).
|
||||
|
||||
|
@ -222,16 +226,16 @@ To develop NPD on Windows you'll need to setup your Windows machine for Go devel
|
|||
# Run these commands in the node-problem-detector directory.
|
||||
|
||||
# Build in MINGW64 Window
|
||||
make clean windows-binaries
|
||||
make clean ENABLE_JOURNALD=0 build-binaries
|
||||
|
||||
# Test in MINGW64 Window
|
||||
make test
|
||||
|
||||
# Run with containerd log monitoring enabled in Command Prompt. (Assumes containerd is installed.)
|
||||
%CD%\output\windows_amd64\node-problem-detector.exe --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=%CD%\config\windows-containerd-monitor-filelog.json --config.system-stats-monitor=config\windows-system-stats-monitor.json
|
||||
%CD%\output\windows_amd64\bin\node-problem-detector.exe --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=%CD%\config\windows-containerd-monitor-filelog.json --config.system-stats-monitor=config\windows-system-stats-monitor.json
|
||||
|
||||
# Configure NPD to run as a Windows Service
|
||||
sc.exe create NodeProblemDetector binpath= "%CD%\node-problem-detector.exe [FLAGS]" start= demand
|
||||
sc.exe create NodeProblemDetector binpath= "%CD%\node-problem-detector.exe [FLAGS]" start= demand
|
||||
sc.exe failure NodeProblemDetector reset= 0 actions= restart/10000
|
||||
sc.exe start NodeProblemDetector
|
||||
```
|
||||
|
@ -264,9 +268,9 @@ For example, to test [KernelMonitor](https://github.com/kubernetes/node-problem-
|
|||
node-problem-detector uses [go modules](https://github.com/golang/go/wiki/Modules)
|
||||
to manage dependencies. Therefore, building node-problem-detector requires
|
||||
golang 1.11+. It still uses vendoring. See the
|
||||
[Kubernetes go modules KEP](https://github.com/kubernetes/enhancements/blob/master/keps/sig-architecture/2019-03-19-go-modules.md#alternatives-to-vendoring-using-go-modules)
|
||||
[Kubernetes go modules KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-architecture/917-go-modules#alternatives-to-vendoring-using-go-modules)
|
||||
for the design decisions. To add a new dependency, update [go.mod](go.mod) and
|
||||
run `GO111MODULE=on go mod vendor`.
|
||||
run `go mod vendor`.
|
||||
|
||||
# Remedy Systems
|
||||
|
||||
|
@ -275,30 +279,26 @@ detected by the node-problem-detector. Remedy systems observe events and/or node
|
|||
conditions emitted by the node-problem-detector and take action to return the
|
||||
Kubernetes cluster to a healthy state. The following remedy systems exist:
|
||||
|
||||
* [**Draino**](https://github.com/planetlabs/draino) automatically drains Kubernetes
|
||||
nodes based on labels and node conditions. Nodes that match _all_ of the supplied
|
||||
labels and _any_ of the supplied node conditions will be prevented from accepting
|
||||
new pods (aka 'cordoned') immediately, and
|
||||
[drained](https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/)
|
||||
after a configurable time. Draino can be used in conjunction with the
|
||||
[Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler)
|
||||
to automatically terminate drained nodes. Refer to
|
||||
[this issue](https://github.com/kubernetes/node-problem-detector/issues/199)
|
||||
for an example production use case for Draino.
|
||||
* [**Descheduler**](https://github.com/kubernetes-sigs/descheduler) strategy RemovePodsViolatingNodeTaints
|
||||
evicts pods violating NoSchedule taints on nodes. The k8s scheduler's TaintNodesByCondition feature must
|
||||
be enabled. The [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler)
|
||||
can be used to automatically terminate drained nodes.
|
||||
* [**mediK8S**](https://github.com/medik8s) is an umbrella project for automatic remediation
|
||||
system built on [Node Health Check Operator (NHC)](https://github.com/medik8s/node-healthcheck-operator) that monitors
|
||||
node conditions and delegates remediation to external remediators using the Remediation API. [Poison-Pill](https://github.com/medik8s/poison-pill)
|
||||
is a remediator that will reboot the node and make sure all stateful workloads are rescheduled. NHC supports conditionally remediating if the cluster
|
||||
has enough healthy capacity, or manually pausing any action to minimize cluster disruption.
|
||||
* [**MachineHealthCheck**](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-health-check) of [Cluster API](https://cluster-api.sigs.k8s.io/) is responsible for remediating unhealthy Machines.
|
||||
|
||||
# Testing
|
||||
|
||||
NPD is tested via unit tests, [NPD e2e tests](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/README.md), Kubernetes e2e tests and Kubernetes nodes e2e tests. Prow handles the [pre-submit tests](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/node-problem-detector/node-problem-detector-presubmits.yaml) and [CI tests](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/node-problem-detector/node-problem-detector-ci.yaml).
|
||||
|
||||
CI test results can be found below:
|
||||
1. [Unit tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-test)
|
||||
2. [NPD e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-test)
|
||||
3. [Kubernetes e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-kubernetes-gce-gci)
|
||||
4. [Kubernetes nodes e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-node)
|
||||
1. [Unit tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-test)
|
||||
2. [NPD e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-test)
|
||||
3. [Kubernetes e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-kubernetes-gce-gci)
|
||||
4. [Kubernetes nodes e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-node)
|
||||
|
||||
## Running tests
|
||||
|
||||
|
@ -310,6 +310,10 @@ See [NPD e2e test documentation](https://github.com/kubernetes/node-problem-dete
|
|||
|
||||
[Problem maker](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/problemmaker/README.md) is a program used in NPD e2e tests to generate/simulate node problems. It is ONLY intended to be used by NPD e2e tests. Please do NOT run it on your workstation, as it could cause real node problems.
|
||||
|
||||
# Compatibility
|
||||
|
||||
Node problem detector's architecture has been fairly stable. Recent versions (v0.8.13+) should be able to work with any supported Kubernetes version.
|
||||
|
||||
# Docs
|
||||
|
||||
* [Custom plugin monitor](docs/custom_plugin_monitor.md)
|
||||
|
@ -320,4 +324,4 @@ See [NPD e2e test documentation](https://github.com/kubernetes/node-problem-dete
|
|||
* [Slides](https://docs.google.com/presentation/d/1bkJibjwWXy8YnB5fna6p-Ltiy-N5p01zUsA22wCNkXA/edit?usp=sharing)
|
||||
* [Plugin Interface Proposal](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#)
|
||||
* [Addon Manifest](https://github.com/kubernetes/kubernetes/tree/master/cluster/addons/node-problem-detector)
|
||||
* [Metrics Mode Proposal](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit)
|
||||
* [Metrics Mode Proposal](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit)
|
||||
|
|
|
@ -1,25 +0,0 @@
|
|||
# Copyright 2018 The Kubernetes Authors. All rights reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM golang:1.11.0
|
||||
LABEL maintainer="Andy Xie <andy.xning@gmail.com>"
|
||||
|
||||
ENV GOPATH /gopath/
|
||||
ENV PATH $GOPATH/bin:$PATH
|
||||
|
||||
RUN apt-get update && apt-get --yes install libsystemd-dev
|
||||
RUN go version
|
||||
RUN go get github.com/tools/godep
|
||||
RUN godep version
|
||||
CMD ["/bin/bash"]
|
|
@ -0,0 +1,26 @@
|
|||
# See https://cloud.google.com/cloud-build/docs/build-config
|
||||
|
||||
# this must be specified in seconds. If omitted, defaults to 600s (10 mins)
|
||||
timeout: 3600s
|
||||
options:
|
||||
# job builds a multi-arch docker image for amd64 and arm64
|
||||
machineType: E2_HIGHCPU_8
|
||||
steps:
|
||||
- name: 'gcr.io/k8s-staging-test-infra/gcb-docker-gcloud:v20230623-56e06d7c18'
|
||||
entrypoint: bash
|
||||
env:
|
||||
- PROW_GIT_TAG=$_GIT_TAG
|
||||
- PULL_BASE_REF=$_PULL_BASE_REF
|
||||
- VERSION=$_PULL_BASE_REF
|
||||
- DOCKER_CLI_EXPERIMENTAL=enabled
|
||||
args:
|
||||
- -c
|
||||
- |
|
||||
echo "Building/Pushing NPD containers"
|
||||
apk add musl-dev gcc
|
||||
make push-container
|
||||
substitutions:
|
||||
# _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and
|
||||
# can be used as a substitution
|
||||
_GIT_TAG: 'PLACE_HOLDER'
|
||||
_PULL_BASE_REF: 'master'
|
|
@ -23,17 +23,24 @@ import (
|
|||
|
||||
"github.com/spf13/pflag"
|
||||
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Set glog flag so that it does not log to files.
|
||||
if err := flag.Set("logtostderr", "true"); err != nil {
|
||||
fmt.Printf("Failed to set logtostderr=true: %v", err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
|
||||
klog.InitFlags(klogFlags)
|
||||
klogFlags.VisitAll(func(f *flag.Flag) {
|
||||
switch f.Name {
|
||||
case "v", "vmodule", "logtostderr":
|
||||
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
|
||||
}
|
||||
})
|
||||
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
|
||||
pflag.CommandLine.MarkHidden("vmodule")
|
||||
pflag.CommandLine.MarkHidden("logtostderr")
|
||||
|
||||
hco := options.NewHealthCheckerOptions()
|
||||
hco.AddFlags(pflag.CommandLine)
|
||||
|
|
|
@ -39,7 +39,9 @@ type HealthCheckerOptions struct {
|
|||
EnableRepair bool
|
||||
CriCtlPath string
|
||||
CriSocketPath string
|
||||
CriTimeout time.Duration
|
||||
CoolDownTime time.Duration
|
||||
LoopBackTime time.Duration
|
||||
HealthCheckTimeout time.Duration
|
||||
LogPatterns types.LogPatternFlag
|
||||
}
|
||||
|
@ -61,8 +63,12 @@ func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
|
|||
"The path to the crictl binary. This is used to check health of cri component.")
|
||||
fs.StringVar(&hco.CriSocketPath, "cri-socket-path", types.DefaultCriSocketPath,
|
||||
"The path to the cri socket. Used with crictl to specify the socket path.")
|
||||
fs.DurationVar(&hco.CriTimeout, "cri-timeout", types.DefaultCriTimeout,
|
||||
"The duration to wait for crictl to run.")
|
||||
fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
|
||||
"The duration to wait for the service to be up before attempting repair.")
|
||||
fs.DurationVar(&hco.LoopBackTime, "loopback-time", types.DefaultLoopBackTime,
|
||||
"The duration to loop back, if it is 0, health-check will check from start time.")
|
||||
fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
|
||||
"The time to wait before marking the component as unhealthy.")
|
||||
fs.Var(&hco.LogPatterns, "log-pattern",
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build journald
|
||||
// +build journald
|
||||
|
||||
/*
|
||||
|
@ -25,17 +26,24 @@ import (
|
|||
|
||||
"github.com/spf13/pflag"
|
||||
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/logcounter/options"
|
||||
"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/logcounter"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Set glog flag so that it does not log to files.
|
||||
if err := flag.Set("logtostderr", "true"); err != nil {
|
||||
fmt.Printf("Failed to set logtostderr=true: %v", err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
|
||||
klog.InitFlags(klogFlags)
|
||||
klogFlags.VisitAll(func(f *flag.Flag) {
|
||||
switch f.Name {
|
||||
case "v", "vmodule", "logtostderr":
|
||||
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
|
||||
}
|
||||
})
|
||||
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
|
||||
pflag.CommandLine.MarkHidden("vmodule")
|
||||
pflag.CommandLine.MarkHidden("logtostderr")
|
||||
|
||||
fedo := options.NewLogCounterOptions()
|
||||
fedo.AddFlags(pflag.CommandLine)
|
||||
|
|
|
@ -34,6 +34,7 @@ type LogCounterOptions struct {
|
|||
Lookback string
|
||||
Delay string
|
||||
Pattern string
|
||||
RevertPattern string
|
||||
Count int
|
||||
}
|
||||
|
||||
|
@ -46,6 +47,8 @@ func (fedo *LogCounterOptions) AddFlags(fs *pflag.FlagSet) {
|
|||
"The time duration log watcher delays after node boot time. This is useful when log watcher needs to wait for some time until the node is stable.")
|
||||
fs.StringVar(&fedo.Pattern, "pattern", "",
|
||||
"The regular expression to match the problem in log. The pattern must match to the end of the line.")
|
||||
fs.StringVar(&fedo.RevertPattern, "revert-pattern", "",
|
||||
"Similar to --pattern but conversely it decreases count value for every match. This is useful to discount a log when another log occurs.")
|
||||
fs.IntVar(&fedo.Count, "count", 1,
|
||||
"The number of times the pattern must be found to trigger the condition")
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_stackdriver_exporter
|
||||
// +build !disable_stackdriver_exporter
|
||||
|
||||
/*
|
||||
|
|
|
@ -17,7 +17,9 @@ limitations under the License.
|
|||
package main
|
||||
|
||||
import (
|
||||
"github.com/golang/glog"
|
||||
"context"
|
||||
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
_ "k8s.io/node-problem-detector/cmd/nodeproblemdetector/exporterplugins"
|
||||
_ "k8s.io/node-problem-detector/cmd/nodeproblemdetector/problemdaemonplugins"
|
||||
|
@ -31,16 +33,7 @@ import (
|
|||
"k8s.io/node-problem-detector/pkg/version"
|
||||
)
|
||||
|
||||
func npdInteractive(npdo *options.NodeProblemDetectorOptions) {
|
||||
termCh := make(chan error, 1)
|
||||
defer close(termCh)
|
||||
|
||||
if err := npdMain(npdo, termCh); err != nil {
|
||||
glog.Fatalf("Problem detector failed with error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func npdMain(npdo *options.NodeProblemDetectorOptions, termCh <-chan error) error {
|
||||
func npdMain(ctx context.Context, npdo *options.NodeProblemDetectorOptions) error {
|
||||
if npdo.PrintVersion {
|
||||
version.PrintVersion()
|
||||
return nil
|
||||
|
@ -53,18 +46,18 @@ func npdMain(npdo *options.NodeProblemDetectorOptions, termCh <-chan error) erro
|
|||
// Initialize problem daemons.
|
||||
problemDaemons := problemdaemon.NewProblemDaemons(npdo.MonitorConfigPaths)
|
||||
if len(problemDaemons) == 0 {
|
||||
glog.Fatalf("No problem daemon is configured")
|
||||
klog.Fatalf("No problem daemon is configured")
|
||||
}
|
||||
|
||||
// Initialize exporters.
|
||||
defaultExporters := []types.Exporter{}
|
||||
if ke := k8sexporter.NewExporterOrDie(npdo); ke != nil {
|
||||
if ke := k8sexporter.NewExporterOrDie(ctx, npdo); ke != nil {
|
||||
defaultExporters = append(defaultExporters, ke)
|
||||
glog.Info("K8s exporter started.")
|
||||
klog.Info("K8s exporter started.")
|
||||
}
|
||||
if pe := prometheusexporter.NewExporterOrDie(npdo); pe != nil {
|
||||
defaultExporters = append(defaultExporters, pe)
|
||||
glog.Info("Prometheus exporter started.")
|
||||
klog.Info("Prometheus exporter started.")
|
||||
}
|
||||
|
||||
plugableExporters := exporters.NewExporters()
|
||||
|
@ -74,10 +67,10 @@ func npdMain(npdo *options.NodeProblemDetectorOptions, termCh <-chan error) erro
|
|||
npdExporters = append(npdExporters, plugableExporters...)
|
||||
|
||||
if len(npdExporters) == 0 {
|
||||
glog.Fatalf("No exporter is successfully setup")
|
||||
klog.Fatalf("No exporter is successfully setup")
|
||||
}
|
||||
|
||||
// Initialize NPD core.
|
||||
p := problemdetector.NewProblemDetector(problemDaemons, npdExporters)
|
||||
return p.Run(termCh)
|
||||
return p.Run(ctx)
|
||||
}
|
||||
|
|
|
@ -1,30 +0,0 @@
|
|||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/spf13/pflag"
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
)
|
||||
|
||||
func main() {
|
||||
npdo := options.NewNodeProblemDetectorOptions()
|
||||
npdo.AddFlags(pflag.CommandLine)
|
||||
|
||||
pflag.Parse()
|
||||
npdInteractive(npdo)
|
||||
}
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_system_log_monitor
|
||||
// +build !disable_system_log_monitor
|
||||
|
||||
/*
|
||||
|
@ -19,9 +20,8 @@ limitations under the License.
|
|||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
@ -81,24 +81,22 @@ func TestNPDMain(t *testing.T) {
|
|||
npdo, cleanup := setupNPD(t)
|
||||
defer cleanup()
|
||||
|
||||
termCh := make(chan error, 2)
|
||||
termCh <- errors.New("close")
|
||||
defer close(termCh)
|
||||
|
||||
if err := npdMain(npdo, termCh); err != nil {
|
||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
cancelFunc()
|
||||
if err := npdMain(ctx, npdo); err != nil {
|
||||
t.Errorf("termination signal should not return error got, %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func writeTempFile(t *testing.T, ext string, contents string) (string, error) {
|
||||
f, err := ioutil.TempFile("", "*."+ext)
|
||||
f, err := os.CreateTemp("", "*."+ext)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("cannot create temp file, %v", err)
|
||||
}
|
||||
|
||||
fileName := f.Name()
|
||||
|
||||
if err := ioutil.WriteFile(fileName, []byte(contents), 0644); err != nil {
|
||||
if err := os.WriteFile(fileName, []byte(contents), 0644); err != nil {
|
||||
os.Remove(fileName)
|
||||
return "", fmt.Errorf("cannot write config to temp file %s, %v", fileName, err)
|
||||
}
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
//go:build unix
|
||||
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
)
|
||||
|
||||
func main() {
|
||||
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
|
||||
klog.InitFlags(klogFlags)
|
||||
klogFlags.VisitAll(func(f *flag.Flag) {
|
||||
switch f.Name {
|
||||
case "v", "vmodule", "logtostderr":
|
||||
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
|
||||
}
|
||||
})
|
||||
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
|
||||
pflag.CommandLine.MarkHidden("vmodule")
|
||||
pflag.CommandLine.MarkHidden("logtostderr")
|
||||
|
||||
npdo := options.NewNodeProblemDetectorOptions()
|
||||
npdo.AddFlags(pflag.CommandLine)
|
||||
|
||||
pflag.Parse()
|
||||
if err := npdMain(context.Background(), npdo); err != nil {
|
||||
klog.Fatalf("Problem detector failed with error: %v", err)
|
||||
}
|
||||
}
|
|
@ -17,16 +17,17 @@ limitations under the License.
|
|||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"github.com/spf13/pflag"
|
||||
"golang.org/x/sys/windows/svc"
|
||||
"golang.org/x/sys/windows/svc/debug"
|
||||
"golang.org/x/sys/windows/svc/eventlog"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
)
|
||||
|
||||
|
@ -43,6 +44,18 @@ var (
|
|||
)
|
||||
|
||||
func main() {
|
||||
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
|
||||
klog.InitFlags(klogFlags)
|
||||
klogFlags.VisitAll(func(f *flag.Flag) {
|
||||
switch f.Name {
|
||||
case "v", "vmodule", "logtostderr":
|
||||
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
|
||||
}
|
||||
})
|
||||
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
|
||||
pflag.CommandLine.MarkHidden("vmodule")
|
||||
pflag.CommandLine.MarkHidden("logtostderr")
|
||||
|
||||
npdo := options.NewNodeProblemDetectorOptions()
|
||||
npdo.AddFlags(pflag.CommandLine)
|
||||
|
||||
|
@ -62,7 +75,7 @@ func main() {
|
|||
func isRunningAsWindowsService() bool {
|
||||
runningAsService, err := svc.IsWindowsService()
|
||||
if err != nil {
|
||||
glog.Errorf("cannot determine if running as Windows Service assuming standalone, %v", err)
|
||||
klog.Errorf("cannot determine if running as Windows Service assuming standalone, %v", err)
|
||||
return false
|
||||
}
|
||||
return runningAsService
|
||||
|
@ -102,26 +115,20 @@ type npdService struct {
|
|||
}
|
||||
|
||||
func (s *npdService) Execute(args []string, r <-chan svc.ChangeRequest, changes chan<- svc.Status) (bool, uint32) {
|
||||
appTermCh := make(chan error, 1)
|
||||
svcLoopTermCh := make(chan error, 1)
|
||||
defer func() {
|
||||
close(appTermCh)
|
||||
close(svcLoopTermCh)
|
||||
}()
|
||||
|
||||
changes <- svc.Status{State: svc.StartPending}
|
||||
changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
|
||||
var appWG sync.WaitGroup
|
||||
var svcWG sync.WaitGroup
|
||||
|
||||
options := s.options
|
||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
|
||||
// NPD application goroutine.
|
||||
appWG.Add(1)
|
||||
go func() {
|
||||
defer appWG.Done()
|
||||
|
||||
if err := npdMain(options, appTermCh); err != nil {
|
||||
if err := npdMain(ctx, options); err != nil {
|
||||
elog.Warning(windowsEventLogID, err.Error())
|
||||
}
|
||||
|
||||
|
@ -132,16 +139,36 @@ func (s *npdService) Execute(args []string, r <-chan svc.ChangeRequest, changes
|
|||
svcWG.Add(1)
|
||||
go func() {
|
||||
defer svcWG.Done()
|
||||
|
||||
serviceLoop(r, changes, appTermCh, svcLoopTermCh)
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case c := <-r:
|
||||
switch c.Cmd {
|
||||
case svc.Interrogate:
|
||||
changes <- c.CurrentStatus
|
||||
// Testing deadlock from https://code.google.com/p/winsvc/issues/detail?id=4
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
changes <- c.CurrentStatus
|
||||
case svc.Stop, svc.Shutdown:
|
||||
elog.Info(windowsEventLogID, fmt.Sprintf("Stopping %s service, %v", svcName, c.Context))
|
||||
cancelFunc()
|
||||
case svc.Pause:
|
||||
elog.Info(windowsEventLogID, "ignoring pause command from Windows service control, not supported")
|
||||
changes <- svc.Status{State: svc.Paused, Accepts: svcCommandsAccepted}
|
||||
case svc.Continue:
|
||||
elog.Info(windowsEventLogID, "ignoring continue command from Windows service control, not supported")
|
||||
changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
|
||||
default:
|
||||
elog.Error(windowsEventLogID, fmt.Sprintf("unexpected control request #%d", c))
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait for the application go routine to die.
|
||||
appWG.Wait()
|
||||
|
||||
// Ensure that the service control loop is killed.
|
||||
svcLoopTermCh <- nil
|
||||
|
||||
// Wait for the service control loop to terminate.
|
||||
// Otherwise it's possible that the channel closures cause the application to panic.
|
||||
svcWG.Wait()
|
||||
|
@ -151,31 +178,3 @@ func (s *npdService) Execute(args []string, r <-chan svc.ChangeRequest, changes
|
|||
|
||||
return false, uint32(0)
|
||||
}
|
||||
|
||||
func serviceLoop(r <-chan svc.ChangeRequest, changes chan<- svc.Status, appTermCh chan error, svcLoopTermCh chan error) {
|
||||
for {
|
||||
select {
|
||||
case <-svcLoopTermCh:
|
||||
return
|
||||
case c := <-r:
|
||||
switch c.Cmd {
|
||||
case svc.Interrogate:
|
||||
changes <- c.CurrentStatus
|
||||
// Testing deadlock from https://code.google.com/p/winsvc/issues/detail?id=4
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
changes <- c.CurrentStatus
|
||||
case svc.Stop, svc.Shutdown:
|
||||
elog.Info(windowsEventLogID, fmt.Sprintf("Stopping %s service, %v", svcName, c.Context))
|
||||
appTermCh <- errors.New("stopping service")
|
||||
case svc.Pause:
|
||||
elog.Info(windowsEventLogID, "ignoring pause command from Windows service control, not supported")
|
||||
changes <- svc.Status{State: svc.Paused, Accepts: svcCommandsAccepted}
|
||||
case svc.Continue:
|
||||
elog.Info(windowsEventLogID, "ignoring continue command from Windows service control, not supported")
|
||||
changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
|
||||
default:
|
||||
elog.Error(windowsEventLogID, fmt.Sprintf("unexpected control request #%d", c))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_system_log_monitor
|
||||
// +build !disable_system_log_monitor
|
||||
|
||||
/*
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_custom_plugin_monitor
|
||||
// +build !disable_custom_plugin_monitor
|
||||
|
||||
/*
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_system_log_monitor
|
||||
// +build !disable_system_log_monitor
|
||||
|
||||
/*
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_system_stats_monitor
|
||||
// +build !disable_system_stats_monitor
|
||||
|
||||
/*
|
||||
|
|
|
@ -43,6 +43,10 @@ type NodeProblemDetectorOptions struct {
|
|||
ServerPort int
|
||||
// ServerAddress is the address to bind the node problem detector server.
|
||||
ServerAddress string
|
||||
// QPS is the maximum QPS to the master from client.
|
||||
QPS float32
|
||||
// Burst is the maximum burst for throttle.
|
||||
Burst int
|
||||
|
||||
// exporter options
|
||||
|
||||
|
@ -61,6 +65,10 @@ type NodeProblemDetectorOptions struct {
|
|||
APIServerWaitInterval time.Duration
|
||||
// K8sExporterHeartbeatPeriod is the period at which the k8s exporter does forcibly sync with apiserver.
|
||||
K8sExporterHeartbeatPeriod time.Duration
|
||||
// K8sExporterWriteEvents determines whether to write Kubernetes Events for problems.
|
||||
K8sExporterWriteEvents bool
|
||||
// K8sExporterUpdateNodeConditions determines whether to update Kubernetes Node Conditions for problems.
|
||||
K8sExporterUpdateNodeConditions bool
|
||||
|
||||
// prometheusExporter options
|
||||
// PrometheusServerPort is the port to bind the Prometheus scrape endpoint. Use 0 to disable.
|
||||
|
@ -113,6 +121,8 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) {
|
|||
fs.DurationVar(&npdo.APIServerWaitTimeout, "apiserver-wait-timeout", time.Duration(5)*time.Minute, "The timeout on waiting for kube-apiserver to be ready. This is ignored if --enable-k8s-exporter is false.")
|
||||
fs.DurationVar(&npdo.APIServerWaitInterval, "apiserver-wait-interval", time.Duration(5)*time.Second, "The interval between the checks on the readiness of kube-apiserver. This is ignored if --enable-k8s-exporter is false.")
|
||||
fs.DurationVar(&npdo.K8sExporterHeartbeatPeriod, "k8s-exporter-heartbeat-period", 5*time.Minute, "The period at which k8s-exporter does forcibly sync with apiserver.")
|
||||
fs.BoolVar(&npdo.K8sExporterWriteEvents, "k8s-exporter-write-events", true, "Whether to write Kubernetes Event objects with event details.")
|
||||
fs.BoolVar(&npdo.K8sExporterUpdateNodeConditions, "k8s-exporter-update-node-conditions", true, "Whether to update Kubernetes Node conditions with event details.")
|
||||
fs.BoolVar(&npdo.PrintVersion, "version", false, "Print version information and quit")
|
||||
fs.StringVar(&npdo.HostnameOverride, "hostname-override",
|
||||
"", "Custom node name used to override hostname")
|
||||
|
@ -125,6 +135,8 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) {
|
|||
20257, "The port to bind the Prometheus scrape endpoint. Prometheus exporter is enabled by default at port 20257. Use 0 to disable.")
|
||||
fs.StringVar(&npdo.PrometheusServerAddress, "prometheus-address",
|
||||
"127.0.0.1", "The address to bind the Prometheus scrape endpoint.")
|
||||
fs.Float32Var(&npdo.QPS, "kube-api-qps", 500, "Maximum QPS to use while talking with Kubernetes API")
|
||||
fs.IntVar(&npdo.Burst, "kube-api-burst", 500, "Maximum burst for throttle while talking with Kubernetes API")
|
||||
for _, exporterName := range exporters.GetExporterNames() {
|
||||
exporterHandler := exporters.GetExporterHandlerOrDie(exporterName)
|
||||
exporterHandler.Options.SetFlags(fs)
|
||||
|
|
|
@ -31,7 +31,7 @@
|
|||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "Kerneloops",
|
||||
"reason": "KernelOops",
|
||||
"pattern": "System encountered a non-fatal error in \\S+"
|
||||
}
|
||||
]
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"plugin": "filelog",
|
||||
"pluginConfig": {
|
||||
"timestamp": "^.{15}",
|
||||
"message": "(?i)Currently unreadable.*sectors|(?i)Offline uncorrectable sectors",
|
||||
"timestampFormat": "Jan _2 15:04:05"
|
||||
},
|
||||
"logPath": "/var/log/messages",
|
||||
"lookback": "10h",
|
||||
"bufferSize": 1,
|
||||
"source": "disk-monitor",
|
||||
"skipList": [ " audit:", " audit[" ],
|
||||
"conditions": [
|
||||
{
|
||||
"type": "DiskBadBlock",
|
||||
"reason": "DiskBadBlock",
|
||||
"message": "Disk no bad block"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "DiskBadBlock",
|
||||
"reason": "DiskBadBlock",
|
||||
"pattern": ".*([1-9]\\d{2,}) (Currently unreadable.*sectors|Offline uncorrectable sectors).*"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -25,6 +25,7 @@
|
|||
"--component=kubelet",
|
||||
"--enable-repair=true",
|
||||
"--cooldown-time=1m",
|
||||
"--loopback-time=0",
|
||||
"--health-check-timeout=10s"
|
||||
],
|
||||
"timeout": "3m"
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "86400s",
|
||||
"timeout": "5s",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 1
|
||||
},
|
||||
"source": "iptables-mode-monitor",
|
||||
"metricsReporting": true,
|
||||
"conditions": [],
|
||||
"rules": [
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "IPTablesVersionsMismatch",
|
||||
"path": "./config/plugin/iptables_mode.sh",
|
||||
"timeout": "5s"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -42,12 +42,6 @@
|
|||
"reason": "KernelOops",
|
||||
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "AUFSUmountHung",
|
||||
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
|
|
|
@ -12,9 +12,14 @@
|
|||
"message": "kernel has no deadlock"
|
||||
},
|
||||
{
|
||||
"type": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsNotReadOnly",
|
||||
"message": "Filesystem is not read-only"
|
||||
"type": "XfsShutdown",
|
||||
"reason": "XfsHasNotShutDown",
|
||||
"message": "XFS has not shutdown"
|
||||
},
|
||||
{
|
||||
"type": "CperHardwareErrorFatal",
|
||||
"reason": "CperHardwareHasNoFatalError",
|
||||
"message": "UEFI CPER has no fatal error"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
|
@ -58,28 +63,38 @@
|
|||
"reason": "IOError",
|
||||
"pattern": "Buffer I/O error .*"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "XfsShutdown",
|
||||
"reason": "XfsHasShutdown",
|
||||
"pattern": "XFS .* Shutting down filesystem.?"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "MemoryReadError",
|
||||
"pattern": "CE memory read error .*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "CperHardwareErrorCorrected",
|
||||
"pattern": ".*\\[Hardware Error\\]: event severity: corrected$"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "CperHardwareErrorRecoverable",
|
||||
"pattern": ".*\\[Hardware Error\\]: event severity: recoverable$"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "AUFSUmountHung",
|
||||
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
|
||||
"condition": "CperHardwareErrorFatal",
|
||||
"reason": "CperHardwareErrorFatal",
|
||||
"pattern": ".*\\[Hardware Error\\]: event severity: fatal$"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "DockerHung",
|
||||
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsReadOnly",
|
||||
"pattern": "Remounting filesystem read-only"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
{
|
||||
"net": {
|
||||
"excludeInterfaceRegexp": "^(cali|tunl|veth)",
|
||||
"metricsConfigs": {
|
||||
"net/rx_bytes": {
|
||||
"displayName": "net/rx_bytes"
|
||||
|
|
|
@ -20,8 +20,7 @@ if systemctl -q is-active "$SERVICE"; then
|
|||
echo "$SERVICE is running"
|
||||
exit $OK
|
||||
else
|
||||
# Does not differenciate stopped/failed service from non-existent
|
||||
# Does not differentiate stopped/failed service from non-existent
|
||||
echo "$SERVICE is not running"
|
||||
exit $NONOK
|
||||
fi
|
||||
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash

# Reports whether iptables rules are present in both the "legacy" and the
# "nft" backend at the same time.
#
# As of iptables 1.8, the iptables command line clients come in two different versions/modes: "legacy",
# which uses the kernel iptables API just like iptables 1.6 and earlier did, and "nft", which translates
# the iptables command-line API into the kernel nftables API.
# Because they connect to two different subsystems in the kernel, you cannot mix rules from different versions.
# Ref: https://github.com/kubernetes-sigs/iptables-wrappers
#
# The human-readable message is written to stdout and the result is reported
# through the exit status (OK / NONOK / UNKNOWN below), the same way the other
# plugin scripts in this repository use `exit $OK` / `exit $NONOK`.

readonly OK=0
readonly NONOK=1
readonly UNKNOWN=2

# based on: https://github.com/kubernetes-sigs/iptables-wrappers/blob/97b01f43a8e8db07840fc4b95e833a37c0d36b12/iptables-wrapper-installer.sh
# Count the rule lines (those starting with '-') dumped by each backend.
# Missing binaries or failing commands are tolerated and count as zero rules.
readonly num_legacy_lines=$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep -c '^-' || true)
# The nft dump is bounded by a 5 second timeout.
readonly num_nft_lines=$( (timeout 5 sh -c "iptables-nft-save; ip6tables-nft-save" || true) 2>/dev/null | grep -c '^-' || true)


if [ "$num_legacy_lines" -gt 0 ] && [ "$num_nft_lines" -gt 0 ]; then
  # Rules exist in both backends: this is the problem being detected.
  echo "Found rules from both versions, iptables-legacy: ${num_legacy_lines} iptables-nft: ${num_nft_lines}"
  exit $NONOK
elif [ "$num_legacy_lines" -gt 0 ] && [ "$num_nft_lines" -eq 0 ]; then
  echo "Using iptables-legacy: ${num_legacy_lines} rules"
  exit $OK
elif [ "$num_legacy_lines" -eq 0 ] && [ "$num_nft_lines" -gt 0 ]; then
  echo "Using iptables-nft: ${num_nft_lines} rules"
  exit $OK
else
  # Neither backend reported any rules, so the mode cannot be determined.
  echo "No iptables rules found"
  exit $UNKNOWN
fi
|
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
"plugin": "kmsg",
|
||||
"logPath": "/dev/kmsg",
|
||||
"lookback": "5m",
|
||||
"bufferSize": 10,
|
||||
"source": "readonly-monitor",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsNotReadOnly",
|
||||
"message": "Filesystem is not read-only"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsReadOnly",
|
||||
"pattern": "Remounting filesystem read-only"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -44,6 +44,9 @@
|
|||
"disk/bytes_used": {
|
||||
"displayName": "disk/bytes_used"
|
||||
},
|
||||
"disk/percent_used": {
|
||||
"displayName": "disk/percent_used"
|
||||
},
|
||||
"disk/io_time": {
|
||||
"displayName": "disk/io_time"
|
||||
},
|
||||
|
@ -88,6 +91,9 @@
|
|||
},
|
||||
"memory/unevictable_used": {
|
||||
"displayName": "memory/unevictable_used"
|
||||
},
|
||||
"memory/percent_used": {
|
||||
"displayName": "memory/percent_used"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
|
|
@ -37,7 +37,8 @@
|
|||
"--lookback=20m",
|
||||
"--delay=5m",
|
||||
"--count=5",
|
||||
"--pattern=Started Kubernetes kubelet."
|
||||
"--pattern=Started (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet).",
|
||||
"--revert-pattern=Stopping (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet)..."
|
||||
],
|
||||
"timeout": "1m"
|
||||
},
|
||||
|
@ -51,7 +52,8 @@
|
|||
"--log-path=/var/log/journal",
|
||||
"--lookback=20m",
|
||||
"--count=5",
|
||||
"--pattern=Starting Docker Application Container Engine..."
|
||||
"--pattern=Starting (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)...",
|
||||
"--revert-pattern=Stopping (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)..."
|
||||
],
|
||||
"timeout": "1m"
|
||||
},
|
||||
|
@ -65,7 +67,8 @@
|
|||
"--log-path=/var/log/journal",
|
||||
"--lookback=20m",
|
||||
"--count=5",
|
||||
"--pattern=Starting containerd container runtime..."
|
||||
"--pattern=Starting (containerd container runtime|containerd.service|containerd.service - containerd container runtime)...",
|
||||
"--revert-pattern=Stopping (containerd container runtime|containerd.service|containerd.service - containerd container runtime)..."
|
||||
],
|
||||
"timeout": "1m"
|
||||
}
|
||||
|
|
|
@ -13,17 +13,17 @@
|
|||
{
|
||||
"type": "temporary",
|
||||
"reason": "KubeletStart",
|
||||
"pattern": "Started Kubernetes kubelet."
|
||||
"pattern": "Started (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet)."
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "DockerStart",
|
||||
"pattern": "Starting Docker Application Container Engine..."
|
||||
"pattern": "Starting (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)..."
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "ContainerdStart",
|
||||
"pattern": "Starting containerd container runtime..."
|
||||
"pattern": "Starting (containerd container runtime|containerd.service|containerd.service - containerd container runtime)..."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ Restart=always
|
|||
RestartSec=10
|
||||
ExecStart=/home/kubernetes/bin/node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false \
|
||||
--exporter.stackdriver=/home/kubernetes/node-problem-detector/config/exporter/stackdriver-exporter.json \
|
||||
--config.system-log-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor.json,/home/kubernetes/node-problem-detector/config/docker-monitor.json,/home/kubernetes/node-problem-detector/config/systemd-monitor.json \
|
||||
--config.system-log-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor.json,/home/kubernetes/node-problem-detector/config/readonly-monitor.json,/home/kubernetes/node-problem-detector/config/docker-monitor.json,/home/kubernetes/node-problem-detector/config/systemd-monitor.json \
|
||||
--config.custom-plugin-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor-counter.json,/home/kubernetes/node-problem-detector/config/systemd-monitor-counter.json \
|
||||
--config.system-stats-monitor=/home/kubernetes/node-problem-detector/config/system-stats-monitor.json,/home/kubernetes/node-problem-detector/config/net-cgroup-system-stats-monitor.json
|
||||
|
||||
|
|
|
@ -20,6 +20,11 @@
|
|||
"type": "temporary",
|
||||
"reason": "CorruptContainerImageLayer",
|
||||
"pattern": ".*failed to pull and unpack image.*failed to extract layer.*archive/tar: invalid tar header.*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "HCSEmptyLayerchain",
|
||||
"pattern": ".*Failed to unmarshall layerchain json - invalid character '\\x00' looking for beginning of value*"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
{
|
||||
"type": "temporary",
|
||||
"reason": "WindowsDefenderThreatsDetected",
|
||||
"path": "./config/plugin/windows_defender_problem.ps1",
|
||||
"path": "C:\\etc\\kubernetes\\node-problem-detector\\config\\plugin\\windows_defender_problem.ps1",
|
||||
"timeout": "3s"
|
||||
}
|
||||
]
|
||||
|
|
|
@ -44,6 +44,9 @@
|
|||
"disk/bytes_used": {
|
||||
"displayName": "disk/bytes_used"
|
||||
},
|
||||
"disk/percent_used": {
|
||||
"displayName": "disk/percent_used"
|
||||
},
|
||||
"disk/io_time": {
|
||||
"displayName": "disk/io_time"
|
||||
},
|
||||
|
@ -88,6 +91,9 @@
|
|||
},
|
||||
"memory/unevictable_used": {
|
||||
"displayName": "memory/unevictable_used"
|
||||
},
|
||||
"memory/percent_used": {
|
||||
"displayName": "memory/percent_used"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -50,12 +50,6 @@ data:
|
|||
"reason": "MemoryReadError",
|
||||
"pattern": "CE memory read error .*"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "AUFSUmountHung",
|
||||
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
|
@ -70,6 +64,30 @@ data:
|
|||
}
|
||||
]
|
||||
}
|
||||
readonly-monitor.json: |
|
||||
{
|
||||
"plugin": "kmsg",
|
||||
"logPath": "/dev/kmsg",
|
||||
"lookback": "5m",
|
||||
"bufferSize": 10,
|
||||
"source": "readonly-monitor",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsNotReadOnly",
|
||||
"message": "Filesystem is not read-only"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsReadOnly",
|
||||
"pattern": "Remounting filesystem read-only"
|
||||
}
|
||||
]
|
||||
}
|
||||
docker-monitor.json: |
|
||||
{
|
||||
"plugin": "journald",
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app: node-problem-detector
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: node-problem-detector
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: node-problem-detector
|
||||
spec:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/os
|
||||
operator: In
|
||||
values:
|
||||
- linux
|
||||
containers:
|
||||
- name: node-problem-detector
|
||||
command:
|
||||
- /node-problem-detector
|
||||
- --logtostderr
|
||||
- --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json,/config/docker-monitor.json
|
||||
- --config.custom-plugin-monitor=/config/health-checker-kubelet.json
|
||||
image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19
|
||||
resources:
|
||||
limits:
|
||||
cpu: 10m
|
||||
memory: 80Mi
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 80Mi
|
||||
imagePullPolicy: Always
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
volumeMounts:
|
||||
- name: log
|
||||
mountPath: /var/log
|
||||
readOnly: true
|
||||
- name: kmsg
|
||||
mountPath: /dev/kmsg
|
||||
readOnly: true
|
||||
# Make sure node problem detector is in the same timezone
|
||||
# with the host.
|
||||
- name: localtime
|
||||
mountPath: /etc/localtime
|
||||
readOnly: true
|
||||
- name: config
|
||||
mountPath: /config
|
||||
readOnly: true
|
||||
- mountPath: /etc/machine-id
|
||||
name: machine-id
|
||||
readOnly: true
|
||||
- mountPath: /run/systemd/system
|
||||
name: systemd
|
||||
- mountPath: /var/run/dbus/
|
||||
name: dbus
|
||||
mountPropagation: Bidirectional
|
||||
volumes:
|
||||
- name: log
|
||||
# Config `log` to your system log directory
|
||||
hostPath:
|
||||
path: /var/log/
|
||||
- name: kmsg
|
||||
hostPath:
|
||||
path: /dev/kmsg
|
||||
- name: localtime
|
||||
hostPath:
|
||||
path: /etc/localtime
|
||||
- name: config
|
||||
configMap:
|
||||
name: node-problem-detector-config
|
||||
items:
|
||||
- key: kernel-monitor.json
|
||||
path: kernel-monitor.json
|
||||
- key: readonly-monitor.json
|
||||
path: readonly-monitor.json
|
||||
- key: docker-monitor.json
|
||||
path: docker-monitor.json
|
||||
- name: machine-id
|
||||
hostPath:
|
||||
path: /etc/machine-id
|
||||
type: "File"
|
||||
- name: systemd
|
||||
hostPath:
|
||||
path: /run/systemd/system/
|
||||
type: ""
|
||||
- name: dbus
|
||||
hostPath:
|
||||
path: /var/run/dbus/
|
||||
type: ""
|
|
@ -28,8 +28,8 @@ spec:
|
|||
command:
|
||||
- /node-problem-detector
|
||||
- --logtostderr
|
||||
- --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
|
||||
image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7
|
||||
- --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json,/config/docker-monitor.json
|
||||
image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19
|
||||
resources:
|
||||
limits:
|
||||
cpu: 10m
|
||||
|
@ -60,6 +60,7 @@ spec:
|
|||
- name: config
|
||||
mountPath: /config
|
||||
readOnly: true
|
||||
serviceAccountName: node-problem-detector
|
||||
volumes:
|
||||
- name: log
|
||||
# Config `log` to your system log directory
|
||||
|
@ -77,6 +78,8 @@ spec:
|
|||
items:
|
||||
- key: kernel-monitor.json
|
||||
path: kernel-monitor.json
|
||||
- key: readonly-monitor.json
|
||||
path: readonly-monitor.json
|
||||
- key: docker-monitor.json
|
||||
path: docker-monitor.json
|
||||
tolerations:
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: npd-binding
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: system:node-problem-detector
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
|
@ -1,9 +1,62 @@
|
|||
# Custom Plugin Monitor
|
||||
|
||||
## Configuration
|
||||
|
||||
### Plugin Config
|
||||
|
||||
* `invoke_interval`: Interval at which custom plugins will be invoked.
|
||||
* `timeout`: Time after which custom plugins invokation will be terminated and considered timeout.
|
||||
* `timeout`: Time after which a custom plugin invocation will be terminated and considered timed out.
|
||||
* `max_output_length`: The maximum length of a custom plugin's standard output that NPD will keep; longer output is cut to this size before being used as the condition status message.
|
||||
* `concurrency`: The plugin worker number, i.e., how many custom plugins will be invoked concurrently.
|
||||
* `enable_message_change_based_condition_update`: Flag controls whether message change should result in a condition update.
|
||||
* `enable_message_change_based_condition_update`: Flag controls whether message change should result in a condition update.
|
||||
* `skip_initial_status`: Flag controls whether condition will be emitted during plugin initialization.
|
||||
|
||||
### Annotated Plugin Configuration Example
|
||||
|
||||
```
|
||||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "30s",
|
||||
"timeout": "5s",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 3,
|
||||
"enable_message_change_based_condition_update": false
|
||||
},
|
||||
"source": "ntp-custom-plugin-monitor",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "NTPProblem",
|
||||
"reason": "NTPIsUp", // This is the default reason shown when healthy
|
||||
"message": "ntp service is up" // This is the default message shown when healthy
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "temporary", // These are not shown unless there's an
|
||||
// event so they always relate to a problem.
|
||||
// There are no defaults since there is nothing
|
||||
// to show unless there's a problem.
|
||||
"reason": "NTPIsDown", // This is the reason shown for this event
|
||||
// and the message shown comes from stdout.
|
||||
"path": "./config/plugin/check_ntp.sh",
|
||||
"timeout": "3s"
|
||||
},
|
||||
{
|
||||
"type": "permanent", // These are permanent and are shown in the Conditions section
|
||||
// when running `kubectl describe node ...`
|
||||
// They have default values shown above in the conditions section
|
||||
// and also a reason for each specific trigger listed in this rules section.
|
||||
// Message will come from default for healthy times
|
||||
// and during unhealthy time message comes from stdout of the check.
|
||||
|
||||
"condition": "NTPProblem", // This is the key to connect to the corresponding condition listed above
|
||||
"reason": "NTPIsDown", // and the reason shown for failures detected in this rule
|
||||
// and message will be from stdout of the check.
|
||||
"path": "./config/plugin/check_ntp.sh",
|
||||
"timeout": "3s"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
|
|
@ -4,6 +4,12 @@ These are notes to help follow a consistent release process. See something
|
|||
important missing? Please submit a pull request to add anything else that would
|
||||
be useful!
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Ensure access to the container image [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector).
|
||||
Add your email to the `k8s-infra-staging-npd` group in the sig-node [groups.yaml](https://github.com/kubernetes/k8s.io/blob/main/groups/sig-node/groups.yaml).
|
||||
See example https://github.com/kubernetes/k8s.io/pull/1599.
|
||||
|
||||
## Preparing for a release
|
||||
|
||||
There are a few steps that should be taken prior to creating the actual release
|
||||
|
@ -11,37 +17,100 @@ itself.
|
|||
|
||||
1. Collect changes since last release. This can be done by looking directly at
|
||||
merged commit messages (``git log [last_release_tag]...HEAD``), or by
|
||||
viewing the changes on GitHub ([example:
|
||||
https://github.com/kubernetes/node-problem-detector/compare/v0.8.6...master](https://github.com/kubernetes/node-problem-detector/compare/v0.8.6...master)).
|
||||
viewing the changes on GitHub (example: https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...master).
|
||||
|
||||
1. Based on the changes to be included in the release, determine what the next
|
||||
2. Based on the changes to be included in the release, determine what the next
|
||||
release number should be. We strive to follow [SemVer](https://semver.org/)
|
||||
as much as possible.
|
||||
|
||||
1. Update [CHANGELOG](https://github.com/kubernetes/node-problem-detector/blob/master/CHANGELOG.md)
|
||||
3. Update [CHANGELOG](https://github.com/kubernetes/node-problem-detector/blob/master/CHANGELOG.md)
|
||||
with all significant changes.
|
||||
|
||||
## Create release
|
||||
|
||||
Once changes have been merged to the CHANGELOG, perform the actual release via
|
||||
GitHub. When creating the release, make sure to include the following in the
|
||||
body of the release:
|
||||
### Create the new version tag
|
||||
|
||||
#### Option 1
|
||||
```
|
||||
# Use v0.8.17 as an example.
|
||||
git clone git@github.com:kubernetes/node-problem-detector.git
|
||||
cd node-problem-detector/
|
||||
git tag v0.8.17
|
||||
git push origin v0.8.17
|
||||
```
|
||||
|
||||
#### Option 2
|
||||
Update [version.txt](https://github.com/kubernetes/node-problem-detector/blob/master/version.txt)
|
||||
(example https://github.com/kubernetes/node-problem-detector/pull/869).
|
||||
|
||||
### Build and push artifacts
|
||||
This step builds NPD into container images and tar files.
|
||||
- The container image is pushed to the [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector).
|
||||
You will promote the new image to registry.k8s.io later.
|
||||
- The tar files are generated locally. You will upload those to GitHub in the
|
||||
release note later.
|
||||
|
||||
**Note: You need the access mentioned in the [prerequisites](#prerequisites)
|
||||
section to perform steps in this section.**
|
||||
|
||||
```
|
||||
# One-time setup
|
||||
sudo apt-get install libsystemd-dev gcc-aarch64-linux-gnu
|
||||
|
||||
cd node-problem-detector
|
||||
make release
|
||||
|
||||
# Get SHA256 of the tar files. For example
|
||||
sha256sum node-problem-detector-v0.8.17-linux_amd64.tar.gz
|
||||
sha256sum node-problem-detector-v0.8.17-linux_arm64.tar.gz
|
||||
sha256sum node-problem-detector-v0.8.17-windows_amd64.tar.gz
|
||||
|
||||
# Get MD5 of the tar files. For example
|
||||
md5sum node-problem-detector-v0.8.17-linux_amd64.tar.gz
|
||||
md5sum node-problem-detector-v0.8.17-linux_arm64.tar.gz
|
||||
md5sum node-problem-detector-v0.8.17-windows_amd64.tar.gz
|
||||
|
||||
# Verify container image in staging registry and get SHA256.
|
||||
docker pull gcr.io/k8s-staging-npd/node-problem-detector:v0.8.17
|
||||
docker image ls gcr.io/k8s-staging-npd/node-problem-detector --digests
|
||||
```
|
||||
|
||||
### Promote new NPD image to registry.k8s.io
|
||||
1. Get the SHA256 from the new NPD image from the [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector)
|
||||
or previous step.
|
||||
2. Promote the NPD image to registry.k8s.io ([images.yaml](https://github.com/kubernetes/k8s.io/blob/main/registry.k8s.io/images/k8s-staging-npd/images.yaml), example https://github.com/kubernetes/k8s.io/pull/6523).
|
||||
3. Verify the container image.
|
||||
```
|
||||
docker pull registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17
|
||||
docker image ls registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17
|
||||
```
|
||||
|
||||
### Create the release note
|
||||
|
||||
Go to https://github.com/kubernetes/node-problem-detector/releases, draft a new
|
||||
release note and publish. Make sure to include the following in the body of the
|
||||
release note:
|
||||
|
||||
1. For convenience, add a link to easily view the changes since the last
|
||||
release (e.g.
|
||||
[https://github.com/kubernetes/node-problem-detector/compare/v0.8.5...v0.8.6](https://github.com/kubernetes/node-problem-detector/compare/v0.8.5...v0.8.6)).
|
||||
[https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...v0.8.17](https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...v0.8.17)).
|
||||
|
||||
1. There is no need to duplicate everything from the CHANGELOG, but include the
|
||||
2. There is no need to duplicate everything from the CHANGELOG, but include the
|
||||
most significant things so someone just viewing the release entry will have
|
||||
an idea of what it includes.
|
||||
|
||||
1. Provide a link to the new image release (e.g. `Image:
|
||||
k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.6`)
|
||||
3. Provide a link to the new image release (e.g. `Image:
|
||||
registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17`)
|
||||
|
||||
4. Upload the tar files built from the [previous step](#build-and-push-artifacts),
|
||||
and include the SHA and MD5.
|
||||
|
||||
## Post release steps
|
||||
|
||||
1. Update image version in
|
||||
[deployment/node-problem-detector.yaml](https://github.com/kubernetes/node-problem-detector/blob/422c088d623488be33aa697588655440c4e6a063/deployment/node-problem-detector.yaml#L32).
|
||||
1. Update image version in node-problem-detector repo, so anyone deploying
|
||||
directly from the repo deployment file will get the newest image deployed.
|
||||
Example https://github.com/kubernetes/node-problem-detector/pull/897.
|
||||
|
||||
Update the image version in the deployment file so anyone deploying directly
|
||||
from the repo deployment file will get the newest image deployed.
|
||||
2. Update the NPD version in [kubernetes/kubernetes](https://github.com/kubernetes/kubernetes)
|
||||
repo, so that kubernetes clusters use the new NPD version. Example
|
||||
https://github.com/kubernetes/kubernetes/pull/123740.
|
||||
|
|
135
go.mod
135
go.mod
|
@ -1,41 +1,110 @@
|
|||
module k8s.io/node-problem-detector
|
||||
|
||||
go 1.15
|
||||
go 1.24.2
|
||||
|
||||
require (
|
||||
cloud.google.com/go v0.45.1
|
||||
code.cloudfoundry.org/clock v0.0.0-20180518195852-02e53af36e6c
|
||||
contrib.go.opencensus.io/exporter/prometheus v0.0.0-20190427222117-f6cda26f80a3
|
||||
contrib.go.opencensus.io/exporter/stackdriver v0.13.4
|
||||
github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705 // indirect
|
||||
github.com/avast/retry-go v2.4.1+incompatible
|
||||
github.com/cobaugh/osrelease v0.0.0-20181218015638-a93a0a55a249
|
||||
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e
|
||||
cloud.google.com/go/compute/metadata v0.6.0
|
||||
contrib.go.opencensus.io/exporter/prometheus v0.4.2
|
||||
contrib.go.opencensus.io/exporter/stackdriver v0.13.14
|
||||
github.com/acobaugh/osrelease v0.1.0
|
||||
github.com/avast/retry-go/v4 v4.6.1
|
||||
github.com/coreos/go-systemd/v22 v22.5.0
|
||||
github.com/euank/go-kmsg-parser v2.0.0+incompatible
|
||||
github.com/go-ole/go-ole v1.2.4 // indirect
|
||||
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
|
||||
github.com/google/cadvisor v0.36.0
|
||||
github.com/hpcloud/tail v1.0.0
|
||||
github.com/onsi/ginkgo v1.10.3
|
||||
github.com/onsi/gomega v1.7.1
|
||||
github.com/pborman/uuid v1.2.0
|
||||
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4
|
||||
github.com/prometheus/common v0.4.1
|
||||
github.com/prometheus/procfs v0.2.0
|
||||
github.com/shirou/gopsutil v2.19.12+incompatible
|
||||
github.com/spf13/pflag v1.0.5
|
||||
github.com/stretchr/testify v1.6.1
|
||||
github.com/tedsuo/ifrit v0.0.0-20180802180643-bea94bb476cc // indirect
|
||||
go.opencensus.io v0.22.4
|
||||
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45
|
||||
golang.org/x/sys v0.0.0-20201211090839-8ad439b19e0f
|
||||
google.golang.org/api v0.10.0
|
||||
k8s.io/api v0.0.0-20190816222004-e3a6b8045b0b
|
||||
k8s.io/apimachinery v0.0.0-20190816221834-a9f1d8a9c101
|
||||
k8s.io/client-go v11.0.1-0.20190805182717-6502b5e7b1b5+incompatible
|
||||
k8s.io/heapster v0.0.0-20180704153620-b25f8a16208f
|
||||
k8s.io/kubernetes v1.14.6
|
||||
k8s.io/test-infra v0.0.0-20190914015041-e1cbc3ccd91c
|
||||
github.com/prometheus/client_model v0.6.2
|
||||
github.com/prometheus/common v0.63.0
|
||||
github.com/prometheus/procfs v0.16.1
|
||||
github.com/shirou/gopsutil/v3 v3.24.5
|
||||
github.com/spf13/pflag v1.0.6
|
||||
github.com/stretchr/testify v1.10.0
|
||||
go.opencensus.io v0.24.0
|
||||
golang.org/x/sys v0.32.0
|
||||
google.golang.org/api v0.230.0
|
||||
k8s.io/api v0.33.0
|
||||
k8s.io/apimachinery v0.33.0
|
||||
k8s.io/client-go v0.33.0
|
||||
k8s.io/klog/v2 v2.130.1
|
||||
k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e
|
||||
)
|
||||
|
||||
replace git.apache.org/thrift.git => github.com/apache/thrift v0.0.0-20180902110319-2566ecd5d999
|
||||
require (
|
||||
cloud.google.com/go/auth v0.16.0 // indirect
|
||||
cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
|
||||
cloud.google.com/go/monitoring v1.20.3 // indirect
|
||||
cloud.google.com/go/trace v1.10.11 // indirect
|
||||
github.com/aws/aws-sdk-go v1.44.72 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/fsnotify/fsnotify v1.6.0 // indirect
|
||||
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
|
||||
github.com/go-kit/log v0.2.1 // indirect
|
||||
github.com/go-logfmt/logfmt v0.5.1 // indirect
|
||||
github.com/go-logr/logr v1.4.2 // indirect
|
||||
github.com/go-logr/stdr v1.2.2 // indirect
|
||||
github.com/go-ole/go-ole v1.2.6 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.21.0 // indirect
|
||||
github.com/go-openapi/jsonreference v0.20.2 // indirect
|
||||
github.com/go-openapi/swag v0.23.0 // indirect
|
||||
github.com/gogo/protobuf v1.3.2 // indirect
|
||||
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
|
||||
github.com/golang/protobuf v1.5.4 // indirect
|
||||
github.com/google/gnostic-models v0.6.9 // indirect
|
||||
github.com/google/go-cmp v0.7.0 // indirect
|
||||
github.com/google/s2a-go v0.1.9 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect
|
||||
github.com/googleapis/gax-go/v2 v2.14.1 // indirect
|
||||
github.com/jmespath/go-jmespath v0.4.0 // indirect
|
||||
github.com/josharian/intern v1.0.0 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/klauspost/compress v1.17.9 // indirect
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
|
||||
github.com/mailru/easyjson v0.7.7 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
|
||||
github.com/prometheus/client_golang v1.20.4 // indirect
|
||||
github.com/prometheus/prometheus v0.35.0 // indirect
|
||||
github.com/prometheus/statsd_exporter v0.22.7 // indirect
|
||||
github.com/shoenig/go-m1cpu v0.1.6 // indirect
|
||||
github.com/tklauser/go-sysconf v0.3.12 // indirect
|
||||
github.com/tklauser/numcpus v0.6.1 // indirect
|
||||
github.com/x448/float16 v0.8.4 // indirect
|
||||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
|
||||
go.opentelemetry.io/otel v1.35.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.35.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.35.0 // indirect
|
||||
golang.org/x/crypto v0.37.0 // indirect
|
||||
golang.org/x/net v0.39.0 // indirect
|
||||
golang.org/x/oauth2 v0.29.0 // indirect
|
||||
golang.org/x/sync v0.13.0 // indirect
|
||||
golang.org/x/term v0.31.0 // indirect
|
||||
golang.org/x/text v0.24.0 // indirect
|
||||
golang.org/x/time v0.11.0 // indirect
|
||||
google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf // indirect
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250218202821-56aae31c358a // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e // indirect
|
||||
google.golang.org/grpc v1.72.0 // indirect
|
||||
google.golang.org/protobuf v1.36.6 // indirect
|
||||
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
|
||||
gopkg.in/fsnotify.v1 v1.4.7 // indirect
|
||||
gopkg.in/inf.v0 v0.9.1 // indirect
|
||||
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
|
||||
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
|
||||
sigs.k8s.io/randfill v1.0.0 // indirect
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect
|
||||
sigs.k8s.io/yaml v1.4.0 // indirect
|
||||
)
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2024 The Kubernetes Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -o errexit
|
||||
set -o nounset
|
||||
set -o pipefail
|
||||
|
||||
VERSION="$1"
|
||||
|
||||
NPD_LINUX_AMD64=node-problem-detector-${VERSION}-linux_amd64.tar.gz
|
||||
NPD_LINUX_ARM64=node-problem-detector-${VERSION}-linux_arm64.tar.gz
|
||||
NPD_WINDOWS_AMD64=node-problem-detector-${VERSION}-windows_amd64.tar.gz
|
||||
|
||||
SHA_NPD_LINUX_AMD64=$(sha256sum ${NPD_LINUX_AMD64} | cut -d' ' -f1)
|
||||
SHA_NPD_LINUX_ARM64=$(sha256sum ${NPD_LINUX_ARM64} | cut -d' ' -f1)
|
||||
SHA_NPD_WINDOWS_AMD64=$(sha256sum ${NPD_WINDOWS_AMD64} | cut -d' ' -f1)
|
||||
|
||||
MD5_NPD_LINUX_AMD64=$(md5sum ${NPD_LINUX_AMD64} | cut -d' ' -f1)
|
||||
MD5_NPD_LINUX_ARM64=$(md5sum ${NPD_LINUX_ARM64} | cut -d' ' -f1)
|
||||
MD5_NPD_WINDOWS_AMD64=$(md5sum ${NPD_WINDOWS_AMD64} | cut -d' ' -f1)
|
||||
|
||||
echo
|
||||
echo **${NPD_LINUX_AMD64}**:
|
||||
echo **SHA**: ${SHA_NPD_LINUX_AMD64}
|
||||
echo **MD5**: ${MD5_NPD_LINUX_AMD64}
|
||||
echo
|
||||
echo **${NPD_LINUX_ARM64}**:
|
||||
echo **SHA**: ${SHA_NPD_LINUX_ARM64}
|
||||
echo **MD5**: ${MD5_NPD_LINUX_ARM64}
|
||||
echo
|
||||
echo **${NPD_WINDOWS_AMD64}**:
|
||||
echo **SHA**: ${SHA_NPD_WINDOWS_AMD64}
|
||||
echo **MD5**: ${MD5_NPD_WINDOWS_AMD64}
|
|
@ -0,0 +1,32 @@
|
|||
#!/bin/bash -xe
|
||||
|
||||
# Copyright 2023 The Kubernetes Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
VERSION=$(cat version.txt)
|
||||
|
||||
if [[ ! "${VERSION}" =~ ^v([0-9]+[.][0-9]+)[.]([0-9]+)(-(alpha|beta)[.]([0-9]+))?$ ]]; then
|
||||
# The version check requires a leading "v", so the hint must show it too.
echo "Version ${VERSION} must be 'vX.Y.Z', 'vX.Y.Z-alpha.N', or 'vX.Y.Z-beta.N'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ "$(git tag -l "${VERSION}")" ]; then
|
||||
echo "Tag ${VERSION} already exists"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
git tag -a -m "Release ${VERSION}" "${VERSION}"
|
||||
git push origin "${VERSION}"
|
||||
|
||||
echo "release_tag=refs/tags/${VERSION}" >> $GITHUB_OUTPUT
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright 2024 The Kubernetes Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -o errexit
|
||||
set -o nounset
|
||||
set -o pipefail
|
||||
|
||||
make gomod
|
||||
changes=$(git status --porcelain go.mod go.sum vendor/ tests/e2e/go.mod tests/e2e/go.sum || true)
|
||||
if [ -n "${changes}" ]; then
|
||||
echo "ERROR: go modules are not up to date; please run: make gomod"
|
||||
echo "changed files:"
|
||||
# Keep the newline in the format string: printf does not expand "\n" inside a %s argument.
printf "%s\n" "${changes}"
|
||||
echo "git diff:"
|
||||
git --no-pager diff
|
||||
exit 1
|
||||
fi
|
|
@ -18,10 +18,10 @@ package custompluginmonitor
|
|||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
|
||||
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
|
||||
|
@ -47,7 +47,6 @@ type customPluginMonitor struct {
|
|||
config cpmtypes.CustomPluginConfig
|
||||
conditions []types.Condition
|
||||
plugin *plugin.Plugin
|
||||
resultChan <-chan cpmtypes.Result
|
||||
statusChan chan *types.Status
|
||||
tomb *tomb.Tomb
|
||||
}
|
||||
|
@ -58,27 +57,27 @@ func NewCustomPluginMonitorOrDie(configPath string) types.Monitor {
|
|||
configPath: configPath,
|
||||
tomb: tomb.NewTomb(),
|
||||
}
|
||||
f, err := ioutil.ReadFile(configPath)
|
||||
f, err := os.ReadFile(configPath)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
|
||||
klog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
|
||||
}
|
||||
err = json.Unmarshal(f, &c.config)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
|
||||
klog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
|
||||
}
|
||||
// Apply configurations
|
||||
err = (&c.config).ApplyConfiguration()
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
|
||||
klog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
|
||||
}
|
||||
|
||||
// Validate configurations
|
||||
err = c.config.Validate()
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to validate custom plugin config %+v: %v", c.config, err)
|
||||
klog.Fatalf("Failed to validate custom plugin config %+v: %v", c.config, err)
|
||||
}
|
||||
|
||||
glog.Infof("Finish parsing custom plugin monitor config file %s: %+v", c.configPath, c.config)
|
||||
klog.Infof("Finish parsing custom plugin monitor config file %s: %+v", c.configPath, c.config)
|
||||
|
||||
c.plugin = plugin.NewPlugin(c.config)
|
||||
// A 1000 size channel should be big enough.
|
||||
|
@ -97,32 +96,39 @@ func initializeProblemMetricsOrDie(rules []*cpmtypes.CustomRule) {
|
|||
if rule.Type == types.Perm {
|
||||
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, false)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
|
||||
klog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
|
||||
rule.Condition, rule.Reason, err)
|
||||
}
|
||||
}
|
||||
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 0)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
|
||||
klog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (c *customPluginMonitor) Start() (<-chan *types.Status, error) {
|
||||
glog.Infof("Start custom plugin monitor %s", c.configPath)
|
||||
klog.Infof("Start custom plugin monitor %s", c.configPath)
|
||||
go c.plugin.Run()
|
||||
go c.monitorLoop()
|
||||
return c.statusChan, nil
|
||||
}
|
||||
|
||||
func (c *customPluginMonitor) Stop() {
|
||||
glog.Infof("Stop custom plugin monitor %s", c.configPath)
|
||||
klog.Infof("Stop custom plugin monitor %s", c.configPath)
|
||||
c.tomb.Stop()
|
||||
}
|
||||
|
||||
// monitorLoop is the main loop of customPluginMonitor.
|
||||
// there is one customPluginMonitor, one plugin instance for each configPath.
|
||||
// each runs rules in parallel at pre-configured concurrency, and interval.
|
||||
func (c *customPluginMonitor) monitorLoop() {
|
||||
c.initializeStatus()
|
||||
c.initializeConditions()
|
||||
if *c.config.PluginGlobalConfig.SkipInitialStatus {
|
||||
klog.Infof("Skipping sending initial status. Using default conditions: %+v", c.conditions)
|
||||
} else {
|
||||
c.sendInitialStatus()
|
||||
}
|
||||
|
||||
resultChan := c.plugin.GetResultChan()
|
||||
|
||||
|
@ -130,16 +136,16 @@ func (c *customPluginMonitor) monitorLoop() {
|
|||
select {
|
||||
case result, ok := <-resultChan:
|
||||
if !ok {
|
||||
glog.Errorf("Result channel closed: %s", c.configPath)
|
||||
klog.Errorf("Result channel closed: %s", c.configPath)
|
||||
return
|
||||
}
|
||||
glog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
|
||||
klog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
|
||||
status := c.generateStatus(result)
|
||||
glog.V(3).Infof("New status generated: %+v", status)
|
||||
klog.V(3).Infof("New status generated: %+v", status)
|
||||
c.statusChan <- status
|
||||
case <-c.tomb.Stopping():
|
||||
c.plugin.Stop()
|
||||
glog.Infof("Custom plugin monitor stopped: %s", c.configPath)
|
||||
klog.Infof("Custom plugin monitor stopped: %s", c.configPath)
|
||||
c.tomb.Done()
|
||||
return
|
||||
}
|
||||
|
@ -232,6 +238,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
|
|||
condition.Type,
|
||||
status,
|
||||
newReason,
|
||||
newMessage,
|
||||
timestamp,
|
||||
)
|
||||
|
||||
|
@ -252,7 +259,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
|
|||
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(
|
||||
event.Reason, 1)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to update problem counter metrics for %q: %v",
|
||||
klog.Errorf("Failed to update problem counter metrics for %q: %v",
|
||||
event.Reason, err)
|
||||
}
|
||||
}
|
||||
|
@ -260,7 +267,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
|
|||
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(
|
||||
condition.Type, condition.Reason, condition.Status == types.True)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
|
||||
klog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
|
||||
condition.Type, condition.Reason, err)
|
||||
}
|
||||
}
|
||||
|
@ -273,7 +280,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
|
|||
}
|
||||
// Log only if condition has changed
|
||||
if len(activeProblemEvents) != 0 || len(inactiveProblemEvents) != 0 {
|
||||
glog.V(0).Infof("New status generated: %+v", status)
|
||||
klog.V(0).Infof("New status generated: %+v", status)
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
@ -289,11 +296,9 @@ func toConditionStatus(s cpmtypes.Status) types.ConditionStatus {
|
|||
}
|
||||
}
|
||||
|
||||
// initializeStatus initializes the internal condition and also reports it to the node problem detector.
|
||||
func (c *customPluginMonitor) initializeStatus() {
|
||||
// Initialize the default node conditions
|
||||
c.conditions = initialConditions(c.config.DefaultConditions)
|
||||
glog.Infof("Initialize condition generated: %+v", c.conditions)
|
||||
// sendInitialStatus sends the initial status to the node problem detector.
|
||||
func (c *customPluginMonitor) sendInitialStatus() {
|
||||
klog.Infof("Sending initial status for %s with conditions: %+v", c.config.Source, c.conditions)
|
||||
// Update the initial status
|
||||
c.statusChan <- &types.Status{
|
||||
Source: c.config.Source,
|
||||
|
@ -301,6 +306,12 @@ func (c *customPluginMonitor) initializeStatus() {
|
|||
}
|
||||
}
|
||||
|
||||
// initializeConditions initializes the internal node conditions.
|
||||
func (c *customPluginMonitor) initializeConditions() {
|
||||
c.conditions = initialConditions(c.config.DefaultConditions)
|
||||
klog.Infof("Initialized conditions for %s: %+v", c.configPath, c.conditions)
|
||||
}
|
||||
|
||||
func initialConditions(defaults []types.Condition) []types.Condition {
|
||||
conditions := make([]types.Condition, len(defaults))
|
||||
copy(conditions, defaults)
|
||||
|
|
|
@ -20,14 +20,13 @@ import (
|
|||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/util"
|
||||
"k8s.io/node-problem-detector/pkg/util/tomb"
|
||||
|
@ -61,7 +60,7 @@ func (p *Plugin) GetResultChan() <-chan cpmtypes.Result {
|
|||
|
||||
func (p *Plugin) Run() {
|
||||
defer func() {
|
||||
glog.Info("Stopping plugin execution")
|
||||
klog.Info("Stopping plugin execution")
|
||||
close(p.resultChan)
|
||||
p.tomb.Done()
|
||||
}()
|
||||
|
@ -90,9 +89,10 @@ func (p *Plugin) Run() {
|
|||
|
||||
// run each rule in parallel and wait for them to complete
|
||||
func (p *Plugin) runRules() {
|
||||
glog.V(3).Info("Start to run custom plugins")
|
||||
klog.V(3).Info("Start to run custom plugins")
|
||||
|
||||
for _, rule := range p.config.Rules {
|
||||
// syncChan limits concurrent goroutines to configured PluginGlobalConfig.Concurrency value
|
||||
p.syncChan <- struct{}{}
|
||||
p.Add(1)
|
||||
go func(rule *cpmtypes.CustomRule) {
|
||||
|
@ -103,8 +103,12 @@ func (p *Plugin) runRules() {
|
|||
|
||||
start := time.Now()
|
||||
exitStatus, message := p.run(*rule)
|
||||
level := klog.Level(3)
|
||||
if exitStatus != 0 {
|
||||
level = klog.Level(2)
|
||||
}
|
||||
|
||||
glog.V(3).Infof("Rule: %+v. Start time: %v. End time: %v. Duration: %v", rule, start, time.Now(), time.Since(start))
|
||||
klog.V(level).Infof("Rule: %+v. Start time: %v. End time: %v. Duration: %v", rule, start, time.Now(), time.Since(start))
|
||||
|
||||
result := cpmtypes.Result{
|
||||
Rule: rule,
|
||||
|
@ -112,26 +116,27 @@ func (p *Plugin) runRules() {
|
|||
Message: message,
|
||||
}
|
||||
|
||||
// pipes result into resultChan which customPluginMonitor instance generates status from
|
||||
p.resultChan <- result
|
||||
|
||||
// Let the result be logged at a higher verbosity level. If there is a change in status it is logged later.
|
||||
glog.V(3).Infof("Add check result %+v for rule %+v", result, rule)
|
||||
klog.V(level).Infof("Add check result %+v for rule %+v", result, rule)
|
||||
}(rule)
|
||||
}
|
||||
|
||||
p.Wait()
|
||||
glog.V(3).Info("Finish running custom plugins")
|
||||
klog.V(3).Info("Finish running custom plugins")
|
||||
}
|
||||
|
||||
// readFromReader reads the maxBytes from the reader and drains the rest.
|
||||
func readFromReader(reader io.ReadCloser, maxBytes int64) ([]byte, error) {
|
||||
limitReader := io.LimitReader(reader, maxBytes)
|
||||
data, err := ioutil.ReadAll(limitReader)
|
||||
data, err := io.ReadAll(limitReader)
|
||||
if err != nil {
|
||||
return []byte{}, err
|
||||
}
|
||||
// Drain the reader
|
||||
if _, err := io.Copy(ioutil.Discard, reader); err != nil {
|
||||
if _, err := io.Copy(io.Discard, reader); err != nil {
|
||||
return []byte{}, err
|
||||
}
|
||||
return data, nil
|
||||
|
@ -152,16 +157,16 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||
|
||||
stdoutPipe, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
glog.Errorf("Error creating stdout pipe for plugin %q: error - %v", rule.Path, err)
|
||||
klog.Errorf("Error creating stdout pipe for plugin %q: error - %v", rule.Path, err)
|
||||
return cpmtypes.Unknown, "Error creating stdout pipe for plugin. Please check the error log"
|
||||
}
|
||||
stderrPipe, err := cmd.StderrPipe()
|
||||
if err != nil {
|
||||
glog.Errorf("Error creating stderr pipe for plugin %q: error - %v", rule.Path, err)
|
||||
klog.Errorf("Error creating stderr pipe for plugin %q: error - %v", rule.Path, err)
|
||||
return cpmtypes.Unknown, "Error creating stderr pipe for plugin. Please check the error log"
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
glog.Errorf("Error in starting plugin %q: error - %v", rule.Path, err)
|
||||
klog.Errorf("Error in starting plugin %q: error - %v", rule.Path, err)
|
||||
return cpmtypes.Unknown, "Error in starting plugin. Please check the error log"
|
||||
}
|
||||
|
||||
|
@ -177,9 +182,9 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||
if ctx.Err() == context.Canceled {
|
||||
return
|
||||
}
|
||||
glog.Errorf("Error in running plugin timeout %q", rule.Path)
|
||||
klog.Errorf("Error in running plugin timeout %q", rule.Path)
|
||||
if cmd.Process == nil || cmd.Process.Pid == 0 {
|
||||
glog.Errorf("Error in cmd.Process check %q", rule.Path)
|
||||
klog.Errorf("Error in cmd.Process check %q", rule.Path)
|
||||
break
|
||||
}
|
||||
|
||||
|
@ -189,7 +194,7 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||
|
||||
err := util.Kill(cmd)
|
||||
if err != nil {
|
||||
glog.Errorf("Error in kill process %d, %v", cmd.Process.Pid, err)
|
||||
klog.Errorf("Error in kill process %d, %v", cmd.Process.Pid, err)
|
||||
}
|
||||
case <-waitChan:
|
||||
return
|
||||
|
@ -218,18 +223,18 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||
wg.Wait()
|
||||
|
||||
if stdoutErr != nil {
|
||||
glog.Errorf("Error reading stdout for plugin %q: error - %v", rule.Path, err)
|
||||
klog.Errorf("Error reading stdout for plugin %q: error - %v", rule.Path, err)
|
||||
return cpmtypes.Unknown, "Error reading stdout for plugin. Please check the error log"
|
||||
}
|
||||
|
||||
if stderrErr != nil {
|
||||
glog.Errorf("Error reading stderr for plugin %q: error - %v", rule.Path, err)
|
||||
klog.Errorf("Error reading stderr for plugin %q: error - %v", rule.Path, err)
|
||||
return cpmtypes.Unknown, "Error reading stderr for plugin. Please check the error log"
|
||||
}
|
||||
|
||||
if err := cmd.Wait(); err != nil {
|
||||
if _, ok := err.(*exec.ExitError); !ok {
|
||||
glog.Errorf("Error in waiting for plugin %q: error - %v. output - %q", rule.Path, err, string(stdout))
|
||||
klog.Errorf("Error in waiting for plugin %q: error - %v. output - %q", rule.Path, err, string(stdout))
|
||||
return cpmtypes.Unknown, "Error in waiting for plugin. Please check the error log"
|
||||
}
|
||||
}
|
||||
|
@ -268,12 +273,12 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||
// Stop the plugin.
|
||||
func (p *Plugin) Stop() {
|
||||
p.tomb.Stop()
|
||||
glog.Info("Stop plugin execution")
|
||||
klog.Info("Stop plugin execution")
|
||||
}
|
||||
|
||||
func logPluginStderr(rule cpmtypes.CustomRule, logs string, logLevel glog.Level) {
|
||||
func logPluginStderr(rule cpmtypes.CustomRule, logs string, logLevel klog.Level) {
|
||||
if len(logs) != 0 {
|
||||
glog.V(logLevel).Infof("Start logs from plugin %+v \n %s", rule, logs)
|
||||
glog.V(logLevel).Infof("End logs from plugin %+v", rule)
|
||||
klog.V(logLevel).Infof("Start logs from plugin %+v \n %s", rule, logs)
|
||||
klog.V(logLevel).Infof("End logs from plugin %+v", rule)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,6 +33,7 @@ var (
|
|||
defaultConcurrency = 3
|
||||
defaultMessageChangeBasedConditionUpdate = false
|
||||
defaultEnableMetricsReporting = true
|
||||
defaultSkipInitialStatus = false
|
||||
|
||||
customPluginName = "custom"
|
||||
)
|
||||
|
@ -52,9 +53,11 @@ type pluginGlobalConfig struct {
|
|||
Concurrency *int `json:"concurrency,omitempty"`
|
||||
// EnableMessageChangeBasedConditionUpdate indicates whether NPD should enable message change based condition update.
|
||||
EnableMessageChangeBasedConditionUpdate *bool `json:"enable_message_change_based_condition_update,omitempty"`
|
||||
// SkipInitialStatus prevents the first status update with default conditions
|
||||
SkipInitialStatus *bool `json:"skip_initial_status,omitempty"`
|
||||
}
|
||||
|
||||
// Custom plugin config is the configuration of custom plugin monitor.
|
||||
// CustomPluginConfig is the configuration of custom plugin monitor.
|
||||
type CustomPluginConfig struct {
|
||||
// Plugin is the name of plugin which is currently used.
|
||||
// Currently supported: custom.
|
||||
|
@ -105,6 +108,10 @@ func (cpc *CustomPluginConfig) ApplyConfiguration() error {
|
|||
cpc.PluginGlobalConfig.EnableMessageChangeBasedConditionUpdate = &defaultMessageChangeBasedConditionUpdate
|
||||
}
|
||||
|
||||
if cpc.PluginGlobalConfig.SkipInitialStatus == nil {
|
||||
cpc.PluginGlobalConfig.SkipInitialStatus = &defaultSkipInitialStatus
|
||||
}
|
||||
|
||||
for _, rule := range cpc.Rules {
|
||||
if rule.TimeoutString != nil {
|
||||
timeout, err := time.ParseDuration(*rule.TimeoutString)
|
||||
|
|
|
@ -33,6 +33,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
concurrency := 2
|
||||
messageChangeBasedConditionUpdate := true
|
||||
disableMetricsReporting := false
|
||||
disableInitialStatusUpdate := true
|
||||
|
||||
ruleTimeout := 1 * time.Second
|
||||
ruleTimeoutString := ruleTimeout.String()
|
||||
|
@ -62,6 +63,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
Rules: []*CustomRule{
|
||||
|
@ -91,6 +93,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
|
@ -110,6 +113,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
|
@ -129,6 +133,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &maxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
|
@ -148,6 +153,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &concurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
|
@ -167,6 +173,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &messageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
|
@ -184,10 +191,30 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &disableMetricsReporting,
|
||||
},
|
||||
},
|
||||
"disable status update during initialization": {
|
||||
Orig: CustomPluginConfig{PluginGlobalConfig: pluginGlobalConfig{
|
||||
SkipInitialStatus: &disableInitialStatusUpdate,
|
||||
},
|
||||
},
|
||||
Wanted: CustomPluginConfig{
|
||||
PluginGlobalConfig: pluginGlobalConfig{
|
||||
InvokeIntervalString: &defaultInvokeIntervalString,
|
||||
InvokeInterval: &defaultInvokeInterval,
|
||||
TimeoutString: &defaultGlobalTimeoutString,
|
||||
Timeout: &defaultGlobalTimeout,
|
||||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &disableInitialStatusUpdate,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for desp, utMeta := range utMetas {
|
||||
|
|
|
@ -17,8 +17,9 @@ limitations under the License.
|
|||
package types
|
||||
|
||||
import (
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
"time"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
)
|
||||
|
||||
type Status int
|
||||
|
|
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||
package condition
|
||||
|
||||
import (
|
||||
"context"
|
||||
"reflect"
|
||||
"sync"
|
||||
"time"
|
||||
|
@ -25,10 +26,10 @@ import (
|
|||
"k8s.io/node-problem-detector/pkg/types"
|
||||
problemutil "k8s.io/node-problem-detector/pkg/util"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/utils/clock"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -49,7 +50,7 @@ const (
|
|||
// not. This addresses 3).
|
||||
type ConditionManager interface {
|
||||
// Start starts the condition manager.
|
||||
Start()
|
||||
Start(ctx context.Context)
|
||||
// UpdateCondition updates a specific condition.
|
||||
UpdateCondition(types.Condition)
|
||||
// GetConditions returns all current conditions.
|
||||
|
@ -67,7 +68,7 @@ type conditionManager struct {
|
|||
// No lock is needed in `sync`, because it is in the same goroutine with the
|
||||
// write operation.
|
||||
sync.RWMutex
|
||||
clock clock.Clock
|
||||
clock clock.WithTicker
|
||||
latestTry time.Time
|
||||
resyncNeeded bool
|
||||
client problemclient.Client
|
||||
|
@ -78,18 +79,18 @@ type conditionManager struct {
|
|||
}
|
||||
|
||||
// NewConditionManager creates a condition manager.
|
||||
func NewConditionManager(client problemclient.Client, clock clock.Clock, heartbeatPeriod time.Duration) ConditionManager {
|
||||
func NewConditionManager(client problemclient.Client, clockInUse clock.WithTicker, heartbeatPeriod time.Duration) ConditionManager {
|
||||
return &conditionManager{
|
||||
client: client,
|
||||
clock: clock,
|
||||
clock: clockInUse,
|
||||
updates: make(map[string]types.Condition),
|
||||
conditions: make(map[string]types.Condition),
|
||||
heartbeatPeriod: heartbeatPeriod,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *conditionManager) Start() {
|
||||
go c.syncLoop()
|
||||
func (c *conditionManager) Start(ctx context.Context) {
|
||||
go c.syncLoop(ctx)
|
||||
}
|
||||
|
||||
func (c *conditionManager) UpdateCondition(condition types.Condition) {
|
||||
|
@ -110,15 +111,17 @@ func (c *conditionManager) GetConditions() []types.Condition {
|
|||
return conditions
|
||||
}
|
||||
|
||||
func (c *conditionManager) syncLoop() {
|
||||
func (c *conditionManager) syncLoop(ctx context.Context) {
|
||||
ticker := c.clock.NewTicker(updatePeriod)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C():
|
||||
if c.needUpdates() || c.needResync() || c.needHeartbeat() {
|
||||
c.sync()
|
||||
c.sync(ctx)
|
||||
}
|
||||
case <-ctx.Done():
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -150,16 +153,16 @@ func (c *conditionManager) needHeartbeat() bool {
|
|||
}
|
||||
|
||||
// sync synchronizes node conditions with the apiserver.
|
||||
func (c *conditionManager) sync() {
|
||||
func (c *conditionManager) sync(ctx context.Context) {
|
||||
c.latestTry = c.clock.Now()
|
||||
c.resyncNeeded = false
|
||||
conditions := []v1.NodeCondition{}
|
||||
for i := range c.conditions {
|
||||
conditions = append(conditions, problemutil.ConvertToAPICondition(c.conditions[i]))
|
||||
}
|
||||
if err := c.client.SetConditions(conditions); err != nil {
|
||||
if err := c.client.SetConditions(ctx, conditions); err != nil {
|
||||
// The conditions will be updated again in future sync
|
||||
glog.Errorf("failed to update node conditions: %v", err)
|
||||
klog.Errorf("failed to update node conditions: %v", err)
|
||||
c.resyncNeeded = true
|
||||
return
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||
package condition
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
@ -28,14 +29,14 @@ import (
|
|||
problemutil "k8s.io/node-problem-detector/pkg/util"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
testclock "k8s.io/utils/clock/testing"
|
||||
)
|
||||
|
||||
const heartbeatPeriod = 1 * time.Minute
|
||||
|
||||
func newTestManager() (*conditionManager, *problemclient.FakeProblemClient, *clock.FakeClock) {
|
||||
func newTestManager() (*conditionManager, *problemclient.FakeProblemClient, *testclock.FakeClock) {
|
||||
fakeClient := problemclient.NewFakeProblemClient()
|
||||
fakeClock := clock.NewFakeClock(time.Now())
|
||||
fakeClock := testclock.NewFakeClock(time.Now())
|
||||
manager := NewConditionManager(fakeClient, fakeClock, heartbeatPeriod)
|
||||
return manager.(*conditionManager), fakeClient, fakeClock
|
||||
}
|
||||
|
@ -109,7 +110,7 @@ func TestResync(t *testing.T) {
|
|||
m, fakeClient, fakeClock := newTestManager()
|
||||
condition := newTestCondition("TestCondition")
|
||||
m.conditions = map[string]types.Condition{condition.Type: condition}
|
||||
m.sync()
|
||||
m.sync(context.Background())
|
||||
expected := []v1.NodeCondition{problemutil.ConvertToAPICondition(condition)}
|
||||
assert.Nil(t, fakeClient.AssertConditions(expected), "Condition should be updated via client")
|
||||
|
||||
|
@ -118,7 +119,7 @@ func TestResync(t *testing.T) {
|
|||
assert.False(t, m.needResync(), "Should not resync after resync period without resync needed")
|
||||
|
||||
fakeClient.InjectError("SetConditions", fmt.Errorf("injected error"))
|
||||
m.sync()
|
||||
m.sync(context.Background())
|
||||
|
||||
assert.False(t, m.needResync(), "Should not resync before resync period")
|
||||
fakeClock.Step(resyncPeriod)
|
||||
|
@ -129,7 +130,7 @@ func TestHeartbeat(t *testing.T) {
|
|||
m, fakeClient, fakeClock := newTestManager()
|
||||
condition := newTestCondition("TestCondition")
|
||||
m.conditions = map[string]types.Condition{condition.Type: condition}
|
||||
m.sync()
|
||||
m.sync(context.Background())
|
||||
expected := []v1.NodeCondition{problemutil.ConvertToAPICondition(condition)}
|
||||
assert.Nil(t, fakeClient.AssertConditions(expected), "Condition should be updated via client")
|
||||
|
||||
|
|
|
@ -17,15 +17,16 @@ limitations under the License.
|
|||
package k8sexporter
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net"
|
||||
"net/http"
|
||||
_ "net/http/pprof"
|
||||
"net/http/pprof"
|
||||
"strconv"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
"k8s.io/utils/clock"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
"k8s.io/node-problem-detector/pkg/exporters/k8sexporter/condition"
|
||||
|
@ -37,6 +38,8 @@ import (
|
|||
type k8sExporter struct {
|
||||
client problemclient.Client
|
||||
conditionManager condition.ConditionManager
|
||||
writeEvents bool
|
||||
updateConditions bool
|
||||
}
|
||||
|
||||
// NewExporterOrDie creates a exporter for Kubernetes apiserver exporting,
|
||||
|
@ -44,35 +47,41 @@ type k8sExporter struct {
|
|||
//
|
||||
// Note that this function may be blocked (until a timeout occurs) before
|
||||
// kube-apiserver becomes ready.
|
||||
func NewExporterOrDie(npdo *options.NodeProblemDetectorOptions) types.Exporter {
|
||||
func NewExporterOrDie(ctx context.Context, npdo *options.NodeProblemDetectorOptions) types.Exporter {
|
||||
if !npdo.EnableK8sExporter {
|
||||
return nil
|
||||
}
|
||||
|
||||
c := problemclient.NewClientOrDie(npdo)
|
||||
|
||||
glog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout)
|
||||
if err := waitForAPIServerReadyWithTimeout(c, npdo); err != nil {
|
||||
glog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err)
|
||||
klog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout)
|
||||
if err := waitForAPIServerReadyWithTimeout(ctx, c, npdo); err != nil {
|
||||
klog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err)
|
||||
}
|
||||
|
||||
ke := k8sExporter{
|
||||
client: c,
|
||||
conditionManager: condition.NewConditionManager(c, clock.RealClock{}, npdo.K8sExporterHeartbeatPeriod),
|
||||
writeEvents: npdo.K8sExporterWriteEvents,
|
||||
updateConditions: npdo.K8sExporterUpdateNodeConditions,
|
||||
}
|
||||
|
||||
ke.startHTTPReporting(npdo)
|
||||
ke.conditionManager.Start()
|
||||
ke.conditionManager.Start(ctx)
|
||||
|
||||
return &ke
|
||||
}
|
||||
|
||||
func (ke *k8sExporter) ExportProblems(status *types.Status) {
|
||||
for _, event := range status.Events {
|
||||
ke.client.Eventf(util.ConvertToAPIEventType(event.Severity), status.Source, event.Reason, event.Message)
|
||||
if ke.writeEvents {
|
||||
for _, event := range status.Events {
|
||||
ke.client.Eventf(util.ConvertToAPIEventType(event.Severity), status.Source, event.Reason, event.Message)
|
||||
}
|
||||
}
|
||||
for _, cdt := range status.Conditions {
|
||||
ke.conditionManager.UpdateCondition(cdt)
|
||||
if ke.updateConditions {
|
||||
for _, cdt := range status.Conditions {
|
||||
ke.conditionManager.UpdateCondition(cdt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -94,22 +103,30 @@ func (ke *k8sExporter) startHTTPReporting(npdo *options.NodeProblemDetectorOptio
|
|||
util.ReturnHTTPJson(w, ke.conditionManager.GetConditions())
|
||||
})
|
||||
|
||||
// register pprof
|
||||
mux.HandleFunc("/debug/pprof/", pprof.Index)
|
||||
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
|
||||
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
|
||||
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
|
||||
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
|
||||
|
||||
addr := net.JoinHostPort(npdo.ServerAddress, strconv.Itoa(npdo.ServerPort))
|
||||
go func() {
|
||||
err := http.ListenAndServe(addr, mux)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to start server: %v", err)
|
||||
klog.Fatalf("Failed to start server: %v", err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func waitForAPIServerReadyWithTimeout(c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error {
|
||||
return wait.PollImmediate(npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, func() (done bool, err error) {
|
||||
func waitForAPIServerReadyWithTimeout(ctx context.Context, c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error {
|
||||
return wait.PollUntilContextTimeout(ctx, npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, true, func(ctx context.Context) (done bool, err error) {
|
||||
// If NPD can get the node object from kube-apiserver, the server is
|
||||
// ready and the RBAC permission is set correctly.
|
||||
if _, err := c.GetNode(); err == nil {
|
||||
return true, nil
|
||||
if _, err := c.GetNode(ctx); err != nil {
|
||||
klog.Errorf("Can't get node object: %v", err)
|
||||
return false, err
|
||||
}
|
||||
return false, nil
|
||||
return true, nil
|
||||
})
|
||||
}
|
||||
|
|
|
@ -12,12 +12,12 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package kubernetes
|
||||
package problemclient
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/url"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"k8s.io/apimachinery/pkg/runtime/schema"
|
||||
|
@ -57,7 +57,7 @@ func getConfigOverrides(uri *url.URL) (*kubeClientCmd.ConfigOverrides, error) {
|
|||
return &kubeConfigOverride, nil
|
||||
}
|
||||
|
||||
func GetKubeClientConfig(uri *url.URL) (*kube_rest.Config, error) {
|
||||
func getKubeClientConfig(uri *url.URL) (*kube_rest.Config, error) {
|
||||
var (
|
||||
kubeConfig *kube_rest.Config
|
||||
err error
|
||||
|
@ -137,7 +137,7 @@ func GetKubeClientConfig(uri *url.URL) (*kube_rest.Config, error) {
|
|||
|
||||
if useServiceAccount {
|
||||
// If a readable service account token exists, then use it
|
||||
if contents, err := ioutil.ReadFile(defaultServiceAccountFile); err == nil {
|
||||
if contents, err := os.ReadFile(defaultServiceAccountFile); err == nil {
|
||||
kubeConfig.BearerToken = string(contents)
|
||||
}
|
||||
}
|
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||
package problemclient
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sync"
|
||||
|
@ -60,7 +61,7 @@ func (f *FakeProblemClient) AssertConditions(expected []v1.NodeCondition) error
|
|||
}
|
||||
|
||||
// SetConditions is a fake mimic of SetConditions, it only update the internal condition cache.
|
||||
func (f *FakeProblemClient) SetConditions(conditions []v1.NodeCondition) error {
|
||||
func (f *FakeProblemClient) SetConditions(ctx context.Context, conditions []v1.NodeCondition) error {
|
||||
f.Lock()
|
||||
defer f.Unlock()
|
||||
if err, ok := f.errors["SetConditions"]; ok {
|
||||
|
@ -73,7 +74,7 @@ func (f *FakeProblemClient) SetConditions(conditions []v1.NodeCondition) error {
|
|||
}
|
||||
|
||||
// GetConditions is a fake mimic of GetConditions, it returns the conditions cached internally.
|
||||
func (f *FakeProblemClient) GetConditions(types []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
|
||||
func (f *FakeProblemClient) GetConditions(ctx context.Context, types []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
|
||||
f.Lock()
|
||||
defer f.Unlock()
|
||||
if err, ok := f.errors["GetConditions"]; ok {
|
||||
|
@ -93,6 +94,6 @@ func (f *FakeProblemClient) GetConditions(types []v1.NodeConditionType) ([]*v1.N
|
|||
func (f *FakeProblemClient) Eventf(eventType string, source, reason, messageFmt string, args ...interface{}) {
|
||||
}
|
||||
|
||||
func (f *FakeProblemClient) GetNode() (*v1.Node, error) {
|
||||
func (f *FakeProblemClient) GetNode(ctx context.Context) (*v1.Node, error) {
|
||||
return nil, fmt.Errorf("GetNode() not implemented")
|
||||
}
|
||||
|
|
|
@ -17,24 +17,24 @@ limitations under the License.
|
|||
package problemclient
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
|
||||
"k8s.io/kubernetes/pkg/api/legacyscheme"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
clientset "k8s.io/client-go/kubernetes"
|
||||
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
|
||||
"k8s.io/client-go/tools/record"
|
||||
"k8s.io/client-go/util/retry"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/utils/clock"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/heapster/common/kubernetes"
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
"k8s.io/node-problem-detector/pkg/version"
|
||||
)
|
||||
|
@ -42,14 +42,14 @@ import (
|
|||
// Client is the interface of problem client
|
||||
type Client interface {
|
||||
// GetConditions get all specific conditions of current node.
|
||||
GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error)
|
||||
GetConditions(ctx context.Context, conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error)
|
||||
// SetConditions set or update conditions of current node.
|
||||
SetConditions(conditions []v1.NodeCondition) error
|
||||
SetConditions(ctx context.Context, conditionTypes []v1.NodeCondition) error
|
||||
// Eventf reports the event.
|
||||
Eventf(eventType string, source, reason, messageFmt string, args ...interface{})
|
||||
// GetNode returns the Node object of the node on which the
|
||||
// node-problem-detector runs.
|
||||
GetNode() (*v1.Node, error)
|
||||
GetNode(ctx context.Context) (*v1.Node, error)
|
||||
}
|
||||
|
||||
type nodeProblemClient struct {
|
||||
|
@ -68,13 +68,14 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client {
|
|||
// we have checked it is a valid URI after command line argument is parsed.:)
|
||||
uri, _ := url.Parse(npdo.ApiServerOverride)
|
||||
|
||||
cfg, err := kubernetes.GetKubeClientConfig(uri)
|
||||
cfg, err := getKubeClientConfig(uri)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
cfg.UserAgent = fmt.Sprintf("%s/%s", filepath.Base(os.Args[0]), version.Version())
|
||||
// TODO(random-liu): Set QPS Limit
|
||||
cfg.QPS = npdo.QPS
|
||||
cfg.Burst = npdo.Burst
|
||||
c.client = clientset.NewForConfigOrDie(cfg).CoreV1()
|
||||
c.nodeName = npdo.NodeName
|
||||
c.eventNamespace = npdo.EventNamespace
|
||||
|
@ -83,8 +84,8 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client {
|
|||
return c
|
||||
}
|
||||
|
||||
func (c *nodeProblemClient) GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
|
||||
node, err := c.GetNode()
|
||||
func (c *nodeProblemClient) GetConditions(ctx context.Context, conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
|
||||
node, err := c.GetNode(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -99,7 +100,7 @@ func (c *nodeProblemClient) GetConditions(conditionTypes []v1.NodeConditionType)
|
|||
return conditions, nil
|
||||
}
|
||||
|
||||
func (c *nodeProblemClient) SetConditions(newConditions []v1.NodeCondition) error {
|
||||
func (c *nodeProblemClient) SetConditions(ctx context.Context, newConditions []v1.NodeCondition) error {
|
||||
for i := range newConditions {
|
||||
// Each time we update the conditions, we update the heart beat time
|
||||
newConditions[i].LastHeartbeatTime = metav1.NewTime(c.clock.Now())
|
||||
|
@ -108,7 +109,15 @@ func (c *nodeProblemClient) SetConditions(newConditions []v1.NodeCondition) erro
|
|||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return c.client.RESTClient().Patch(types.StrategicMergePatchType).Resource("nodes").Name(c.nodeName).SubResource("status").Body(patch).Do().Error()
|
||||
return retry.OnError(retry.DefaultRetry,
|
||||
func(error) bool {
|
||||
return true
|
||||
},
|
||||
func() error {
|
||||
_, err := c.client.Nodes().PatchStatus(ctx, c.nodeName, patch)
|
||||
return err
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string, args ...interface{}) {
|
||||
|
@ -121,8 +130,10 @@ func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string,
|
|||
recorder.Eventf(c.nodeRef, eventType, reason, messageFmt, args...)
|
||||
}
|
||||
|
||||
func (c *nodeProblemClient) GetNode() (*v1.Node, error) {
|
||||
return c.client.Nodes().Get(c.nodeName, metav1.GetOptions{})
|
||||
func (c *nodeProblemClient) GetNode(ctx context.Context) (*v1.Node, error) {
|
||||
// To reduce the load on APIServer & etcd, we are serving GET operations from
|
||||
// apiserver cache (the data might be slightly delayed).
|
||||
return c.client.Nodes().Get(ctx, c.nodeName, metav1.GetOptions{ResourceVersion: "0"})
|
||||
}
|
||||
|
||||
// generatePatch generates condition patch
|
||||
|
@ -137,8 +148,8 @@ func generatePatch(conditions []v1.NodeCondition) ([]byte, error) {
|
|||
// getEventRecorder generates a recorder for specific node name and source.
|
||||
func getEventRecorder(c typedcorev1.CoreV1Interface, namespace, nodeName, source string) record.EventRecorder {
|
||||
eventBroadcaster := record.NewBroadcaster()
|
||||
eventBroadcaster.StartLogging(glog.V(4).Infof)
|
||||
recorder := eventBroadcaster.NewRecorder(legacyscheme.Scheme, v1.EventSource{Component: source, Host: nodeName})
|
||||
eventBroadcaster.StartLogging(klog.V(4).Infof)
|
||||
recorder := eventBroadcaster.NewRecorder(runtime.NewScheme(), v1.EventSource{Component: source, Host: nodeName})
|
||||
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: c.Events(namespace)})
|
||||
return recorder
|
||||
}
|
||||
|
|
|
@ -22,10 +22,10 @@ import (
|
|||
"testing"
|
||||
"time"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
"k8s.io/client-go/tools/record"
|
||||
testclock "k8s.io/utils/clock/testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
@ -40,7 +40,7 @@ func newFakeProblemClient() *nodeProblemClient {
|
|||
nodeName: testNode,
|
||||
// There is no proper fake for *client.Client for now
|
||||
// TODO(random-liu): Add test for SetConditions when we have good fake for *client.Client
|
||||
clock: &clock.FakeClock{},
|
||||
clock: testclock.NewFakeClock(time.Now()),
|
||||
recorders: make(map[string]record.EventRecorder),
|
||||
nodeRef: getNodeRef("", testNode),
|
||||
}
|
||||
|
|
|
@ -22,8 +22,8 @@ import (
|
|||
"strconv"
|
||||
|
||||
"contrib.go.opencensus.io/exporter/prometheus"
|
||||
"github.com/golang/glog"
|
||||
"go.opencensus.io/stats/view"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
|
@ -40,13 +40,13 @@ func NewExporterOrDie(npdo *options.NodeProblemDetectorOptions) types.Exporter {
|
|||
addr := net.JoinHostPort(npdo.PrometheusServerAddress, strconv.Itoa(npdo.PrometheusServerPort))
|
||||
pe, err := prometheus.NewExporter(prometheus.Options{})
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to create Prometheus exporter: %v", err)
|
||||
klog.Fatalf("Failed to create Prometheus exporter: %v", err)
|
||||
}
|
||||
go func() {
|
||||
mux := http.NewServeMux()
|
||||
mux.Handle("/metrics", pe)
|
||||
if err := http.ListenAndServe(addr, mux); err != nil {
|
||||
glog.Fatalf("Failed to start Prometheus scrape endpoint: %v", err)
|
||||
klog.Fatalf("Failed to start Prometheus scrape endpoint: %v", err)
|
||||
}
|
||||
}()
|
||||
view.RegisterExporter(pe)
|
||||
|
|
|
@ -18,7 +18,7 @@ package gce
|
|||
|
||||
import (
|
||||
"cloud.google.com/go/compute/metadata"
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
|
||||
type Metadata struct {
|
||||
|
@ -37,7 +37,7 @@ func (md *Metadata) HasMissingField() bool {
|
|||
|
||||
func (md *Metadata) PopulateFromGCE() error {
|
||||
var err error
|
||||
glog.Info("Fetching GCE metadata from metadata server")
|
||||
klog.Info("Fetching GCE metadata from metadata server")
|
||||
if md.ProjectID == "" {
|
||||
md.ProjectID, err = metadata.ProjectID()
|
||||
if err != nil {
|
||||
|
|
|
@ -18,19 +18,19 @@ package stackdriverexporter
|
|||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"time"
|
||||
|
||||
"contrib.go.opencensus.io/exporter/stackdriver"
|
||||
monitoredres "contrib.go.opencensus.io/exporter/stackdriver/monitoredresource"
|
||||
"github.com/golang/glog"
|
||||
"github.com/spf13/pflag"
|
||||
"go.opencensus.io/stats/view"
|
||||
"google.golang.org/api/option"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"github.com/avast/retry-go"
|
||||
"github.com/avast/retry-go/v4"
|
||||
"k8s.io/node-problem-detector/pkg/exporters"
|
||||
seconfig "k8s.io/node-problem-detector/pkg/exporters/stackdriver/config"
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
|
@ -54,6 +54,7 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
|
|||
metrics.CPULoad15m: "compute.googleapis.com/guest/cpu/load_15m",
|
||||
metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length",
|
||||
metrics.DiskBytesUsedID: "compute.googleapis.com/guest/disk/bytes_used",
|
||||
metrics.DiskPercentUsedID: "compute.googleapis.com/guest/disk/percent_used",
|
||||
metrics.DiskIOTimeID: "compute.googleapis.com/guest/disk/io_time",
|
||||
metrics.DiskMergedOpsCountID: "compute.googleapis.com/guest/disk/merged_operation_count",
|
||||
metrics.DiskOpsBytesID: "compute.googleapis.com/guest/disk/operation_bytes_count",
|
||||
|
@ -66,6 +67,7 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
|
|||
metrics.MemoryDirtyUsedID: "compute.googleapis.com/guest/memory/dirty_used",
|
||||
metrics.MemoryPageCacheUsedID: "compute.googleapis.com/guest/memory/page_cache_used",
|
||||
metrics.MemoryUnevictableUsedID: "compute.googleapis.com/guest/memory/unevictable_used",
|
||||
metrics.MemoryPercentUsedID: "compute.googleapis.com/guest/memory/percent_used",
|
||||
metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count",
|
||||
metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state",
|
||||
metrics.OSFeatureID: "compute.googleapis.com/guest/system/os_feature_enabled",
|
||||
|
@ -137,12 +139,12 @@ func (se *stackdriverExporter) setupOpenCensusViewExporterOrDie() {
|
|||
DefaultMonitoringLabels: &globalLabels,
|
||||
})
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to create Stackdriver OpenCensus view exporter: %v", err)
|
||||
klog.Fatalf("Failed to create Stackdriver OpenCensus view exporter: %v", err)
|
||||
}
|
||||
|
||||
exportPeriod, err := time.ParseDuration(se.config.ExportPeriod)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to parse ExportPeriod %q: %v", se.config.ExportPeriod, err)
|
||||
klog.Fatalf("Failed to parse ExportPeriod %q: %v", se.config.ExportPeriod, err)
|
||||
}
|
||||
|
||||
view.SetReportingPeriod(exportPeriod)
|
||||
|
@ -151,33 +153,33 @@ func (se *stackdriverExporter) setupOpenCensusViewExporterOrDie() {
|
|||
|
||||
func (se *stackdriverExporter) populateMetadataOrDie() {
|
||||
if !se.config.GCEMetadata.HasMissingField() {
|
||||
glog.Infof("Using GCE metadata specified in the config file: %+v", se.config.GCEMetadata)
|
||||
klog.Infof("Using GCE metadata specified in the config file: %+v", se.config.GCEMetadata)
|
||||
return
|
||||
}
|
||||
|
||||
metadataFetchTimeout, err := time.ParseDuration(se.config.MetadataFetchTimeout)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to parse MetadataFetchTimeout %q: %v", se.config.MetadataFetchTimeout, err)
|
||||
klog.Fatalf("Failed to parse MetadataFetchTimeout %q: %v", se.config.MetadataFetchTimeout, err)
|
||||
}
|
||||
|
||||
metadataFetchInterval, err := time.ParseDuration(se.config.MetadataFetchInterval)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to parse MetadataFetchInterval %q: %v", se.config.MetadataFetchInterval, err)
|
||||
klog.Fatalf("Failed to parse MetadataFetchInterval %q: %v", se.config.MetadataFetchInterval, err)
|
||||
}
|
||||
|
||||
glog.Infof("Populating GCE metadata by querying GCE metadata server.")
|
||||
klog.Infof("Populating GCE metadata by querying GCE metadata server.")
|
||||
err = retry.Do(se.config.GCEMetadata.PopulateFromGCE,
|
||||
retry.Delay(metadataFetchInterval),
|
||||
retry.Attempts(uint(metadataFetchTimeout/metadataFetchInterval)),
|
||||
retry.DelayType(retry.FixedDelay))
|
||||
if err == nil {
|
||||
glog.Infof("Using GCE metadata: %+v", se.config.GCEMetadata)
|
||||
klog.Infof("Using GCE metadata: %+v", se.config.GCEMetadata)
|
||||
return
|
||||
}
|
||||
if se.config.PanicOnMetadataFetchFailure {
|
||||
glog.Fatalf("Failed to populate GCE metadata: %v", err)
|
||||
klog.Fatalf("Failed to populate GCE metadata: %v", err)
|
||||
} else {
|
||||
glog.Errorf("Failed to populate GCE metadata: %v", err)
|
||||
klog.Errorf("Failed to populate GCE metadata: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -200,7 +202,7 @@ func (clo *commandLineOptions) SetFlags(fs *pflag.FlagSet) {
|
|||
func NewExporterOrDie(clo types.CommandLineOptions) types.Exporter {
|
||||
options, ok := clo.(*commandLineOptions)
|
||||
if !ok {
|
||||
glog.Fatalf("Wrong type for the command line options of Stackdriver Exporter: %s.", reflect.TypeOf(clo))
|
||||
klog.Fatalf("Wrong type for the command line options of Stackdriver Exporter: %s.", reflect.TypeOf(clo))
|
||||
}
|
||||
if options.configPath == "" {
|
||||
return nil
|
||||
|
@ -209,17 +211,17 @@ func NewExporterOrDie(clo types.CommandLineOptions) types.Exporter {
|
|||
se := stackdriverExporter{}
|
||||
|
||||
// Apply configurations.
|
||||
f, err := ioutil.ReadFile(options.configPath)
|
||||
f, err := os.ReadFile(options.configPath)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to read configuration file %q: %v", options.configPath, err)
|
||||
klog.Fatalf("Failed to read configuration file %q: %v", options.configPath, err)
|
||||
}
|
||||
err = json.Unmarshal(f, &se.config)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to unmarshal configuration file %q: %v", options.configPath, err)
|
||||
klog.Fatalf("Failed to unmarshal configuration file %q: %v", options.configPath, err)
|
||||
}
|
||||
se.config.ApplyConfiguration()
|
||||
|
||||
glog.Infof("Starting Stackdriver exporter %s", options.configPath)
|
||||
klog.Infof("Starting Stackdriver exporter %s", options.configPath)
|
||||
|
||||
se.populateMetadataOrDie()
|
||||
se.setupOpenCensusViewExporterOrDie()
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_stackdriver_exporter
|
||||
// +build !disable_stackdriver_exporter
|
||||
|
||||
/*
|
||||
|
|
|
@ -17,9 +17,13 @@ limitations under the License.
|
|||
package healthchecker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
)
|
||||
|
@ -36,6 +40,7 @@ type healthChecker struct {
|
|||
crictlPath string
|
||||
healthCheckTimeout time.Duration
|
||||
coolDownTime time.Duration
|
||||
loopBackTime time.Duration
|
||||
logPatternsToCheck map[string]int
|
||||
}
|
||||
|
||||
|
@ -48,6 +53,7 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
|
|||
healthCheckTimeout: hco.HealthCheckTimeout,
|
||||
coolDownTime: hco.CoolDownTime,
|
||||
service: hco.Service,
|
||||
loopBackTime: hco.LoopBackTime,
|
||||
logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
|
||||
}
|
||||
hc.healthCheckFunc = getHealthCheckFunc(hco)
|
||||
|
@ -63,24 +69,26 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
|
|||
if err != nil {
|
||||
return healthy, err
|
||||
}
|
||||
logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.logPatternsToCheck)
|
||||
logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.loopBackTime, hc.logPatternsToCheck)
|
||||
if err != nil {
|
||||
return logPatternHealthy, err
|
||||
}
|
||||
if healthy && logPatternHealthy {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// The service is unhealthy.
|
||||
// Attempt repair based on flag.
|
||||
if hc.enableRepair {
|
||||
// repair if the service has been up for the cool down period.
|
||||
uptime, err := hc.uptimeFunc()
|
||||
if err != nil {
|
||||
glog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
|
||||
klog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
|
||||
return false, nil
|
||||
}
|
||||
glog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
|
||||
klog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
|
||||
if uptime > hc.coolDownTime {
|
||||
glog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
|
||||
klog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
|
||||
hc.repairFunc()
|
||||
}
|
||||
}
|
||||
|
@ -89,18 +97,21 @@ func (hc *healthChecker) CheckHealth() (bool, error) {
|
|||
|
||||
// logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
|
||||
// Returns true if no patterns are given or if no pattern occurs at least its threshold number of times since the start of the service; false otherwise.
|
||||
func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (bool, error) {
|
||||
func logPatternHealthCheck(service string, loopBackTime time.Duration, logPatternsToCheck map[string]int) (bool, error) {
|
||||
if len(logPatternsToCheck) == 0 {
|
||||
return true, nil
|
||||
}
|
||||
uptimeFunc := getUptimeFunc(service)
|
||||
klog.Infof("Getting uptime for service: %v\n", service)
|
||||
uptime, err := uptimeFunc()
|
||||
if err != nil {
|
||||
klog.Warningf("Failed to get the uptime: %+v", err)
|
||||
return true, err
|
||||
}
|
||||
|
||||
logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
|
||||
if err != nil {
|
||||
return true, err
|
||||
if loopBackTime > 0 && uptime > loopBackTime {
|
||||
logStartTime = time.Now().Add(-loopBackTime).Format(types.LogParsingTimeLayout)
|
||||
}
|
||||
for pattern, count := range logPatternsToCheck {
|
||||
healthy, err := checkForPattern(service, logStartTime, pattern, count)
|
||||
|
@ -110,3 +121,65 @@ func logPatternHealthCheck(service string, logPatternsToCheck map[string]int) (b
|
|||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// healthCheckEndpointOKFunc returns a function to check the status of an http endpoint
|
||||
func healthCheckEndpointOKFunc(endpoint string, timeout time.Duration) func() (bool, error) {
|
||||
return func() (bool, error) {
|
||||
httpClient := http.Client{Timeout: timeout}
|
||||
response, err := httpClient.Get(endpoint)
|
||||
if err != nil || response.StatusCode != http.StatusOK {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
// getHealthCheckFunc returns the health check function based on the component.
|
||||
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
|
||||
switch hco.Component {
|
||||
case types.KubeletComponent:
|
||||
return healthCheckEndpointOKFunc(types.KubeletHealthCheckEndpoint(), hco.HealthCheckTimeout)
|
||||
case types.KubeProxyComponent:
|
||||
return healthCheckEndpointOKFunc(types.KubeProxyHealthCheckEndpoint(), hco.HealthCheckTimeout)
|
||||
case types.DockerComponent:
|
||||
return func() (bool, error) {
|
||||
if _, err := execCommand(hco.HealthCheckTimeout, getDockerPath(), "ps"); err != nil {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
case types.CRIComponent:
|
||||
return func() (bool, error) {
|
||||
_, err := execCommand(
|
||||
hco.HealthCheckTimeout,
|
||||
hco.CriCtlPath,
|
||||
"--timeout="+hco.CriTimeout.String(),
|
||||
"--runtime-endpoint="+hco.CriSocketPath,
|
||||
"pods",
|
||||
"--latest",
|
||||
)
|
||||
if err != nil {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
default:
|
||||
klog.Warningf("Unsupported component: %v", hco.Component)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// execCommand executes the given command with its arguments and returns the (output, error) from the command; an error is also returned if the timeout occurs.
|
||||
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(ctx, command, args...)
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
klog.Infof("command %v failed: %v, %s\n", cmd, err, string(out))
|
||||
return "", err
|
||||
}
|
||||
|
||||
return strings.TrimSuffix(string(out), "\n"), nil
|
||||
}
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
Copyright 2023 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package healthchecker
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"time"
|
||||
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
)
|
||||
|
||||
// getUptimeFunc returns the time for which the given service has been running.
|
||||
func getUptimeFunc(service string) func() (time.Duration, error) {
|
||||
klog.Fatalf("getUptimeFunc is not supported in %s", runtime.GOOS)
|
||||
return func() (time.Duration, error) { return time.Second, nil }
|
||||
}
|
||||
|
||||
// getRepairFunc returns the repair function based on the component.
|
||||
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
|
||||
klog.Fatalf("getRepairFunc is not supported in %s", runtime.GOOS)
|
||||
return func() {}
|
||||
}
|
||||
|
||||
// checkForPattern returns (true, nil) if logPattern occurs less than logCountThreshold number of times since last
|
||||
// service restart. (false, nil) otherwise.
|
||||
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
|
||||
klog.Fatalf("checkForPattern is not supported in %s", runtime.GOOS)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func getDockerPath() string {
|
||||
klog.Fatalf("getDockerPath is not supported in %s", runtime.GOOS)
|
||||
return ""
|
||||
}
|
|
@ -17,15 +17,12 @@ limitations under the License.
|
|||
package healthchecker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
|
@ -59,6 +56,11 @@ func getUptimeFunc(service string) func() (time.Duration, error) {
|
|||
|
||||
// getRepairFunc returns the repair function based on the component.
|
||||
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
|
||||
// Use `systemctl kill` instead of `systemctl restart` for the repair function.
|
||||
// We start to rely on the kernel message difference for the two commands to
|
||||
// indicate if the component restart is due to an administrative plan (restart)
|
||||
// or a system issue that needs repair (kill).
|
||||
// See https://github.com/kubernetes/node-problem-detector/issues/847.
|
||||
switch hco.Component {
|
||||
case types.DockerComponent:
|
||||
// Use "docker ps" for docker health check. Not using crictl for docker to remove
|
||||
|
@ -75,49 +77,6 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
|
|||
}
|
||||
}
|
||||
|
||||
// getHealthCheckFunc returns the health check function based on the component.
|
||||
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
|
||||
switch hco.Component {
|
||||
case types.KubeletComponent:
|
||||
return func() (bool, error) {
|
||||
httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
|
||||
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
|
||||
if err != nil || response.StatusCode != http.StatusOK {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
case types.DockerComponent:
|
||||
return func() (bool, error) {
|
||||
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
case types.CRIComponent:
|
||||
return func() (bool, error) {
|
||||
if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs.
|
||||
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(ctx, command, args...)
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
glog.Infof("command %v failed: %v, %v\n", cmd, err, out)
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSuffix(string(out), "\n"), nil
|
||||
}
|
||||
|
||||
// checkForPattern returns (true, nil) if logPattern occurs less than logCountThreshold number of times since last
|
||||
// service restart. (false, nil) otherwise.
|
||||
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
|
||||
|
@ -136,8 +95,12 @@ func checkForPattern(service, logStartTime, logPattern string, logCountThreshold
|
|||
return true, err
|
||||
}
|
||||
if occurrences >= logCountThreshold {
|
||||
glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
|
||||
klog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func getDockerPath() string {
|
||||
return "docker"
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ import (
|
|||
"testing"
|
||||
"time"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
)
|
||||
|
||||
|
@ -119,3 +120,38 @@ func TestHealthCheck(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestComponentsSupported(t *testing.T) {
|
||||
for _, tc := range []struct {
|
||||
description string
|
||||
component string
|
||||
}{
|
||||
{
|
||||
description: "Kube Proxy should be supported",
|
||||
component: types.KubeProxyComponent,
|
||||
},
|
||||
{
|
||||
description: "Kubelet should be supported",
|
||||
component: types.KubeletComponent,
|
||||
},
|
||||
{
|
||||
description: "Docker should be supported",
|
||||
component: types.DockerComponent,
|
||||
},
|
||||
{
|
||||
description: "CRI should be supported",
|
||||
component: types.CRIComponent,
|
||||
},
|
||||
} {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
checkFunc := getHealthCheckFunc(&options.HealthCheckerOptions{
|
||||
Component: tc.component,
|
||||
})
|
||||
if checkFunc == nil {
|
||||
t.Errorf("component %v should be supported", tc.component)
|
||||
}
|
||||
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -18,13 +18,12 @@ package healthchecker
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
|
@ -34,12 +33,19 @@ import (
|
|||
// getUptimeFunc returns the time for which the given service has been running.
|
||||
func getUptimeFunc(service string) func() (time.Duration, error) {
|
||||
return func() (time.Duration, error) {
|
||||
// Using the WinEvent Log Objects to find the Service logs' time when the Service last entered running state.
|
||||
// To attempt to calculate uptime more efficiently, we attempt to grab the process id to grab the start time.
|
||||
// If the process id does not exist (meaning the service is not running for some reason), we will resort to
|
||||
// using the WinEvent Log Objects to find the Service logs' time when the Service last entered running state.
|
||||
// In addition to filtering by logname=system, we also filter on event id=7036 to reduce the number of
|
||||
// entries the next command, Where-Object, will have to look through. Messages with id 7036 indicate a stopped or running service.
|
||||
// The powershell command formats the TimeCreated of the event log in RFC1123Pattern.
|
||||
// However, because the time library parser does not recognize the ',' in this RFC1123Pattern format,
|
||||
// it is manually removed before parsing it using the UptimeTimeLayout.
|
||||
getTimeCreatedCmd := "(Get-WinEvent -Logname System | Where-Object {$_.Message -Match '.*(" + service +
|
||||
").*(running).*'} | Select-Object -Property TimeCreated -First 1 | foreach {$_.TimeCreated.ToString('R')} | Out-String).Trim()"
|
||||
getTimeCreatedCmd := `$ProcessId = (Get-WMIObject -Class Win32_Service -Filter "Name='` + service + `'" | Select-Object -ExpandProperty ProcessId);` +
|
||||
`if ([string]::IsNullOrEmpty($ProcessId) -or $ProcessId -eq 0) { (Get-WinEvent -FilterHashtable @{logname='system';id=7036} ` +
|
||||
`| Where-Object {$_.Message -match '.*(` + service + `).*(running).*'} | Select-Object -Property TimeCreated -First 1 | ` +
|
||||
`foreach {$_.TimeCreated.ToUniversalTime().ToString('R')} | Out-String).Trim() } else { (Get-Process -Id $ProcessId | Select starttime | ` +
|
||||
`foreach {$_.starttime.ToUniversalTime().ToString('R')} | Out-String).Trim() }`
|
||||
out, err := powershell(getTimeCreatedCmd)
|
||||
if err != nil {
|
||||
return time.Duration(0), err
|
||||
|
@ -64,49 +70,6 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
|
|||
}
|
||||
}
|
||||
|
||||
// getHealthCheckFunc returns the health check function based on the component.
|
||||
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
|
||||
switch hco.Component {
|
||||
case types.KubeletComponent:
|
||||
return healthCheckEndpointOKFunc(types.KubeletHealthCheckEndpoint, hco.HealthCheckTimeout)
|
||||
case types.KubeProxyComponent:
|
||||
return healthCheckEndpointOKFunc(types.KubeProxyHealthCheckEndpoint, hco.HealthCheckTimeout)
|
||||
case types.DockerComponent:
|
||||
return func() (bool, error) {
|
||||
if _, err := execCommand("docker.exe", "ps"); err != nil {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
case types.CRIComponent:
|
||||
return func() (bool, error) {
|
||||
if _, err := execCommand(hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// healthCheckEndpointOKFunc returns a function to check the status of an http endpoint
|
||||
func healthCheckEndpointOKFunc(endpoint string, timeout time.Duration) func() (bool, error) {
|
||||
return func() (bool, error) {
|
||||
httpClient := http.Client{Timeout: timeout}
|
||||
response, err := httpClient.Get(endpoint)
|
||||
if err != nil || response.StatusCode != http.StatusOK {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
// execCommand creates a new process, executes the command, and returns the (output, error) from command.
|
||||
func execCommand(command string, args ...string) (string, error) {
|
||||
cmd := util.Exec(command, args...)
|
||||
return extractCommandOutput(cmd)
|
||||
}
|
||||
|
||||
// powershell executes the arguments in powershell process and returns (output, error) from command.
|
||||
func powershell(args ...string) (string, error) {
|
||||
cmd := util.Powershell(args...)
|
||||
|
@ -117,7 +80,7 @@ func powershell(args ...string) (string, error) {
|
|||
func extractCommandOutput(cmd *exec.Cmd) (string, error) {
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
glog.Infof("command %v failed: %v, %v\n", cmd, err, out)
|
||||
klog.Infof("command %v failed: %v, %v\n", cmd, err, out)
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSuffix(string(out), "\r\n"), nil
|
||||
|
@ -138,8 +101,12 @@ func checkForPattern(service, logStartTime, logPattern string, logCountThreshold
|
|||
return true, err
|
||||
}
|
||||
if occurrences >= logCountThreshold {
|
||||
glog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
|
||||
klog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func getDockerPath() string {
|
||||
return "docker.exe"
|
||||
}
|
||||
|
|
|
@ -18,6 +18,8 @@ package types
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
@ -25,6 +27,8 @@ import (
|
|||
)
|
||||
|
||||
const (
|
||||
DefaultLoopBackTime = 0 * time.Minute
|
||||
DefaultCriTimeout = 2 * time.Second
|
||||
DefaultCoolDownTime = 2 * time.Minute
|
||||
DefaultHealthCheckTimeout = 10 * time.Second
|
||||
CmdTimeout = 10 * time.Second
|
||||
|
@ -36,12 +40,57 @@ const (
|
|||
ContainerdService = "containerd"
|
||||
KubeProxyComponent = "kube-proxy"
|
||||
|
||||
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
|
||||
KubeProxyHealthCheckEndpoint = "http://127.0.0.1:10256/healthz"
|
||||
|
||||
LogPatternFlagSeparator = ":"
|
||||
hostAddressKey = "HOST_ADDRESS"
|
||||
kubeletPortKey = "KUBELET_PORT"
|
||||
kubeProxyPortKey = "KUBEPROXY_PORT"
|
||||
|
||||
defaultHostAddress = "localhost"
|
||||
defaultKubeletPort = "10248"
|
||||
defaultKubeproxyPort = "10256"
|
||||
)
|
||||
|
||||
var (
|
||||
kubeletHealthCheckEndpoint string
|
||||
kubeProxyHealthCheckEndpoint string
|
||||
)
|
||||
|
||||
func init() {
|
||||
setKubeEndpoints()
|
||||
}
|
||||
|
||||
func setKubeEndpoints() {
|
||||
var o string
|
||||
|
||||
hostAddress := defaultHostAddress
|
||||
kubeletPort := defaultKubeletPort
|
||||
kubeProxyPort := defaultKubeproxyPort
|
||||
|
||||
o = os.Getenv(hostAddressKey)
|
||||
if o != "" {
|
||||
hostAddress = o
|
||||
}
|
||||
o = os.Getenv(kubeletPortKey)
|
||||
if o != "" {
|
||||
kubeletPort = o
|
||||
}
|
||||
o = os.Getenv(kubeProxyPortKey)
|
||||
if o != "" {
|
||||
kubeProxyPort = o
|
||||
}
|
||||
|
||||
kubeletHealthCheckEndpoint = fmt.Sprintf("http://%s/healthz", net.JoinHostPort(hostAddress, kubeletPort))
|
||||
kubeProxyHealthCheckEndpoint = fmt.Sprintf("http://%s/healthz", net.JoinHostPort(hostAddress, kubeProxyPort))
|
||||
|
||||
}
|
||||
|
||||
func KubeProxyHealthCheckEndpoint() string {
|
||||
return kubeProxyHealthCheckEndpoint
|
||||
}
|
||||
func KubeletHealthCheckEndpoint() string {
|
||||
return kubeletHealthCheckEndpoint
|
||||
}
|
||||
|
||||
type HealthChecker interface {
|
||||
CheckHealth() (bool, error)
|
||||
}
|
||||
|
|
|
@ -1,23 +0,0 @@
|
|||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package types
|
||||
|
||||
const (
|
||||
DefaultCriCtl = "/usr/bin/crictl"
|
||||
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
|
||||
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 MST"
|
||||
)
|
|
@ -98,3 +98,101 @@ func TestLogPatternFlag(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestKubeEndpointConfiguration(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
envConfig map[string]string
|
||||
expectedKubeletEndpoint string
|
||||
expectedKubeProxyEndpoint string
|
||||
}{
|
||||
{
|
||||
name: "no overrides supplied",
|
||||
envConfig: map[string]string{},
|
||||
expectedKubeletEndpoint: "http://localhost:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://localhost:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS override supplied",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "samplehost.testdomain.com",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://samplehost.testdomain.com:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS override supplied with IPv4",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "10.0.5.4",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://10.0.5.4:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://10.0.5.4:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS override supplied with IPv6",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "80:f4:16::1",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://[80:f4:16::1]:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://[80:f4:16::1]:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "KUBELET_PORT override supplied",
|
||||
envConfig: map[string]string{
|
||||
"KUBELET_PORT": "12345",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://localhost:12345/healthz",
|
||||
expectedKubeProxyEndpoint: "http://localhost:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "KUBEPROXY_PORT override supplied",
|
||||
envConfig: map[string]string{
|
||||
"KUBEPROXY_PORT": "12345",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://localhost:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://localhost:12345/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS and KUBELET_PORT override supplied",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "samplehost.testdomain.com",
|
||||
"KUBELET_PORT": "12345",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://samplehost.testdomain.com:12345/healthz",
|
||||
expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS and KUBEPROXY_PORT override supplied",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "samplehost.testdomain.com",
|
||||
"KUBEPROXY_PORT": "12345",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://samplehost.testdomain.com:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:12345/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS, KUBELET_PORT and KUBEPROXY_PORT override supplied",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "10.0.10.1",
|
||||
"KUBELET_PORT": "12345",
|
||||
"KUBEPROXY_PORT": "12346",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://10.0.10.1:12345/healthz",
|
||||
expectedKubeProxyEndpoint: "http://10.0.10.1:12346/healthz",
|
||||
},
|
||||
}
|
||||
for _, test := range testCases {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
for key, val := range test.envConfig {
|
||||
t.Setenv(key, val)
|
||||
}
|
||||
setKubeEndpoints()
|
||||
|
||||
kubeProxyHCEndpoint := KubeProxyHealthCheckEndpoint()
|
||||
kubeletHCEndpoint := KubeletHealthCheckEndpoint()
|
||||
|
||||
assert.Equal(t, test.expectedKubeProxyEndpoint, kubeProxyHCEndpoint)
|
||||
assert.Equal(t, test.expectedKubeletEndpoint, kubeletHCEndpoint)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
//go:build unix
|
||||
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package types
|
||||
|
||||
// Defaults for unix builds (this file carries a //go:build unix constraint).
const (
|
||||
// DefaultCriCtl is the default location of the crictl binary.
DefaultCriCtl = "/usr/bin/crictl"
|
||||
// DefaultCriSocketPath is the default CRI socket endpoint; the value points
// at containerd's unix socket.
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
|
||||
// UptimeTimeLayout is a Go reference-time layout string.
// NOTE(review): presumably used to parse uptime/boot timestamps — confirm against callers.
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 MST"
|
||||
)
|
|
@ -17,7 +17,7 @@ limitations under the License.
|
|||
package types
|
||||
|
||||
const (
|
||||
DefaultCriCtl = "C:/node/crictl.exe"
|
||||
DefaultCriCtl = "C:/etc/kubernetes/node/bin/crictl.exe"
|
||||
DefaultCriSocketPath = "npipe:////./pipe/containerd-containerd"
|
||||
UptimeTimeLayout = "Mon 02 Jan 2006 15:04:05 MST"
|
||||
LogParsingTimeFormat = "yyyy-MM-dd HH:mm:ss"
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build journald
|
||||
// +build journald
|
||||
|
||||
/*
|
||||
|
@ -22,7 +23,7 @@ import (
|
|||
"fmt"
|
||||
"time"
|
||||
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
"k8s.io/utils/clock"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/logcounter/options"
|
||||
"k8s.io/node-problem-detector/pkg/logcounter/types"
|
||||
|
@ -39,10 +40,11 @@ const (
|
|||
)
|
||||
|
||||
type logCounter struct {
|
||||
logCh <-chan *systemtypes.Log
|
||||
buffer systemlogmonitor.LogBuffer
|
||||
pattern string
|
||||
clock clock.Clock
|
||||
logCh <-chan *systemtypes.Log
|
||||
buffer systemlogmonitor.LogBuffer
|
||||
pattern string
|
||||
revertPattern string
|
||||
clock clock.Clock
|
||||
}
|
||||
|
||||
func NewJournaldLogCounter(options *options.LogCounterOptions) (types.LogCounter, error) {
|
||||
|
@ -58,10 +60,11 @@ func NewJournaldLogCounter(options *options.LogCounterOptions) (types.LogCounter
|
|||
return nil, fmt.Errorf("error watching journald: %v", err)
|
||||
}
|
||||
return &logCounter{
|
||||
logCh: logCh,
|
||||
buffer: systemlogmonitor.NewLogBuffer(bufferSize),
|
||||
pattern: options.Pattern,
|
||||
clock: clock.RealClock{},
|
||||
logCh: logCh,
|
||||
buffer: systemlogmonitor.NewLogBuffer(bufferSize),
|
||||
pattern: options.Pattern,
|
||||
revertPattern: options.RevertPattern,
|
||||
clock: clock.RealClock{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
@ -83,6 +86,9 @@ func (e *logCounter) Count() (count int, err error) {
|
|||
if len(e.buffer.Match(e.pattern)) != 0 {
|
||||
count++
|
||||
}
|
||||
if e.revertPattern != "" && len(e.buffer.Match(e.revertPattern)) != 0 {
|
||||
count--
|
||||
}
|
||||
case <-e.clock.After(timeout):
|
||||
// Don't block forever if we do not get any new messages
|
||||
return
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build journald
|
||||
// +build journald
|
||||
|
||||
/*
|
||||
|
@ -22,16 +23,16 @@ import (
|
|||
"testing"
|
||||
"time"
|
||||
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
testclock "k8s.io/utils/clock/testing"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/logcounter/types"
|
||||
"k8s.io/node-problem-detector/pkg/systemlogmonitor"
|
||||
systemtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
|
||||
)
|
||||
|
||||
func NewTestLogCounter(pattern string, startTime time.Time) (types.LogCounter, *clock.FakeClock, chan *systemtypes.Log) {
|
||||
func NewTestLogCounter(pattern string, startTime time.Time) (types.LogCounter, *testclock.FakeClock, chan *systemtypes.Log) {
|
||||
logCh := make(chan *systemtypes.Log)
|
||||
clock := clock.NewFakeClock(startTime)
|
||||
clock := testclock.NewFakeClock(startTime)
|
||||
return &logCounter{
|
||||
logCh: logCh,
|
||||
buffer: systemlogmonitor.NewLogBuffer(bufferSize),
|
||||
|
|
|
@ -19,7 +19,7 @@ package problemdaemon
|
|||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
)
|
||||
|
@ -58,7 +58,7 @@ func NewProblemDaemons(monitorConfigPaths types.ProblemDaemonConfigPathMap) []ty
|
|||
for _, config := range *configs {
|
||||
if _, ok := problemDaemonMap[config]; ok {
|
||||
// Skip the config if it's duplicated.
|
||||
glog.Warningf("Duplicated problem daemon configuration %q", config)
|
||||
klog.Warningf("Duplicated problem daemon configuration %q", config)
|
||||
continue
|
||||
}
|
||||
problemDaemonMap[config] = handlers[problemDaemonType].CreateProblemDaemonOrDie(config)
|
||||
|
|
|
@ -17,16 +17,17 @@ limitations under the License.
|
|||
package problemdetector
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
)
|
||||
|
||||
// ProblemDetector collects statuses from all problem daemons, updates the node conditions, and sends node events.
|
||||
type ProblemDetector interface {
|
||||
Run(termCh <-chan error) error
|
||||
Run(context.Context) error
|
||||
}
|
||||
|
||||
type problemDetector struct {
|
||||
|
@ -44,7 +45,7 @@ func NewProblemDetector(monitors []types.Monitor, exporters []types.Exporter) Pr
|
|||
}
|
||||
|
||||
// Run starts the problem detector.
|
||||
func (p *problemDetector) Run(termCh <-chan error) error {
|
||||
func (p *problemDetector) Run(ctx context.Context) error {
|
||||
// Start the log monitors one by one.
|
||||
var chans []<-chan *types.Status
|
||||
failureCount := 0
|
||||
|
@ -52,7 +53,7 @@ func (p *problemDetector) Run(termCh <-chan error) error {
|
|||
ch, err := m.Start()
|
||||
if err != nil {
|
||||
// Do not return error and keep on trying the following config files.
|
||||
glog.Errorf("Failed to start problem daemon %v: %v", m, err)
|
||||
klog.Errorf("Failed to start problem daemon %v: %v", m, err)
|
||||
failureCount++
|
||||
continue
|
||||
}
|
||||
|
@ -73,11 +74,11 @@ func (p *problemDetector) Run(termCh <-chan error) error {
|
|||
}()
|
||||
|
||||
ch := groupChannel(chans)
|
||||
glog.Info("Problem detector started")
|
||||
klog.Info("Problem detector started")
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-termCh:
|
||||
case <-ctx.Done():
|
||||
return nil
|
||||
case status := <-ch:
|
||||
for _, exporter := range p.exporters {
|
||||
|
|
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||
package problemdetector
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
|
@ -24,7 +25,7 @@ import (
|
|||
|
||||
func TestEmpty(t *testing.T) {
|
||||
pd := NewProblemDetector([]types.Monitor{}, []types.Exporter{})
|
||||
if err := pd.Run(nil); err == nil {
|
||||
if err := pd.Run(context.Background()); err == nil {
|
||||
t.Error("expected error when running an empty problem detector")
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,7 +21,7 @@ import (
|
|||
"fmt"
|
||||
"sync"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/util/metrics"
|
||||
)
|
||||
|
@ -56,7 +56,7 @@ func NewProblemMetricsManagerOrDie() *ProblemMetricsManager {
|
|||
metrics.Sum,
|
||||
[]string{"reason"})
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to create problem_counter metric: %v", err)
|
||||
klog.Fatalf("Failed to create problem_counter metric: %v", err)
|
||||
}
|
||||
|
||||
pmm.problemGauge, err = metrics.NewInt64Metric(
|
||||
|
@ -67,7 +67,7 @@ func NewProblemMetricsManagerOrDie() *ProblemMetricsManager {
|
|||
metrics.LastValue,
|
||||
[]string{"type", "reason"})
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to create problem_gauge metric: %v", err)
|
||||
klog.Fatalf("Failed to create problem_gauge metric: %v", err)
|
||||
}
|
||||
|
||||
pmm.problemTypeToReason = make(map[string]string)
|
||||
|
|
|
@ -37,7 +37,8 @@ with new rule definition:
|
|||
"type": "temporary/permanent",
|
||||
"condition": "NodeConditionOfPermanentIssue",
|
||||
"reason": "CamelCaseShortReason",
|
||||
"message": "regexp matching the issue in the log"
|
||||
"pattern": "regexp matching the issue in the log",
|
||||
"patternGeneratedMessageSuffix": "Please check the network connectivity and ensure that all required services are running. For more details, see our documentation at https://example.com/docs/troubleshooting."
|
||||
}
|
||||
```
|
||||
|
||||
|
|
|
@ -46,7 +46,7 @@ type MonitorConfig struct {
|
|||
EnableMetricsReporting *bool `json:"metricsReporting,omitempty"`
|
||||
}
|
||||
|
||||
// ApplyConfiguration applies default configurations.
|
||||
// ApplyDefaultConfiguration applies default configurations.
|
||||
func (mc *MonitorConfig) ApplyDefaultConfiguration() {
|
||||
if mc.BufferSize == 0 {
|
||||
mc.BufferSize = defaultBufferSize
|
||||
|
|
|
@ -18,16 +18,16 @@ package systemlogmonitor
|
|||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/problemdaemon"
|
||||
"k8s.io/node-problem-detector/pkg/problemmetrics"
|
||||
"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers"
|
||||
watchertypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types"
|
||||
logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
|
||||
systemlogtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
"k8s.io/node-problem-detector/pkg/util"
|
||||
|
@ -50,7 +50,7 @@ type logMonitor struct {
|
|||
buffer LogBuffer
|
||||
config MonitorConfig
|
||||
conditions []types.Condition
|
||||
logCh <-chan *logtypes.Log
|
||||
logCh <-chan *systemlogtypes.Log
|
||||
output chan *types.Status
|
||||
tomb *tomb.Tomb
|
||||
}
|
||||
|
@ -62,21 +62,21 @@ func NewLogMonitorOrDie(configPath string) types.Monitor {
|
|||
tomb: tomb.NewTomb(),
|
||||
}
|
||||
|
||||
f, err := ioutil.ReadFile(configPath)
|
||||
f, err := os.ReadFile(configPath)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
|
||||
klog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
|
||||
}
|
||||
err = json.Unmarshal(f, &l.config)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
|
||||
klog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
|
||||
}
|
||||
// Apply default configurations
|
||||
(&l.config).ApplyDefaultConfiguration()
|
||||
err = l.config.ValidateRules()
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to validate %s matching rules %+v: %v", l.configPath, l.config.Rules, err)
|
||||
klog.Fatalf("Failed to validate %s matching rules %+v: %v", l.configPath, l.config.Rules, err)
|
||||
}
|
||||
glog.Infof("Finish parsing log monitor config file %s: %+v", l.configPath, l.config)
|
||||
klog.Infof("Finish parsing log monitor config file %s: %+v", l.configPath, l.config)
|
||||
|
||||
l.watcher = logwatchers.GetLogWatcherOrDie(l.config.WatcherConfig)
|
||||
l.buffer = NewLogBuffer(l.config.BufferSize)
|
||||
|
@ -96,19 +96,19 @@ func initializeProblemMetricsOrDie(rules []systemlogtypes.Rule) {
|
|||
if rule.Type == types.Perm {
|
||||
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, false)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
|
||||
klog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
|
||||
rule.Condition, rule.Reason, err)
|
||||
}
|
||||
}
|
||||
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 0)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
|
||||
klog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (l *logMonitor) Start() (<-chan *types.Status, error) {
|
||||
glog.Infof("Start log monitor %s", l.configPath)
|
||||
klog.Infof("Start log monitor %s", l.configPath)
|
||||
var err error
|
||||
l.logCh, err = l.watcher.Watch()
|
||||
if err != nil {
|
||||
|
@ -119,7 +119,7 @@ func (l *logMonitor) Start() (<-chan *types.Status, error) {
|
|||
}
|
||||
|
||||
func (l *logMonitor) Stop() {
|
||||
glog.Infof("Stop log monitor %s", l.configPath)
|
||||
klog.Infof("Stop log monitor %s", l.configPath)
|
||||
l.tomb.Stop()
|
||||
}
|
||||
|
||||
|
@ -134,20 +134,20 @@ func (l *logMonitor) monitorLoop() {
|
|||
select {
|
||||
case log, ok := <-l.logCh:
|
||||
if !ok {
|
||||
glog.Errorf("Log channel closed: %s", l.configPath)
|
||||
klog.Errorf("Log channel closed: %s", l.configPath)
|
||||
return
|
||||
}
|
||||
l.parseLog(log)
|
||||
case <-l.tomb.Stopping():
|
||||
l.watcher.Stop()
|
||||
glog.Infof("Log monitor stopped: %s", l.configPath)
|
||||
klog.Infof("Log monitor stopped: %s", l.configPath)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseLog parses one log line.
|
||||
func (l *logMonitor) parseLog(log *logtypes.Log) {
|
||||
func (l *logMonitor) parseLog(log *systemlogtypes.Log) {
|
||||
// Once there is new log, log monitor will push it into the log buffer and try
|
||||
// to match each rule. If any rule is matched, log monitor will report a status.
|
||||
l.buffer.Push(log)
|
||||
|
@ -157,16 +157,16 @@ func (l *logMonitor) parseLog(log *logtypes.Log) {
|
|||
continue
|
||||
}
|
||||
status := l.generateStatus(matched, rule)
|
||||
glog.Infof("New status generated: %+v", status)
|
||||
klog.Infof("New status generated: %+v", status)
|
||||
l.output <- status
|
||||
}
|
||||
}
|
||||
|
||||
// generateStatus generates status from the logs.
|
||||
func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Rule) *types.Status {
|
||||
func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogtypes.Rule) *types.Status {
|
||||
// We use the timestamp of the first log line as the timestamp of the status.
|
||||
timestamp := logs[0].Timestamp
|
||||
message := generateMessage(logs)
|
||||
message := generateMessage(logs, rule.PatternGeneratedMessageSuffix)
|
||||
var events []types.Event
|
||||
var changedConditions []*types.Condition
|
||||
if rule.Type == types.Temp {
|
||||
|
@ -192,6 +192,7 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru
|
|||
condition.Type,
|
||||
types.True,
|
||||
rule.Reason,
|
||||
message,
|
||||
timestamp,
|
||||
))
|
||||
}
|
||||
|
@ -207,14 +208,14 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru
|
|||
for _, event := range events {
|
||||
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(event.Reason, 1)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to update problem counter metrics for %q: %v", event.Reason, err)
|
||||
klog.Errorf("Failed to update problem counter metrics for %q: %v", event.Reason, err)
|
||||
}
|
||||
}
|
||||
for _, condition := range changedConditions {
|
||||
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(
|
||||
condition.Type, condition.Reason, condition.Status == types.True)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
|
||||
klog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
|
||||
condition.Type, condition.Reason, err)
|
||||
}
|
||||
}
|
||||
|
@ -232,7 +233,7 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru
|
|||
func (l *logMonitor) initializeStatus() {
|
||||
// Initialize the default node conditions
|
||||
l.conditions = initialConditions(l.config.DefaultConditions)
|
||||
glog.Infof("Initialize condition generated: %+v", l.conditions)
|
||||
klog.Infof("Initialize condition generated: %+v", l.conditions)
|
||||
// Update the initial status
|
||||
l.output <- &types.Status{
|
||||
Source: l.config.Source,
|
||||
|
@ -250,10 +251,14 @@ func initialConditions(defaults []types.Condition) []types.Condition {
|
|||
return conditions
|
||||
}
|
||||
|
||||
func generateMessage(logs []*logtypes.Log) string {
|
||||
func generateMessage(logs []*systemlogtypes.Log, patternGeneratedMessageSuffix string) string {
|
||||
messages := []string{}
|
||||
for _, log := range logs {
|
||||
messages = append(messages, log.Message)
|
||||
}
|
||||
return concatLogs(messages)
|
||||
logMessage := concatLogs(messages)
|
||||
if patternGeneratedMessageSuffix != "" {
|
||||
return fmt.Sprintf("%s; %s", logMessage, patternGeneratedMessageSuffix)
|
||||
}
|
||||
return logMessage
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import (
|
|||
"k8s.io/node-problem-detector/pkg/problemdaemon"
|
||||
"k8s.io/node-problem-detector/pkg/problemmetrics"
|
||||
logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
|
||||
systemlogtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
"k8s.io/node-problem-detector/pkg/util"
|
||||
"k8s.io/node-problem-detector/pkg/util/metrics"
|
||||
|
@ -84,6 +85,7 @@ func TestGenerateStatusForConditions(t *testing.T) {
|
|||
testConditionA,
|
||||
types.True,
|
||||
"test reason",
|
||||
"test message 1\ntest message 2",
|
||||
time.Unix(1000, 1000),
|
||||
)},
|
||||
Conditions: []types.Condition{
|
||||
|
@ -698,3 +700,40 @@ func TestInitializeProblemMetricsOrDie(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGenerateMessage(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
logs []*systemlogtypes.Log
|
||||
patternGeneratedMessageSuffix string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "No rule message",
|
||||
logs: []*systemlogtypes.Log{
|
||||
{Message: "First log message"},
|
||||
{Message: "Second log message"},
|
||||
},
|
||||
patternGeneratedMessageSuffix: "",
|
||||
want: "First log message\nSecond log message",
|
||||
},
|
||||
{
|
||||
name: "With rule message",
|
||||
logs: []*systemlogtypes.Log{
|
||||
{Message: "First log message"},
|
||||
{Message: "Second log message"},
|
||||
},
|
||||
patternGeneratedMessageSuffix: "refer www.foo.com/docs for playbook on how to fix the issue",
|
||||
want: "First log message\nSecond log message; refer www.foo.com/docs for playbook on how to fix the issue",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := generateMessage(tt.logs, tt.patternGeneratedMessageSuffix)
|
||||
if got != tt.want {
|
||||
t.Errorf("generateMessage() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,8 +23,7 @@ import (
|
|||
"strings"
|
||||
"time"
|
||||
|
||||
utilclock "code.cloudfoundry.org/clock"
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types"
|
||||
logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
|
||||
|
@ -40,7 +39,6 @@ type filelogWatcher struct {
|
|||
logCh chan *logtypes.Log
|
||||
startTime time.Time
|
||||
tomb *tomb.Tomb
|
||||
clock utilclock.Clock
|
||||
}
|
||||
|
||||
// NewSyslogWatcherOrDie creates a new log watcher. The function panics
|
||||
|
@ -48,11 +46,11 @@ type filelogWatcher struct {
|
|||
func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher {
|
||||
uptime, err := util.GetUptimeDuration()
|
||||
if err != nil {
|
||||
glog.Fatalf("failed to get uptime: %v", err)
|
||||
klog.Fatalf("failed to get uptime: %v", err)
|
||||
}
|
||||
startTime, err := util.GetStartTime(time.Now(), uptime, cfg.Lookback, cfg.Delay)
|
||||
if err != nil {
|
||||
glog.Fatalf("failed to get start time: %v", err)
|
||||
klog.Fatalf("failed to get start time: %v", err)
|
||||
}
|
||||
|
||||
return &filelogWatcher{
|
||||
|
@ -62,7 +60,6 @@ func NewSyslogWatcherOrDie(cfg types.WatcherConfig) types.LogWatcher {
|
|||
tomb: tomb.NewTomb(),
|
||||
// A capacity 1000 buffer should be enough
|
||||
logCh: make(chan *logtypes.Log, 1000),
|
||||
clock: utilclock.NewClock(),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -77,7 +74,7 @@ func (s *filelogWatcher) Watch() (<-chan *logtypes.Log, error) {
|
|||
}
|
||||
s.reader = bufio.NewReader(r)
|
||||
s.closer = r
|
||||
glog.Info("Start watching filelog")
|
||||
klog.Info("Start watching filelog")
|
||||
go s.watchLoop()
|
||||
return s.logCh, nil
|
||||
}
|
||||
|
@ -102,14 +99,14 @@ func (s *filelogWatcher) watchLoop() {
|
|||
for {
|
||||
select {
|
||||
case <-s.tomb.Stopping():
|
||||
glog.Infof("Stop watching filelog")
|
||||
klog.Infof("Stop watching filelog")
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
line, err := s.reader.ReadString('\n')
|
||||
if err != nil && err != io.EOF {
|
||||
glog.Errorf("Exiting filelog watch with error: %v", err)
|
||||
klog.Errorf("Exiting filelog watch with error: %v", err)
|
||||
return
|
||||
}
|
||||
buffer.WriteString(line)
|
||||
|
@ -119,16 +116,28 @@ func (s *filelogWatcher) watchLoop() {
|
|||
}
|
||||
line = buffer.String()
|
||||
buffer.Reset()
|
||||
if s.filterSkipList(line) {
|
||||
continue
|
||||
}
|
||||
log, err := s.translator.translate(strings.TrimSuffix(line, "\n"))
|
||||
if err != nil {
|
||||
glog.Warningf("Unable to parse line: %q, %v", line, err)
|
||||
klog.Warningf("Unable to parse line: %q, %v", line, err)
|
||||
continue
|
||||
}
|
||||
// Discard messages before start time.
|
||||
if log.Timestamp.Before(s.startTime) {
|
||||
glog.V(5).Infof("Throwing away msg %q before start time: %v < %v", log.Message, log.Timestamp, s.startTime)
|
||||
klog.V(5).Infof("Throwing away msg %q before start time: %v < %v", log.Message, log.Timestamp, s.startTime)
|
||||
continue
|
||||
}
|
||||
s.logCh <- log
|
||||
}
|
||||
}
|
||||
|
||||
func (s *filelogWatcher) filterSkipList(line string) bool {
|
||||
for _ , skipItem := range s.cfg.SkipList {
|
||||
if strings.Contains(line, skipItem) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
/*
|
||||
Copyright 2023 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package filelog
|
||||
|
||||
import (
|
||||
"io"
|
||||
|
||||
"github.com/hpcloud/tail"
|
||||
)
|
||||
|
||||
// getLogReader returns log reader for filelog log. Note that getLogReader doesn't look back
|
||||
// to the rolled out logs.
|
||||
func getLogReader(path string) (io.ReadCloser, error) {
|
||||
return tail.OpenFile(path)
|
||||
}
|
|
@ -19,9 +19,8 @@ package filelog
|
|||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"k8s.io/node-problem-detector/third_party/forked/cadvisor/tail"
|
||||
"os"
|
||||
|
||||
"github.com/google/cadvisor/utils/tail"
|
||||
)
|
||||
|
||||
// getLogReader returns log reader for filelog log. Note that getLogReader doesn't look back
|
||||
|
|
|
@ -17,7 +17,6 @@ limitations under the License.
|
|||
package filelog
|
||||
|
||||
import (
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
@ -26,8 +25,8 @@ import (
|
|||
logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/util"
|
||||
|
||||
"code.cloudfoundry.org/clock/fakeclock"
|
||||
"github.com/stretchr/testify/assert"
|
||||
testclock "k8s.io/utils/clock/testing"
|
||||
)
|
||||
|
||||
// getTestPluginConfig returns a plugin config for test. Use configuration for
|
||||
|
@ -43,7 +42,7 @@ func getTestPluginConfig() map[string]string {
|
|||
func TestWatch(t *testing.T) {
|
||||
// now is a fake time
|
||||
now := time.Date(time.Now().Year(), time.January, 2, 3, 4, 5, 0, time.Local)
|
||||
fakeClock := fakeclock.NewFakeClock(now)
|
||||
fakeClock := testclock.NewFakeClock(now)
|
||||
testCases := []struct {
|
||||
uptime time.Duration
|
||||
lookback string
|
||||
|
@ -139,7 +138,7 @@ Jan 2 03:04:05 kernel: [2.000000] 3
|
|||
}
|
||||
for c, test := range testCases {
|
||||
t.Logf("TestCase #%d: %#v", c+1, test)
|
||||
f, err := ioutil.TempFile("", "log_watcher_test")
|
||||
f, err := os.CreateTemp("", "log_watcher_test")
|
||||
assert.NoError(t, err)
|
||||
defer func() {
|
||||
f.Close()
|
||||
|
@ -156,8 +155,6 @@ Jan 2 03:04:05 kernel: [2.000000] 3
|
|||
})
|
||||
// Set the startTime.
|
||||
w.(*filelogWatcher).startTime, _ = util.GetStartTime(fakeClock.Now(), test.uptime, test.lookback, test.delay)
|
||||
// Set the fake clock.
|
||||
w.(*filelogWatcher).clock = fakeClock
|
||||
logCh, err := w.Watch()
|
||||
assert.NoError(t, err)
|
||||
defer w.Stop()
|
||||
|
@ -170,7 +167,7 @@ Jan 2 03:04:05 kernel: [2.000000] 3
|
|||
}
|
||||
}
|
||||
// The log channel should have already been drained
|
||||
// There could stil be future messages sent into the channel, but the chance is really slim.
|
||||
// There could still be future messages sent into the channel, but the chance is really slim.
|
||||
timeout := time.After(100 * time.Millisecond)
|
||||
select {
|
||||
case log := <-logCh:
|
||||
|
@ -179,3 +176,36 @@ Jan 2 03:04:05 kernel: [2.000000] 3
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFilterSkipList(t *testing.T) {
|
||||
s := &filelogWatcher{
|
||||
cfg: types.WatcherConfig{
|
||||
SkipList: []string{
|
||||
" audit:", " kubelet:",
|
||||
},
|
||||
},
|
||||
}
|
||||
testcase := []struct{
|
||||
log string
|
||||
expect bool
|
||||
}{
|
||||
{
|
||||
log: `Jan 2 03:04:03 kernel: [0.000000] 1`,
|
||||
expect: false,
|
||||
},
|
||||
{
|
||||
log: `Jan 2 03:04:04 audit: [1.000000] 2`,
|
||||
expect: true,
|
||||
},
|
||||
{
|
||||
log: `Jan 2 03:04:05 kubelet: [2.000000] 3`,
|
||||
expect: true,
|
||||
|
||||
},
|
||||
}
|
||||
for i, test := range testcase {
|
||||
if s.filterSkipList(test.log) != test.expect {
|
||||
t.Errorf("test case %d: expect %v but got %v", i, test.expect, s.filterSkipList(test.log))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue