Compare commits
626 Commits
Author | SHA1 | Date |
---|---|---|
|
dea6d70d46 | |
|
9d69c8e71a | |
|
6bbddb55de | |
|
cd3b7503bb | |
|
9b473a0e56 | |
|
a765aaecf7 | |
|
9e366f58cd | |
|
fc10031a7e | |
|
ca907dc101 | |
|
62223078ef | |
|
9fe113c522 | |
|
dc065c42f0 | |
|
8d4eb38a42 | |
|
0147098968 | |
|
308b7cfa4a | |
|
1721f9dbf7 | |
|
3f96666db7 | |
|
387571b357 | |
|
aede9d7e7f | |
|
0f1ee66855 | |
|
be0d387ec1 | |
|
78f51bf173 | |
|
87129900cf | |
|
59c46ad62c | |
|
4022575bf9 | |
|
f6bb4f7b55 | |
|
5562632053 | |
|
d6cfed982a | |
|
01e1cf033e | |
|
a099a5ed5c | |
|
5520e3df51 | |
|
c53e4f4308 | |
|
2a651d1f98 | |
|
e8b584ab52 | |
|
e858c3d1df | |
|
3bb752c25a | |
|
1ff64afbc9 | |
|
f69e7033e9 | |
|
f24ca57199 | |
|
32d7c72755 | |
|
2707945338 | |
|
c846b0ebaa | |
|
7039f066c7 | |
|
7ea55106c2 | |
|
3b92e70bc1 | |
|
4b9d196acd | |
|
cf267168c2 | |
|
416ec8b3c2 | |
|
7cb27449aa | |
|
186f0182b5 | |
|
628f021ffb | |
|
92597e574d | |
|
72f3041d2b | |
|
8d237a6c7c | |
|
c5f6fbc3d1 | |
|
12a8f5578c | |
|
d2cbde95e5 | |
|
66336e630a | |
|
93bc55b659 | |
|
72f1e1de7b | |
|
0a997f8116 | |
|
053539efd8 | |
|
cf0870fa12 | |
|
334a857fbe | |
|
f5433f460d | |
|
93e64ac709 | |
|
146ce4aa86 | |
|
d99fca5f0a | |
|
17d7588bff | |
|
26c77134bf | |
|
7d29a1c293 | |
|
3a8a07ad81 | |
|
cab30567cb | |
|
53f404dfed | |
|
b92aae803d | |
|
e8840b1a7d | |
|
29a98372ff | |
|
daaa07d690 | |
|
411cd7bd82 | |
|
d79c681e63 | |
|
45dde88c98 | |
|
b5ce184179 | |
|
2e15606dda | |
|
711760063a | |
|
2a3acd2669 | |
|
04173ee934 | |
|
f675d34e49 | |
|
de55c54059 | |
|
0d756b78fc | |
|
8b2ff03f5e | |
|
f4f5c479d9 | |
|
c1dd00d65c | |
|
68a97cf4cb | |
|
6c32180ce6 | |
|
7416db2236 | |
|
17dcc94418 | |
|
70e99e1e1f | |
|
53e0152f64 | |
|
16656c89f6 | |
|
0f4d8b96c5 | |
|
3a386a659e | |
|
b64f13f702 | |
|
2182ad0ddb | |
|
335e7e82ca | |
|
f392516a37 | |
|
c39f74def4 | |
|
be754653e6 | |
|
f0c5cd5d20 | |
|
ee955f9170 | |
|
3b91ca0c09 | |
|
13c44d92fd | |
|
56eb3dcb61 | |
|
e1d071ba63 | |
|
a22c0649f8 | |
|
57c97d2d47 | |
|
d83e1bcb53 | |
|
8f5c2e14fe | |
|
f1c1759ca0 | |
|
798610a11a | |
|
66fbb738fd | |
|
ae6fa3560e | |
|
35ffe05910 | |
|
ceee726210 | |
|
13a06ccad9 | |
|
68d08ac953 | |
|
dc4200d805 | |
|
a88792f4bd | |
|
e4fd02e9f1 | |
|
3e1bf74cda | |
|
a8973a8664 | |
|
3c43a0bd10 | |
|
9ed6527f0a | |
|
9694ee4354 | |
|
d6d4d93e4e | |
|
490faeace5 | |
|
f692ac3136 | |
|
c8659fb914 | |
|
200d46726c | |
|
3173ed132e | |
|
a22fe2a52f | |
|
c7befef47e | |
|
5f99c4d9b8 | |
|
ba355ee23f | |
|
ac9382a5c1 | |
|
c123dddac8 | |
|
daf4f4da3e | |
|
be9ba585dd | |
|
09c3cfe7ad | |
|
16921fe90f | |
|
289f11b28f | |
|
612199f0c6 | |
|
71a4f7a631 | |
|
1fbfdfd4f7 | |
|
5efc8884d1 | |
|
c0bccb7c76 | |
|
369020d878 | |
|
34fd4f8a8d | |
|
f0308d29b4 | |
|
4e0b9150b9 | |
|
34e60f82ec | |
|
7a48ce2e38 | |
|
69da591e38 | |
|
6c34d837ef | |
|
ecdccfb86c | |
|
132ccc8e81 | |
|
86750df7c2 | |
|
19c6f4db70 | |
|
e4ecee1976 | |
|
0dde605376 | |
|
09bbaa9c32 | |
|
f004190ea1 | |
|
7ee2a4dcda | |
|
f39c93e0f4 | |
|
8c22b69431 | |
|
030599e642 | |
|
66f9e5187f | |
|
5f59f438ac | |
|
0b89667d18 | |
|
338430f835 | |
|
a45f174cfc | |
|
273c3f5266 | |
|
b4623de861 | |
|
7d81d8e12a | |
|
da09edb63c | |
|
e4f8f268e8 | |
|
ecf4224d46 | |
|
0dd173c51f | |
|
2813b15c58 | |
|
0f60f182e8 | |
|
aed88103f1 | |
|
13b65d06e9 | |
|
098d5ba360 | |
|
ea591f5ac3 | |
|
d5346f245c | |
|
8dac51c9e7 | |
|
775a138ad6 | |
|
6c34d567d4 | |
|
4c92bd54a2 | |
|
a1bc4f865d | |
|
a78ccb3612 | |
|
1626b85f13 | |
|
9aa45e0cee | |
|
7ed9c90baf | |
|
e37dcfc3ff | |
|
c0e4778fc0 | |
|
fda3234b64 | |
|
d4aa574df2 | |
|
8cd92dbaba | |
|
325938f2d2 | |
|
10378c8b11 | |
|
629774d3ed | |
|
014cd7d6ac | |
|
bc72eff716 | |
|
ce1d2c5c53 | |
|
b48e438737 | |
|
e14c3e4ae5 | |
|
58211f19f7 | |
|
b193e6e392 | |
|
1667bae479 | |
|
953ca74ac9 | |
|
c2e0519a1f | |
|
c74bf4e01c | |
|
e8623bdba7 | |
|
e4d293eb51 | |
|
e14b3921e8 | |
|
b0ede7b09c | |
|
af3f5c5882 | |
|
9769baefb9 | |
|
855780c9c1 | |
|
74c95a2486 | |
|
31fe5c1534 | |
|
08b2255c33 | |
|
faa2923c51 | |
|
9444907a56 | |
|
7dd7c14868 | |
|
d1166d3495 | |
|
008a62bb90 | |
|
b6235fb72d | |
|
e1385935b8 | |
|
ef98b9612e | |
|
58017fd35e | |
|
d0e447d8e1 | |
|
b32c1c5bd4 | |
|
f24dbb13f7 | |
|
45c3445b2a | |
|
84eb1e338f | |
|
689a066c90 | |
|
5b031d63cc | |
|
ce82f2a81b | |
|
f262b500fd | |
|
c225435bea | |
|
1002df5e13 | |
|
18630b6c78 | |
|
0fba03ef7a | |
|
e9eddcc6d3 | |
|
3704fa72a9 | |
|
552b530e0b | |
|
30e04d41fa | |
|
bdaa44eb23 | |
|
9f639dd892 | |
|
e3c396e324 | |
|
73a120de57 | |
|
34b265af34 | |
|
d88694fbd1 | |
|
07900633cb | |
|
bf157f81f8 | |
|
e31cf7b137 | |
|
07b7a42624 | |
|
27dcab4ba5 | |
|
aec1c74025 | |
|
a5aadf719a | |
|
698c8b067c | |
|
d04bb3a5b0 | |
|
b3653a0aff | |
|
95829b8991 | |
|
fdd522a951 | |
|
5326e106f0 | |
|
ed94dff2cd | |
|
65e4aa3c5e | |
|
fb498567b4 | |
|
76bf7b7e77 | |
|
5210373640 | |
|
e43459d86d | |
|
eeab0ab06f | |
|
be3b1ad382 | |
|
0d276ac19f | |
|
e2ef1de56a | |
|
d4a00d4f20 | |
|
188340e3e9 | |
|
e56fb7de12 | |
|
2bb82faa7b | |
|
79ffff83cb | |
|
e9922b0da7 | |
|
d8e9d550dc | |
|
8283e091cd | |
|
1bcf025f67 | |
|
574b25418f | |
|
9ad24ea2c7 | |
|
f58f6cd208 | |
|
af7c925522 | |
|
c4311bd207 | |
|
ba1e0b3146 | |
|
8b33e32e3d | |
|
568fbe8437 | |
|
2077606ba3 | |
|
9ff6b0bde4 | |
|
469ba765fd | |
|
adbe770d74 | |
|
4ce2aca621 | |
|
e0fa1d2898 | |
|
608e129d8f | |
|
cae2cad3a5 | |
|
594c1b6583 | |
|
c3c53894c3 | |
|
c9da164ae6 | |
|
c3a3774cf1 | |
|
c9edf4072e | |
|
a8f7a9f270 | |
|
09b7fb8814 | |
|
5953ba1261 | |
|
ed99195ed6 | |
|
db83d7fe0b | |
|
471ab88240 | |
|
1bf525de79 | |
|
c2b2b0b3df | |
|
e14abd4ea5 | |
|
55586431bd | |
|
967fe3fbc7 | |
|
5fd18a117f | |
|
fd51f17ec1 | |
|
d605f87d6d | |
|
1ccff37f96 | |
|
d573b5d00f | |
|
6b538a5d4e | |
|
e6fbdd434a | |
|
6e30b17476 | |
|
da422bb452 | |
|
e992542b57 | |
|
6dc23ca804 | |
|
339e243472 | |
|
b5e4ef628b | |
|
c27b4beb6d | |
|
f116c9264c | |
|
75095b2573 | |
|
af2226183f | |
|
8ec3f36293 | |
|
d4aeca09f5 | |
|
b610240ce3 | |
|
aec734d822 | |
|
7fc7947bc3 | |
|
343e0f226c | |
|
9fd58e318f | |
|
7cc8ec6315 | |
|
a50e83a5c3 | |
|
c658f9717b | |
|
22157af0e5 | |
|
d229082e26 | |
|
4906ebb182 | |
|
a7adf55137 | |
|
6e57ca6e6c | |
|
948f634d8f | |
|
00fc95a16a | |
|
a5fd95c982 | |
|
6dbe19abbd | |
|
e8b55acc2b | |
|
706bf35086 | |
|
2e0ff3d14c | |
|
429777eb5d | |
|
07317328f1 | |
|
e6ab24db7f | |
|
d88e0dda02 | |
|
a117c0c056 | |
|
83e520784b | |
|
92e63b5991 | |
|
ff4af1b398 | |
|
e98f0c09ba | |
|
f601956af9 | |
|
2415e30efe | |
|
6163859ae8 | |
|
b586bd9231 | |
|
7b6805491c | |
|
8578b779e2 | |
|
a83ef25930 | |
|
2bf62c0180 | |
|
005e4e0259 | |
|
0b34230dd5 | |
|
95056202c6 | |
|
d77d8f2992 | |
|
0dc032e76f | |
|
ed3111fec1 | |
|
49fbd5cf4b | |
|
d1c8a8bfe2 | |
|
80fc2c206e | |
|
7f0a62683e | |
|
df6320d147 | |
|
0842910049 | |
|
de33c801a5 | |
|
cc6c049522 | |
|
0afa7cc6ff | |
|
169ff4f9fe | |
|
2f959a773c | |
|
7bd6e85b29 | |
|
0127a75e05 | |
|
b6d8069610 | |
|
6de3fabc9f | |
|
9b2d0be950 | |
|
5c85ab20f5 | |
|
2fceddf00e | |
|
92745daa62 | |
|
d8b2940b3c | |
|
5560df8cba | |
|
8f9c5bbabb | |
|
d00659c642 | |
|
3fba7a9e86 | |
|
6e3260c43c | |
|
9a9b06d24d | |
|
7bc362cfdc | |
|
341af62275 | |
|
2d5de8d0fa | |
|
1c9447854f | |
|
c9ffa67ec4 | |
|
6809f445eb | |
|
04e8d009d4 | |
|
72f1672634 | |
|
72ad051dd6 | |
|
1471f74d98 | |
|
56122ce0dd | |
|
b1bd8e7424 | |
|
a39a7c6e0f | |
|
011b9e6a46 | |
|
9344c938bb | |
|
51508603fe | |
|
c083db10f0 | |
|
a0abe5c667 | |
|
9c23553e0b | |
|
8603b5b98b | |
|
285516dc10 | |
|
11ddb5e6bf | |
|
78c11c4ceb | |
|
5e300846b2 | |
|
d764b1ab87 | |
|
4412a2b9a4 | |
|
f508ccea7b | |
|
68314853b8 | |
|
20c3b6f13c | |
|
bdbf6b3df9 | |
|
84259052d1 | |
|
c370cfb68a | |
|
bc89bbce56 | |
|
363d01392a | |
|
0b64594d0a | |
|
3e9834e26d | |
|
e7fe0b20dc | |
|
0761e11cc4 | |
|
a859b5f027 | |
|
36dc9081ef | |
|
760d252808 | |
|
0de6fae1f8 | |
|
e7d28a3bf1 | |
|
3344efd552 | |
|
56c592a5d7 | |
|
1123fd22cb | |
|
393a9401b1 | |
|
fd6c80b840 | |
|
3c3609b5fa | |
|
7a33650863 | |
|
a276a05765 | |
|
dd0d0d71ab | |
|
26f070bfd4 | |
|
ca95d61bf8 | |
|
f1aa82a9ae | |
|
aa5c7ec00d | |
|
203116b614 | |
|
383be3edec | |
|
50ba775915 | |
|
68bf26b08f | |
|
fe09e416bd | |
|
f9199e56c5 | |
|
870ce7ce75 | |
|
7c5e1385cf | |
|
d8ce535dc3 | |
|
49526abf27 | |
|
62a5f8888e | |
|
ebdd9038b7 | |
|
c8629cea5d | |
|
70f79831de | |
|
cbb029d905 | |
|
a0b0f9460f | |
|
220f0b00f1 | |
|
e349323507 | |
|
93badb28ac | |
|
d52844ae67 | |
|
ed97725ea1 | |
|
fae6181a54 | |
|
a14577dfa4 | |
|
7b51a90328 | |
|
94d8373a9e | |
|
1150ce519f | |
|
c266c431f5 | |
|
9ce0dbfbd0 | |
|
84a54c5447 | |
|
bd2a900a37 | |
|
f27c3a8da9 | |
|
caf2bad7b6 | |
|
b409875246 | |
|
a3b928467e | |
|
f69d0c8ddc | |
|
8e94c930ee | |
|
9c541692ee | |
|
a79b87ce7e | |
|
f0ab65348f | |
|
01fa5b3afd | |
|
228f0f5700 | |
|
d4933875ed | |
|
c7ce65ddb6 | |
|
badc7ff781 | |
|
04e2b5f2aa | |
|
01cd8dd08c | |
|
255c258d7b | |
|
3d9e172a85 | |
|
5ef09ca911 | |
|
4249a104c2 | |
|
da15eb9afe | |
|
031e658210 | |
|
c4e5400ed6 | |
|
744a689454 | |
|
c769807582 | |
|
f89af6309c | |
|
857754c384 | |
|
fc0edbd222 | |
|
a7f78c5668 | |
|
8acd791fa7 | |
|
344daabaa7 | |
|
1e8008bded | |
|
7b1ea68a5e | |
|
4181ece888 | |
|
cb8534b79b | |
|
fb8bbe91d7 | |
|
e88df52f95 | |
|
f83f214d39 | |
|
06b5503348 | |
|
adf4c720b2 | |
|
ee5f2d1aa5 | |
|
100f2bf8e6 | |
|
efe02543c0 | |
|
7ecb76f31a | |
|
fc4f167caa | |
|
f3968f11ab | |
|
5c1cabf237 | |
|
487915e9e4 | |
|
e842171ba0 | |
|
49f592d67d | |
|
7e7bc2271e | |
|
e7511e6eeb | |
|
f604a5ae7d | |
|
21d5ec6761 | |
|
c2aceee61d | |
|
422c088d62 | |
|
312f96a5a4 | |
|
98ba606d4f | |
|
8648fe265a | |
|
e34e2763cf | |
|
7d87c16e03 | |
|
144fad7706 | |
|
c2ad21a380 | |
|
1a7aa6505d | |
|
2a2bab3d28 | |
|
45f70a8b26 | |
|
c2d7a7be62 | |
|
a8a1d30310 | |
|
19fefd773f | |
|
2cb1195f18 | |
|
f13d2a5449 | |
|
adc587f222 | |
|
71098097c0 | |
|
4f68b251ac | |
|
b951f24297 | |
|
d6d20e49fa | |
|
989a15bf3a | |
|
f89f620909 | |
|
f564d9092a | |
|
8c16b56476 | |
|
eb38b4b598 | |
|
041b77bd32 | |
|
a210b30d36 | |
|
a451a892ae | |
|
1da1f28cef | |
|
4ad49bbd84 | |
|
4dccc1ce24 | |
|
4085da817d | |
|
aadb16b3d4 | |
|
8f2a94fd7e | |
|
047958a49c | |
|
ffc46f977d | |
|
4adec4bbc6 | |
|
bf51d6600e | |
|
1e917af560 | |
|
6956e6074d | |
|
ed783da499 | |
|
2b50e4af1a | |
|
944efce3a6 | |
|
59536256e3 | |
|
112d53b10a | |
|
b51cb3219f | |
|
0c258bb704 | |
|
438d014389 | |
|
3abcfb7063 | |
|
d8ea2538de | |
|
cff4a54d6a | |
|
5919888571 | |
|
2d53c0a2a6 | |
|
33571a312d | |
|
06e5a875be | |
|
1550882948 | |
|
35bfe697a5 | |
|
db35f6a857 | |
|
2513756583 | |
|
925ea7393c | |
|
f01b5e5cfe | |
|
d39915d392 | |
|
0fb464c24a | |
|
6b650e785e | |
|
589411702a | |
|
f984abbe2e | |
|
9dea1cf665 | |
|
9b587abc13 |
|
@ -0,0 +1,38 @@
|
|||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: github-actions
|
||||
directory: /
|
||||
schedule:
|
||||
interval: weekly
|
||||
groups:
|
||||
actions-all:
|
||||
patterns:
|
||||
- "*"
|
||||
labels:
|
||||
- "ok-to-test"
|
||||
|
||||
- package-ecosystem: docker
|
||||
directory: /
|
||||
schedule:
|
||||
interval: weekly
|
||||
labels:
|
||||
- "ok-to-test"
|
||||
|
||||
- package-ecosystem: gomod
|
||||
directories:
|
||||
- /
|
||||
- /test
|
||||
schedule:
|
||||
interval: weekly
|
||||
ignore:
|
||||
- dependency-name: "*"
|
||||
update-types:
|
||||
- "version-update:semver-major"
|
||||
- "version-update:semver-minor"
|
||||
groups:
|
||||
k8s:
|
||||
patterns:
|
||||
- "k8s.io/*"
|
||||
- "sigs.k8s.io/*"
|
||||
labels:
|
||||
- "ok-to-test"
|
|
@ -0,0 +1,78 @@
|
|||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ["master"]
|
||||
pull_request:
|
||||
# The branches below must be a subset of the branches above
|
||||
branches: ["master"]
|
||||
schedule:
|
||||
- cron: "0 0 * * 1"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: Analyze
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: ["go"]
|
||||
# CodeQL supports [ $supported-codeql-languages ]
|
||||
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
|
||||
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
|
||||
|
||||
# If the Autobuild fails above, remove it and uncomment the following three lines.
|
||||
# modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
|
||||
|
||||
# - run: |
|
||||
# echo "Run, Build Application using script"
|
||||
# ./location_of_script_within_repo/buildscript.sh
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
|
||||
with:
|
||||
category: "/language:${{matrix.language}}"
|
|
@ -0,0 +1,27 @@
|
|||
# Dependency Review Action
|
||||
#
|
||||
# This Action will scan dependency manifest files that change as part of a Pull Request,
|
||||
# surfacing known-vulnerable versions of the packages declared or updated in the PR.
|
||||
# Once installed, if the workflow run is marked as required,
|
||||
# PRs introducing known-vulnerable packages will be blocked from merging.
|
||||
#
|
||||
# Source repository: https://github.com/actions/dependency-review-action
|
||||
name: 'Dependency Review'
|
||||
on: [pull_request]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
dependency-review:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: 'Checkout Repository'
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- name: 'Dependency Review'
|
||||
uses: actions/dependency-review-action@da24556b548a50705dd671f47852072ea4c105d9 # v4.7.1
|
|
@ -0,0 +1,76 @@
|
|||
# This workflow uses actions that are not certified by GitHub. They are provided
|
||||
# by a third-party and are governed by separate terms of service, privacy
|
||||
# policy, and support documentation.
|
||||
|
||||
name: Scorecard supply-chain security
|
||||
on:
|
||||
# For Branch-Protection check. Only the default branch is supported. See
|
||||
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
|
||||
branch_protection_rule:
|
||||
# To guarantee Maintained check is occasionally updated. See
|
||||
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
|
||||
schedule:
|
||||
- cron: '20 7 * * 2'
|
||||
push:
|
||||
branches: ["master"]
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analysis:
|
||||
name: Scorecard analysis
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed to upload the results to code-scanning dashboard.
|
||||
security-events: write
|
||||
# Needed to publish results and get a badge (see publish_results below).
|
||||
id-token: write
|
||||
contents: read
|
||||
actions: read
|
||||
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: "Checkout code"
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: "Run analysis"
|
||||
uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2
|
||||
with:
|
||||
results_file: results.sarif
|
||||
results_format: sarif
|
||||
# (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
|
||||
# - you want to enable the Branch-Protection check on a *public* repository, or
|
||||
# - you are installing Scorecards on a *private* repository
|
||||
# To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
|
||||
# repo_token: ${{ secrets.SCORECARD_TOKEN }}
|
||||
|
||||
# Public repositories:
|
||||
# - Publish results to OpenSSF REST API for easy access by consumers
|
||||
# - Allows the repository to include the Scorecard badge.
|
||||
# - See https://github.com/ossf/scorecard-action#publishing-results.
|
||||
# For private repositories:
|
||||
# - `publish_results` will always be set to `false`, regardless
|
||||
# of the value entered here.
|
||||
publish_results: true
|
||||
|
||||
# Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
|
||||
# format to the repository Actions tab.
|
||||
- name: "Upload artifact"
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: SARIF file
|
||||
path: results.sarif
|
||||
retention-days: 5
|
||||
|
||||
# Upload the results to GitHub's code scanning dashboard.
|
||||
- name: "Upload to code-scanning"
|
||||
uses: github/codeql-action/upload-sarif@ce28f5bb42b7a9f2c824e633a3f6ee835bab6858 # v3.29.0
|
||||
with:
|
||||
sarif_file: results.sarif
|
|
@ -0,0 +1,33 @@
|
|||
name: tag-release
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- version.txt
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
tag:
|
||||
if: ${{ github.repository == 'kubernetes/node-problem-detector' }}
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- run: /usr/bin/git config --global user.email actions@github.com
|
||||
- run: /usr/bin/git config --global user.name 'GitHub Actions Release Tagger'
|
||||
- run: hack/tag-release.sh
|
||||
id: tag_release
|
||||
outputs:
|
||||
release_tag: ${{ steps.tag_release.outputs.release_tag }}
|
|
@ -1,7 +1,10 @@
|
|||
/bin/
|
||||
/Dockerfile
|
||||
/test/bin/
|
||||
/*.tar.gz
|
||||
/*.tar.gz*
|
||||
ci.env
|
||||
pr.env
|
||||
junit*.xml
|
||||
debug.test
|
||||
/output/
|
||||
coverage.out
|
||||
.idea/
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
repos:
|
||||
- repo: https://github.com/gitleaks/gitleaks
|
||||
rev: v8.16.3
|
||||
hooks:
|
||||
- id: gitleaks
|
||||
- repo: https://github.com/golangci/golangci-lint
|
||||
rev: v1.52.2
|
||||
hooks:
|
||||
- id: golangci-lint
|
||||
- repo: https://github.com/jumanjihouse/pre-commit-hooks
|
||||
rev: 3.0.0
|
||||
hooks:
|
||||
- id: shellcheck
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.4.0
|
||||
hooks:
|
||||
- id: end-of-file-fixer
|
||||
- id: trailing-whitespace
|
33
.travis.yml
33
.travis.yml
|
@ -1,33 +0,0 @@
|
|||
os:
|
||||
- linux
|
||||
sudo: required
|
||||
dist: trusty
|
||||
language: go
|
||||
go:
|
||||
- "1.11"
|
||||
- "1.12"
|
||||
- master
|
||||
env:
|
||||
- GO111MODULE=on
|
||||
services:
|
||||
- docker
|
||||
before_install:
|
||||
- sudo apt-get -qq update
|
||||
- sudo apt-get install -y libsystemd-journal-dev
|
||||
install:
|
||||
- mkdir -p $HOME/gopath/src/k8s.io
|
||||
- mv $TRAVIS_BUILD_DIR $HOME/gopath/src/k8s.io/node-problem-detector
|
||||
- cd $HOME/gopath/src/k8s.io/node-problem-detector
|
||||
script:
|
||||
- make
|
||||
- make test
|
||||
- make clean && BUILD_TAGS="disable_custom_plugin_monitor" make
|
||||
- BUILD_TAGS="disable_custom_plugin_monitor" make test
|
||||
- make clean && BUILD_TAGS="disable_system_log_monitor" make
|
||||
- BUILD_TAGS="disable_system_log_monitor" make test
|
||||
- make clean && BUILD_TAGS="disable_system_stats_monitor" make
|
||||
- BUILD_TAGS="disable_system_stats_monitor" make test
|
||||
- make clean && BUILD_TAGS="disable_stackdriver_exporter" make
|
||||
- BUILD_TAGS="disable_stackdriver_exporter" make test
|
||||
- make clean && ENABLE_JOURNALD=0 make
|
||||
- ENABLE_JOURNALD=0 make test
|
344
CHANGELOG.md
344
CHANGELOG.md
|
@ -1,29 +1,363 @@
|
|||
# Change Log
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](http://keepachangelog.com/)
|
||||
and this project adheres to [Semantic Versioning](http://semver.org/).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.8.7] - 2021-02-18
|
||||
|
||||
### Added
|
||||
- Add travis presubmit test.
|
||||
|
||||
- Add memory read error.
|
||||
- Add support for building NPD on MacOS.
|
||||
- Add support for containerd health check.
|
||||
- Add metric for `per-cpu`, `per-stage` timing.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fix an issue that kubelet may be restarted by NPD health checker unexpectedly. Make log pattern check configurable in health checker.
|
||||
- Exit the process when there is a timeout in plugin daemon
|
||||
|
||||
|
||||
## [0.8.6] - 2021-01-22
|
||||
|
||||
### Added
|
||||
|
||||
- Windows build now supported.
|
||||
- Added metrics to retrieve stats such as `procs_running` and `procs_blocked`.
|
||||
- Added metrics to retrieve network stats.
|
||||
- Added metric to retrieve guest OS features such as unknown modules, ktd,
|
||||
and kernel integrity.
|
||||
|
||||
### Changed
|
||||
|
||||
- Print result's message when status is unknown.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed custom plugin command timeout when the command spawns a long running
|
||||
child process.
|
||||
|
||||
## [0.8.5] - 2020-11-18
|
||||
|
||||
### Added
|
||||
|
||||
- Added problem detection for buffer I/O error.
|
||||
- Added CPU load average metrics support.
|
||||
- Added kubelet apiserver connection check in health checker.
|
||||
|
||||
### Changed
|
||||
|
||||
- Will now catch hung task with pattern like `tasks airflow scheduler: *`.
|
||||
- Better handling to avoid duplicating disk bytes metrics used on fstype and
|
||||
mount types.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed the deployment yaml to prevent NPD from scheduling onto windows nodes.
|
||||
- Fixed memory unit for `/proc/meminfo` metrics.
|
||||
- Fixed OOMKilling detection for new linux kernel v5.1+.
|
||||
|
||||
## [0.8.4] - 2020-09-01
|
||||
|
||||
### Added
|
||||
|
||||
- Added `FSType` and `MountOption` as labels to the metric `disk_usage_bytes`.
|
||||
- Added `DockerContainerStartupFailure` event in `docker-monitor.json` to
|
||||
detect docker issue
|
||||
[docker/for-linux#647](https://github.com/docker/for-linux/issues/647).
|
||||
|
||||
### Fixed
|
||||
|
||||
- Reduced log spam generated by the custom plugin monitor.
|
||||
|
||||
## [0.8.3] - 2020-06-30
|
||||
|
||||
### Added
|
||||
|
||||
- `health-checker` binary now included in the docker image.
|
||||
|
||||
### Changed
|
||||
|
||||
- `--enable-repair=true` is now the default for docker and kubelet health
|
||||
checker.
|
||||
- Custom plugin will now only generate status update log when the status has
|
||||
changed.
|
||||
- Limit the size of custom plugin output to 4kb, extra output will be drained
|
||||
and discarded.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fix a race condition that services may be killed periodically when
|
||||
`--enable-repair=true`, and systemd service restart time equals the health
|
||||
check period.
|
||||
|
||||
## [0.8.2] - 2020-05-28
|
||||
|
||||
### Added
|
||||
|
||||
- Added an `--event-namespace` flag to make event namespace configurable.
|
||||
- Added `rhel` support in OS version.
|
||||
- Added `health-checker` as a custom plugin. The `health-checker` can be used
|
||||
to monitor healthiness of kubelet, docker and CRI container runtimes (e.g.
|
||||
`containerd`, `cri-o`) and restart them if they are not healthy if
|
||||
`enable-repair` is turned on.
|
||||
|
||||
### Fixed
|
||||
|
||||
- [#420](https://github.com/kubernetes/node-problem-detector/issues/420) Added
|
||||
missing `lsblk` to the container image.
|
||||
|
||||
## [0.8.1] - 2020-02-25
|
||||
|
||||
### Added
|
||||
|
||||
- Added `host_uptime` metrics for CentOS.
|
||||
- Now collecting a lot more useful CPU/disk/memory metrics.
|
||||
|
||||
### Changed
|
||||
|
||||
- Improved `network_problem.sh` to support `nf_conntrack` and report error when
|
||||
conntrack table is 90% full.
|
||||
|
||||
### Fixed
|
||||
|
||||
- [#366](https://github.com/kubernetes/node-problem-detector/issues/366) Fixed
|
||||
building with `ENABLE_JOURNALD=0`.
|
||||
- Fixed the first 0 value metrics reported for `disk_avg_queue_len`.
|
||||
- Fix a few metric units for disk metrics and the calculation for
|
||||
`disk_avg_queue_len`.
|
||||
|
||||
## [0.8.0] - 2019-10-30
|
||||
|
||||
### Added
|
||||
|
||||
- Added Stackdriver exporter.
|
||||
- Added a `k8s-exporter-heartbeat-period` flag to make the heart beat period
|
||||
of K8s exporter configurable.
|
||||
|
||||
### Changed
|
||||
|
||||
- Changed the default heart beat period of K8s exporter from `1m` to `5m`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Addressed an issue with a panic caused by closing an already closed channel.
|
||||
- Fixed several potential busy loops.
|
||||
|
||||
## [0.7.1] - 2019-08-27
|
||||
|
||||
### Added
|
||||
|
||||
- Added validation that permanent problems have a preset default condition.
|
||||
|
||||
### Changed
|
||||
|
||||
- Empty LogPath will now use journald's default path.
|
||||
- Systemd monitor now looks back 5 minutes.
|
||||
- Bumped base image to `registry.k8s.io/debian-base-amd64:1.0.0`.
|
||||
- Updated the detection method for docker overlay2 issues.
|
||||
- Moved NPD into the kube-system namespace.
|
||||
|
||||
### Fixed
|
||||
|
||||
- [#202](https://github.com/kubernetes/node-problem-detector/issues/202) Fixed
|
||||
an issue that condition can't switch back to false for custom plugins.
|
||||
|
||||
## [0.7.0] - 2019-07-25
|
||||
|
||||
### Added
|
||||
|
||||
- Added a system stats monitor is added into NPD as a new problem daemon. It
|
||||
collects useful node problem related system stats with OpenCensus such as
|
||||
`disk/io_time`, `disk/weighted_io` and `disk/avg_queue_len`.
|
||||
- Besides node condition and events, problems detected by existing problem
|
||||
daemons are also collected into OpenCensus as metrics:
|
||||
`problem_counter{reason="PROBLEM_REASON"} xxx` for events and
|
||||
`problem_gauge{reason="PROBLEM_REASON",type="PROBLEM_TYPE"} 1 or 0` for
|
||||
conditions.
|
||||
- A Prometheus exporter is added to export all OpenCensus metrics collected by
|
||||
NPD through Prometheus.
|
||||
- A plugin system for problem daemons is added. Problem daemons can be disabled
|
||||
at compile time with build tags, such as `disable_system_stats_monitor`,
|
||||
`disable_system_log_monitor` and `disable_custom_plugin_monitor`.
|
||||
- A problem exporter interface is added. The original kubernetes problem
|
||||
reporting logic was moved into `k8sexporter`. Prometheus support is
|
||||
implemented as `prometheusexporter`.
|
||||
|
||||
## [0.6.6] - 2019-08-13
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated the detection method for docker overlay2 issues.
|
||||
|
||||
### Fixed
|
||||
|
||||
- [#202](https://github.com/kubernetes/node-problem-detector/issues/202) Fixed
|
||||
an issue that condition can't switch back to false for custom plugins.
|
||||
|
||||
## [0.6.5] - 2019-07-24
|
||||
|
||||
### Fixed
|
||||
|
||||
- [#295](https://github.com/kubernetes/node-problem-detector/issues/295) Added
|
||||
configurable timeout to wait for apiserver to be ready before starting
|
||||
problem detection.
|
||||
|
||||
## [0.6.4] - 2019-06-13
|
||||
|
||||
### Changed
|
||||
|
||||
- Switch from godep to go modules resulting in bumping versions of many
|
||||
dependencies.
|
||||
- Changed custom plugin handling to run immediately on startup.
|
||||
|
||||
### Fixed
|
||||
|
||||
- [#269](https://github.com/kubernetes/node-problem-detector/issues/269) Fixed
|
||||
issue so that using `--version` should not require monitors to be specified.
|
||||
|
||||
## [0.6.3] - 2019-04-05
|
||||
|
||||
### Added
|
||||
|
||||
- Added better handling and reporting when missing required flags.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Disabled glog writing to files for the log-counter plugin.
|
||||
|
||||
## [0.6.2] - 2019-01-07
|
||||
|
||||
### Added
|
||||
|
||||
- Added resource limits to NPD deployment.
|
||||
- Added log-counter to dockerfile.
|
||||
- Added `enable_message_change_based_condition_update` option to enable
|
||||
condition update when messages change for custom plugin.
|
||||
|
||||
### Fixed
|
||||
|
||||
- [#232](https://github.com/kubernetes/node-problem-detector/issues/232) Explicitly
|
||||
include libsystemd0 in the image.
|
||||
|
||||
## [0.6.1] - 2018-11-28
|
||||
|
||||
### Changed
|
||||
|
||||
- Bumped base image to `registry.k8s.io/debian-base-amd64:0.4.0`.
|
||||
|
||||
## [0.6.0] - 2018-11-27
|
||||
|
||||
### Added
|
||||
|
||||
- Added ConfigMap for NPD config.
|
||||
- Added readonly filesystem detection.
|
||||
- Added frequent kubelet/docker restart detection.
|
||||
- Added corrupt docker overlay2 issue detection.
|
||||
|
||||
### Changed
|
||||
|
||||
- Bumped Kubernetes client version to 1.9.
|
||||
- Updated OOMKilling pattern to support new kernel.
|
||||
|
||||
## [0.5.0] - 2018-06-22
|
||||
|
||||
### Added
|
||||
|
||||
- Added custom problem detector plugin interface.
|
||||
- Added custom network plugin monitor.
|
||||
- Added a kernel log counter custom problem detector to detect problems which
|
||||
have the same pattern.
|
||||
|
||||
### Changed
|
||||
|
||||
- Changed default port from 10256 to 20256 to avoid conflict with kube-proxy.
|
||||
- Bumped golang version from 1.8 to 1.9.
|
||||
- Bumped base image to `registry.k8s.io/debian-base-amd64:0.3`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an error in the labels applied to the daemonset label selector.
|
||||
|
||||
## [0.4.1] - 2017-06-21
|
||||
|
||||
### Added
|
||||
|
||||
- Added docker image pull error detection.
|
||||
|
||||
## [0.4.0] - 2017-04-31
|
||||
|
||||
### Added
|
||||
|
||||
- Added "kernel log generator" container for test purposes.
|
||||
- Added ABRT adaptor config.
|
||||
|
||||
## [0.3.0] - 2017-03-15
|
||||
|
||||
### Added
|
||||
|
||||
- Added look back support in kernel monitor. Kernel monitor will look back for
|
||||
specified amount of time to detect old problems during each start or restart.
|
||||
- Added support for running node-problem-detector standalone.
|
||||
- Added `-hostname-override` option to provide custom node name.
|
||||
- Added `-port` option to provide custom listening port for service.
|
||||
- Added `-address` option to define binding address.
|
||||
- Added journald support.
|
||||
- Added travis presubmit test.
|
||||
- Added arbitrary system log support.
|
||||
|
||||
### Changed
|
||||
|
||||
- Update kubernetes version to v1.4.0-beta.3
|
||||
|
||||
### Fixed
|
||||
|
||||
- Only change transition timestamp when condition has changed.
|
||||
- [#47](https://github.com/kubernetes/node-problem-detector/issues/47) Don't
|
||||
report KernelDeadlock on `unregister_netdevice` event.
|
||||
- [#48](https://github.com/kubernetes/node-problem-detector/issues/48) Use
|
||||
system boot time instead of "StartPattern".
|
||||
|
||||
## [0.2.0] - 2016-08-23
|
||||
|
||||
### Added
|
||||
- Add look back support in kernel monitor. Kernel monitor will look back for
|
||||
specified amount of time to detect old problems during each start or restart.
|
||||
|
||||
- Add support for some kernel oops detection.
|
||||
|
||||
### Changed
|
||||
|
||||
- Change NPD to get node name from `NODE_NAME` env first before `os.Hostname`,
|
||||
and update the example to get node name from downward api and set `NODE_NAME`.
|
||||
|
||||
## 0.1.0 - 2016-06-09
|
||||
|
||||
### Added
|
||||
|
||||
- Initial version of node problem detector.
|
||||
|
||||
[Unreleased]: https://github.com/kubernetes/node-problem-detector/compare/v0.2...HEAD
|
||||
[0.2.0]: https://github.com/kubernetes/node-problem-detector/compare/v0.1...v0.2
|
||||
[Unreleased]: https://github.com/kubernetes/node-problem-detector/compare/v0.8.6...HEAD
|
||||
[0.8.6]: https://github.com/kubernetes/node-problem-detector/compare/v0.8.5...v0.8.6
|
||||
[0.8.5]: https://github.com/kubernetes/node-problem-detector/compare/v0.8.4...v0.8.5
|
||||
[0.8.4]: https://github.com/kubernetes/node-problem-detector/compare/v0.8.3...v0.8.4
|
||||
[0.8.3]: https://github.com/kubernetes/node-problem-detector/compare/v0.8.2...v0.8.3
|
||||
[0.8.2]: https://github.com/kubernetes/node-problem-detector/compare/v0.8.1...v0.8.2
|
||||
[0.8.1]: https://github.com/kubernetes/node-problem-detector/compare/v0.8.0...v0.8.1
|
||||
[0.8.0]: https://github.com/kubernetes/node-problem-detector/compare/v0.7.0...v0.8.0
|
||||
[0.7.1]: https://github.com/kubernetes/node-problem-detector/compare/v0.7.0...v0.7.1
|
||||
[0.7.0]: https://github.com/kubernetes/node-problem-detector/compare/v0.6.6...v0.7.0
|
||||
[0.6.6]: https://github.com/kubernetes/node-problem-detector/compare/v0.6.5...v0.6.6
|
||||
[0.6.5]: https://github.com/kubernetes/node-problem-detector/compare/v0.6.4...v0.6.5
|
||||
[0.6.4]: https://github.com/kubernetes/node-problem-detector/compare/v0.6.3...v0.6.4
|
||||
[0.6.3]: https://github.com/kubernetes/node-problem-detector/compare/v0.6.2...v0.6.3
|
||||
[0.6.2]: https://github.com/kubernetes/node-problem-detector/compare/v0.6.1...v0.6.2
|
||||
[0.6.1]: https://github.com/kubernetes/node-problem-detector/compare/v0.6.0...v0.6.1
|
||||
[0.6.0]: https://github.com/kubernetes/node-problem-detector/compare/v0.5.0...v0.6.0
|
||||
[0.5.0]: https://github.com/kubernetes/node-problem-detector/compare/v0.4.1...v0.5.0
|
||||
[0.4.1]: https://github.com/kubernetes/node-problem-detector/compare/v0.4.0...v0.4.1
|
||||
[0.4.0]: https://github.com/kubernetes/node-problem-detector/compare/v0.3.0...v0.4.0
|
||||
[0.3.0]: https://github.com/kubernetes/node-problem-detector/compare/v0.2.0...v0.3.0
|
||||
[0.2.0]: https://github.com/kubernetes/node-problem-detector/compare/v0.1.0...v0.2.0
|
||||
|
|
|
@ -14,7 +14,7 @@ If your repo has certain guidelines for contribution, put them here ahead of the
|
|||
|
||||
- [Contributor License Agreement](https://git.k8s.io/community/CLA.md) Kubernetes projects require that you sign a Contributor License Agreement (CLA) before we can accept your pull requests
|
||||
- [Kubernetes Contributor Guide](http://git.k8s.io/community/contributors/guide) - Main contributor documentation, or you can just jump directly to the [contributing section](http://git.k8s.io/community/contributors/guide#contributing)
|
||||
- [Contributor Cheat Sheet](https://git.k8s.io/community/contributors/guide/contributor-cheatsheet.md) - Common resources for existing developers
|
||||
- [Contributor Cheat Sheet](https://git.k8s.io/community/contributors/guide/contributor-cheatsheet/README.md) - Common resources for existing developers
|
||||
|
||||
## Mentorship
|
||||
|
||||
|
@ -28,4 +28,4 @@ Custom Information - if you're copying this template for the first time you can
|
|||
- [Slack channel](https://kubernetes.slack.com/messages/kubernetes-users) - Replace `kubernetes-users` with your slack channel string, this will send users directly to your channel.
|
||||
- [Mailing list](URL)
|
||||
|
||||
-->
|
||||
-->
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
# Copyright 2016 The Kubernetes Authors All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# "builder-base" can be overridden using docker buildx's --build-context flag,
|
||||
# by users who want to use a different image for the builder. E.g. if you need to use an older OS
|
||||
# to avoid dependencies on very recent glibc versions.
|
||||
# E.g. of the param: --build-context builder-base=docker-image://golang:<something>@sha256:<something>
|
||||
# Must override builder-base, not builder, since the latter is referred to later in the file and so must not be
|
||||
# directly replaced. See here, and note that "stage" parameter mentioned there has been renamed to
|
||||
# "build-context": https://github.com/docker/buildx/pull/904#issuecomment-1005871838
|
||||
FROM golang:1.24-bookworm@sha256:00eccd446e023d3cd9566c25a6e6a02b90db3e1e0bbe26a48fc29cd96e800901 as builder-base
|
||||
FROM builder-base as builder
|
||||
LABEL maintainer="Andy Xie <andy.xning@gmail.com>"
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
ENV GOPATH /gopath/
|
||||
ENV PATH $GOPATH/bin:$PATH
|
||||
|
||||
RUN apt-get update --fix-missing && apt-get --yes install libsystemd-dev gcc-aarch64-linux-gnu
|
||||
RUN go version
|
||||
|
||||
COPY . /gopath/src/k8s.io/node-problem-detector/
|
||||
WORKDIR /gopath/src/k8s.io/node-problem-detector
|
||||
RUN GOARCH=${TARGETARCH} make bin/node-problem-detector bin/health-checker bin/log-counter
|
||||
|
||||
FROM --platform=${TARGETPLATFORM} registry.k8s.io/build-image/debian-base:bookworm-v1.0.4@sha256:0a17678966f63e82e9c5e246d9e654836a33e13650a698adefede61bb5ca099e as base
|
||||
|
||||
LABEL maintainer="Random Liu <lantaol@google.com>"
|
||||
|
||||
RUN clean-install util-linux bash libsystemd-dev
|
||||
|
||||
# Avoid symlink of /etc/localtime.
|
||||
RUN test -h /etc/localtime && rm -f /etc/localtime && cp /usr/share/zoneinfo/UTC /etc/localtime || true
|
||||
|
||||
COPY --from=builder /gopath/src/k8s.io/node-problem-detector/bin/node-problem-detector /node-problem-detector
|
||||
|
||||
ARG LOGCOUNTER
|
||||
COPY --from=builder /gopath/src/k8s.io/node-problem-detector/bin/health-checker /gopath/src/k8s.io/node-problem-detector/${LOGCOUNTER} /home/kubernetes/bin/
|
||||
|
||||
COPY --from=builder /gopath/src/k8s.io/node-problem-detector/config/ /config
|
||||
ENTRYPOINT ["/node-problem-detector", "--config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json"]
|
|
@ -1,30 +0,0 @@
|
|||
# Copyright 2016 The Kubernetes Authors All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM @BASEIMAGE@
|
||||
MAINTAINER Random Liu <lantaol@google.com>
|
||||
|
||||
RUN clean-install util-linux libsystemd0 bash
|
||||
|
||||
# Avoid symlink of /etc/localtime.
|
||||
RUN test -h /etc/localtime && rm -f /etc/localtime && cp /usr/share/zoneinfo/UTC /etc/localtime || true
|
||||
|
||||
ADD ./bin/node-problem-detector /node-problem-detector
|
||||
ADD ./bin/health-checker /home/kubernetes/bin/health-checker
|
||||
|
||||
# Below command depends on ENABLE_JOURNAL=1.
|
||||
ADD ./bin/log-counter /home/kubernetes/bin/log-counter
|
||||
|
||||
ADD config /config
|
||||
ENTRYPOINT ["/node-problem-detector", "--system-log-monitors=/config/kernel-monitor.json"]
|
228
Makefile
228
Makefile
|
@ -14,12 +14,20 @@
|
|||
|
||||
# Build the node-problem-detector image.
|
||||
|
||||
.PHONY: all build-container build-tar build push-container push-tar push \
|
||||
clean vet fmt version \
|
||||
Dockerfile build-binaries docker-builder build-in-docker
|
||||
.PHONY: all \
|
||||
vet fmt version test e2e-test \
|
||||
build-binaries build-container build-tar build \
|
||||
docker-builder build-in-docker \
|
||||
push-container push-tar push release clean depup \
|
||||
print-tar-sha-md5
|
||||
|
||||
all: build
|
||||
|
||||
# PLATFORMS is the set of OS_ARCH that NPD can build against.
|
||||
LINUX_PLATFORMS=linux_amd64 linux_arm64
|
||||
DOCKER_PLATFORMS=linux/amd64,linux/arm64
|
||||
PLATFORMS=$(LINUX_PLATFORMS) windows_amd64
|
||||
|
||||
# VERSION is the version of the binary.
|
||||
VERSION?=$(shell if [ -d .git ]; then echo `git describe --tags --dirty`; else echo "UNKNOWN"; fi)
|
||||
|
||||
|
@ -27,7 +35,7 @@ VERSION?=$(shell if [ -d .git ]; then echo `git describe --tags --dirty`; else e
|
|||
TAG?=$(VERSION)
|
||||
|
||||
# REGISTRY is the container registry to push into.
|
||||
REGISTRY?=staging-k8s.gcr.io
|
||||
REGISTRY?=gcr.io/k8s-staging-npd
|
||||
|
||||
# UPLOAD_PATH is the cloud storage path to upload release tar.
|
||||
UPLOAD_PATH?=gs://kubernetes-release
|
||||
|
@ -38,13 +46,19 @@ UPLOAD_PATH:=$(shell echo $(UPLOAD_PATH) | sed '$$s/\/*$$//')
|
|||
PKG:=k8s.io/node-problem-detector
|
||||
|
||||
# PKG_SOURCES are all the go source code.
|
||||
ifeq ($(OS),Windows_NT)
|
||||
PKG_SOURCES:=
|
||||
# TODO: File change detection does not work in Windows.
|
||||
else
|
||||
PKG_SOURCES:=$(shell find pkg cmd -name '*.go')
|
||||
endif
|
||||
|
||||
# PARALLEL specifies the number of parallel test nodes to run for e2e tests.
|
||||
PARALLEL?=3
|
||||
|
||||
NPD_NAME_VERSION?=node-problem-detector-$(VERSION)
|
||||
# TARBALL is the name of release tar. Include binary version by default.
|
||||
TARBALL?=node-problem-detector-$(VERSION).tar.gz
|
||||
TARBALL=$(NPD_NAME_VERSION).tar.gz
|
||||
|
||||
# IMAGE is the image name of the node problem detector container image.
|
||||
IMAGE:=$(REGISTRY)/node-problem-detector:$(TAG)
|
||||
|
@ -53,32 +67,56 @@ IMAGE:=$(REGISTRY)/node-problem-detector:$(TAG)
|
|||
# support needs libsystemd-dev or libsystemd-journal-dev.
|
||||
ENABLE_JOURNALD?=1
|
||||
|
||||
# TODO(random-liu): Support different architectures.
|
||||
# The debian-base:v1.0.0 image built from kubernetes repository is based on
|
||||
# Debian Stretch. It includes systemd 232 with support for both +XZ and +LZ4
|
||||
# compression. +LZ4 is needed on some os distros such as COS.
|
||||
BASEIMAGE:=k8s.gcr.io/debian-base-amd64:v1.0.0
|
||||
ifeq ($(shell go env GOHOSTOS), darwin)
|
||||
ENABLE_JOURNALD=0
|
||||
else ifeq ($(shell go env GOHOSTOS), windows)
|
||||
ENABLE_JOURNALD=0
|
||||
endif
|
||||
|
||||
# Disable cgo by default to make the binary statically linked.
|
||||
CGO_ENABLED:=0
|
||||
|
||||
ifeq ($(GOARCH), arm64)
|
||||
CC:=aarch64-linux-gnu-gcc
|
||||
else
|
||||
CC:=x86_64-linux-gnu-gcc
|
||||
endif
|
||||
|
||||
# Set default Go architecture to AMD64.
|
||||
GOARCH ?= amd64
|
||||
|
||||
# Construct the "-tags" parameter used by "go build".
|
||||
BUILD_TAGS?=
|
||||
|
||||
LINUX_BUILD_TAGS = $(BUILD_TAGS)
|
||||
WINDOWS_BUILD_TAGS = $(BUILD_TAGS)
|
||||
|
||||
ifeq ($(OS),Windows_NT)
|
||||
HOST_PLATFORM_BUILD_TAGS = $(WINDOWS_BUILD_TAGS)
|
||||
else
|
||||
HOST_PLATFORM_BUILD_TAGS = $(LINUX_BUILD_TAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(ENABLE_JOURNALD), 1)
|
||||
# Enable journald build tag.
|
||||
BUILD_TAGS:=$(BUILD_TAGS) journald
|
||||
LINUX_BUILD_TAGS := journald $(BUILD_TAGS)
|
||||
# Enable cgo because sdjournal needs cgo to compile. The binary will be
|
||||
# dynamically linked if CGO_ENABLED is enabled. This is fine because fedora
|
||||
# already has necessary dynamic library. We can not use `-extldflags "-static"`
|
||||
# here, because go-systemd uses dlopen, and dlopen will not work properly in a
|
||||
# statically linked application.
|
||||
CGO_ENABLED:=1
|
||||
LOGCOUNTER=./bin/log-counter
|
||||
else
|
||||
# Hack: Don't copy over log-counter, use a wildcard path that shouldn't match
|
||||
# anything in COPY command.
|
||||
LOGCOUNTER=*dont-include-log-counter
|
||||
endif
|
||||
|
||||
vet:
|
||||
GO111MODULE=on go list -mod vendor -tags "$(BUILD_TAGS)" ./... | \
|
||||
go list -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./... | \
|
||||
grep -v "./vendor/*" | \
|
||||
GO111MODULE=on xargs go vet -mod vendor -tags "$(BUILD_TAGS)"
|
||||
xargs go vet -tags "$(HOST_PLATFORM_BUILD_TAGS)"
|
||||
|
||||
fmt:
|
||||
find . -type f -name "*.go" | grep -v "./vendor/*" | xargs gofmt -s -w -l
|
||||
|
@ -86,76 +124,137 @@ fmt:
|
|||
version:
|
||||
@echo $(VERSION)
|
||||
|
||||
BINARIES = bin/node-problem-detector bin/health-checker test/bin/problem-maker
|
||||
BINARIES_LINUX_ONLY =
|
||||
ifeq ($(ENABLE_JOURNALD), 1)
|
||||
BINARIES_LINUX_ONLY += bin/log-counter
|
||||
endif
|
||||
|
||||
ALL_BINARIES = $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), ./$(binary)) \
|
||||
$(foreach platform, $(LINUX_PLATFORMS), $(foreach binary, $(BINARIES) $(BINARIES_LINUX_ONLY), output/$(platform)/$(binary))) \
|
||||
$(foreach binary, $(BINARIES), output/windows_amd64/$(binary).exe)
|
||||
ALL_TARBALLS = $(foreach platform, $(PLATFORMS), $(NPD_NAME_VERSION)-$(platform).tar.gz)
|
||||
|
||||
output/windows_amd64/bin/%.exe: $(PKG_SOURCES)
|
||||
GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) go build \
|
||||
-o $@ \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(WINDOWS_BUILD_TAGS)" \
|
||||
./cmd/$(subst -,,$*)
|
||||
touch $@
|
||||
|
||||
output/windows_amd64/test/bin/%.exe: $(PKG_SOURCES)
|
||||
cd test && \
|
||||
GOOS=windows GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) go build \
|
||||
-o ../$@ \
|
||||
-tags "$(WINDOWS_BUILD_TAGS)" \
|
||||
./e2e/$(subst -,,$*)
|
||||
|
||||
output/linux_amd64/bin/%: $(PKG_SOURCES)
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) \
|
||||
CC=x86_64-linux-gnu-gcc go build \
|
||||
-o $@ \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./cmd/$(subst -,,$*)
|
||||
touch $@
|
||||
|
||||
output/linux_amd64/test/bin/%: $(PKG_SOURCES)
|
||||
cd test && \
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=$(CGO_ENABLED) \
|
||||
CC=x86_64-linux-gnu-gcc go build \
|
||||
-o ../$@ \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./e2e/$(subst -,,$*)
|
||||
|
||||
output/linux_arm64/bin/%: $(PKG_SOURCES)
|
||||
GOOS=linux GOARCH=arm64 CGO_ENABLED=$(CGO_ENABLED) \
|
||||
CC=aarch64-linux-gnu-gcc go build \
|
||||
-o $@ \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./cmd/$(subst -,,$*)
|
||||
touch $@
|
||||
|
||||
output/linux_arm64/test/bin/%: $(PKG_SOURCES)
|
||||
cd test && \
|
||||
GOOS=linux GOARCH=arm64 CGO_ENABLED=$(CGO_ENABLED) \
|
||||
CC=aarch64-linux-gnu-gcc go build \
|
||||
-o ../$@ \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./e2e/$(subst -,,$*)
|
||||
|
||||
# In the future these targets should be deprecated.
|
||||
./bin/log-counter: $(PKG_SOURCES)
|
||||
ifeq ($(ENABLE_JOURNALD), 1)
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
|
||||
-o bin/log-counter \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(BUILD_TAGS)" \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
cmd/logcounter/log_counter.go
|
||||
else
|
||||
echo "Warning: log-counter requires journald, skipping."
|
||||
endif
|
||||
|
||||
./bin/node-problem-detector: $(PKG_SOURCES)
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
|
||||
-o bin/node-problem-detector \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(BUILD_TAGS)" \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./cmd/nodeproblemdetector
|
||||
|
||||
./test/bin/problem-maker: $(PKG_SOURCES)
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
-o test/bin/problem-maker \
|
||||
-tags "$(BUILD_TAGS)" \
|
||||
./test/e2e/problemmaker/problem_maker.go
|
||||
cd test && \
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
|
||||
-o bin/problem-maker \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
./e2e/problemmaker/problem_maker.go
|
||||
|
||||
./bin/health-checker: $(PKG_SOURCES)
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
|
||||
-mod vendor \
|
||||
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GOARCH=$(GOARCH) CC=$(CC) go build \
|
||||
-o bin/health-checker \
|
||||
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
|
||||
-tags "$(BUILD_TAGS)" \
|
||||
-tags "$(LINUX_BUILD_TAGS)" \
|
||||
cmd/healthchecker/health_checker.go
|
||||
|
||||
Dockerfile: Dockerfile.in
|
||||
sed -e 's|@BASEIMAGE@|$(BASEIMAGE)|g' $< >$@
|
||||
ifneq ($(ENABLE_JOURNALD), 1)
|
||||
sed -i '/Below command depends on ENABLE_JOURNAL=1/,+2d' $@
|
||||
echo "Warning: log-counter requires journald, skipping."
|
||||
endif
|
||||
|
||||
|
||||
test: vet fmt
|
||||
GO111MODULE=on go test -mod vendor -timeout=1m -v -race -short -tags "$(BUILD_TAGS)" ./...
|
||||
go test -timeout=1m -v -race -short -tags "$(HOST_PLATFORM_BUILD_TAGS)" ./...
|
||||
|
||||
e2e-test: vet fmt build-tar
|
||||
GO111MODULE=on ginkgo -nodes=$(PARALLEL) -mod vendor -timeout=10m -v -tags "$(BUILD_TAGS)" -stream \
|
||||
./test/e2e/metriconly/... -- \
|
||||
cd test && \
|
||||
go run github.com/onsi/ginkgo/ginkgo -nodes=$(PARALLEL) -timeout=10m -v -tags "$(HOST_PLATFORM_BUILD_TAGS)" -stream \
|
||||
./e2e/metriconly/... -- \
|
||||
-project=$(PROJECT) -zone=$(ZONE) \
|
||||
-image=$(VM_IMAGE) -image-family=$(IMAGE_FAMILY) -image-project=$(IMAGE_PROJECT) \
|
||||
-ssh-user=$(SSH_USER) -ssh-key=$(SSH_KEY) \
|
||||
-npd-build-tar=`pwd`/$(TARBALL) \
|
||||
-npd-build-tar=`pwd`/../$(TARBALL) \
|
||||
-boskos-project-type=$(BOSKOS_PROJECT_TYPE) -job-name=$(JOB_NAME) \
|
||||
-artifacts-dir=$(ARTIFACTS)
|
||||
|
||||
build-binaries: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker
|
||||
$(NPD_NAME_VERSION)-%.tar.gz: $(ALL_BINARIES) test/e2e-install.sh
|
||||
mkdir -p output/$*/ output/$*/test/
|
||||
cp -r config/ output/$*/
|
||||
cp test/e2e-install.sh output/$*/test/e2e-install.sh
|
||||
(cd output/$*/ && tar -zcvf ../../$@ *)
|
||||
sha512sum $@ > $@.sha512
|
||||
|
||||
build-container: build-binaries Dockerfile
|
||||
docker build -t $(IMAGE) .
|
||||
build-binaries: $(ALL_BINARIES)
|
||||
|
||||
build-tar: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker
|
||||
build-container: clean Dockerfile
|
||||
docker buildx create --platform $(DOCKER_PLATFORMS) --use
|
||||
docker buildx build --platform $(DOCKER_PLATFORMS) -t $(IMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .
|
||||
|
||||
$(TARBALL): ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker
|
||||
tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker
|
||||
sha1sum $(TARBALL)
|
||||
md5sum $(TARBALL)
|
||||
|
||||
build-tar: $(TARBALL) $(ALL_TARBALLS)
|
||||
|
||||
build: build-container build-tar
|
||||
|
||||
docker-builder:
|
||||
docker build -t npd-builder ./builder
|
||||
docker build -t npd-builder . --target=builder
|
||||
|
||||
build-in-docker: clean docker-builder
|
||||
docker run \
|
||||
|
@ -163,17 +262,46 @@ build-in-docker: clean docker-builder
|
|||
-c 'cd /gopath/src/k8s.io/node-problem-detector/ && make build-binaries'
|
||||
|
||||
push-container: build-container
|
||||
# So we can push to docker hub by setting REGISTRY
|
||||
ifneq (,$(findstring gcr.io,$(REGISTRY)))
|
||||
gcloud auth configure-docker
|
||||
docker push $(IMAGE)
|
||||
endif
|
||||
# Build should be cached from build-container
|
||||
docker buildx build --push --platform $(DOCKER_PLATFORMS) -t $(IMAGE) --build-arg LOGCOUNTER=$(LOGCOUNTER) .
|
||||
|
||||
push-tar: build-tar
|
||||
gsutil cp $(TARBALL) $(UPLOAD_PATH)/node-problem-detector/
|
||||
gsutil cp node-problem-detector-$(VERSION)-*.tar.gz* $(UPLOAD_PATH)/node-problem-detector/
|
||||
|
||||
# `make push` is used by presubmit and CI jobs.
|
||||
push: push-container push-tar
|
||||
|
||||
# `make release` is used when releasing a new NPD version.
|
||||
release: push-container build-tar print-tar-sha-md5
|
||||
|
||||
print-tar-sha-md5: build-tar
|
||||
./hack/print-tar-sha-md5.sh $(VERSION)
|
||||
|
||||
coverage.out:
|
||||
rm -f coverage.out
|
||||
go test -coverprofile=coverage.out -timeout=1m -v -short ./...
|
||||
|
||||
clean:
|
||||
rm -f bin/health-checker
|
||||
rm -f bin/log-counter
|
||||
rm -f bin/node-problem-detector
|
||||
rm -f test/bin/problem-maker
|
||||
rm -f node-problem-detector-*.tar.gz
|
||||
rm -rf bin/
|
||||
rm -rf test/bin/
|
||||
rm -f node-problem-detector-*.tar.gz*
|
||||
rm -rf output/
|
||||
rm -f coverage.out
|
||||
|
||||
.PHONY: gomod
|
||||
gomod:
|
||||
go mod tidy
|
||||
go mod vendor
|
||||
cd test; go mod tidy
|
||||
|
||||
.PHONY: goget
|
||||
goget:
|
||||
go get $(shell go list -f '{{if not (or .Main .Indirect)}}{{.Path}}{{end}}' -mod=mod -m all)
|
||||
|
||||
.PHONY: depup
|
||||
depup: goget gomod
|
||||
|
|
10
OWNERS
10
OWNERS
|
@ -1,12 +1,14 @@
|
|||
reviewers:
|
||||
- Random-Liu
|
||||
- dchen1107
|
||||
- sig-node-reviewers
|
||||
- andyxning
|
||||
- wangzhen127
|
||||
- xueweiz
|
||||
- vteratipally
|
||||
- mmiranda96
|
||||
- hakman
|
||||
approvers:
|
||||
- Random-Liu
|
||||
- dchen1107
|
||||
- sig-node-approvers
|
||||
- andyxning
|
||||
- wangzhen127
|
||||
- xueweiz
|
||||
- vteratipally
|
|
@ -0,0 +1,19 @@
|
|||
aliases:
|
||||
sig-node-approvers:
|
||||
- Random-Liu
|
||||
- dchen1107
|
||||
- derekwaynecarr
|
||||
- yujuhong
|
||||
- sjenning
|
||||
- mrunalp
|
||||
- klueska
|
||||
- SergeyKanzhelev
|
||||
- tallclair
|
||||
sig-node-reviewers:
|
||||
- Random-Liu
|
||||
- dchen1107
|
||||
- derekwaynecarr
|
||||
- yujuhong
|
||||
- sjenning
|
||||
- mrunalp
|
||||
- klueska
|
177
README.md
177
README.md
|
@ -3,31 +3,31 @@
|
|||
[](https://travis-ci.org/kubernetes/node-problem-detector) [](https://goreportcard.com/report/github.com/kubernetes/node-problem-detector)
|
||||
|
||||
node-problem-detector aims to make various node problems visible to the upstream
|
||||
layers in cluster management stack.
|
||||
It is a daemon which runs on each node, detects node
|
||||
layers in the cluster management stack.
|
||||
It is a daemon that runs on each node, detects node
|
||||
problems and reports them to apiserver.
|
||||
node-problem-detector can either run as a
|
||||
[DaemonSet](http://kubernetes.io/docs/admin/daemons/) or run standalone.
|
||||
[DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) or run standalone.
|
||||
Now it is running as a
|
||||
[Kubernetes Addon](https://github.com/kubernetes/kubernetes/tree/master/cluster/addons)
|
||||
enabled by default in the GCE cluster.
|
||||
|
||||
enabled by default in the GKE cluster. It is also enabled by default in AKS as part of the
|
||||
[AKS Linux Extension](https://learn.microsoft.com/en-us/azure/aks/faq#what-is-the-purpose-of-the-aks-linux-extension-i-see-installed-on-my-linux-vmss-instances).
|
||||
# Background
|
||||
|
||||
There are tons of node problems that could possibly affect the pods running on the
|
||||
node, such as:
|
||||
* Infrastructure daemon issues: ntp service down;
|
||||
* Hardware issues: Bad cpu, memory or disk;
|
||||
* Hardware issues: Bad CPU, memory or disk;
|
||||
* Kernel issues: Kernel deadlock, corrupted file system;
|
||||
* Container runtime issues: Unresponsive runtime daemon;
|
||||
* ...
|
||||
|
||||
Currently these problems are invisible to the upstream layers in cluster management
|
||||
Currently, these problems are invisible to the upstream layers in the cluster management
|
||||
stack, so Kubernetes will continue scheduling pods to the bad nodes.
|
||||
|
||||
To solve this problem, we introduced this new daemon **node-problem-detector** to
|
||||
collect node problems from various daemons and make them visible to the upstream
|
||||
layers. Once upstream layers have the visibility to those problems, we can discuss the
|
||||
layers. Once upstream layers have visibility to those problems, we can discuss the
|
||||
[remedy system](#remedy-systems).
|
||||
|
||||
# Problem API
|
||||
|
@ -41,11 +41,11 @@ should be reported as `Event`.
|
|||
|
||||
# Problem Daemon
|
||||
|
||||
A problem daemon is a sub-daemon of node-problem-detector. It monitors a specific
|
||||
kind of node problems and reports them to node-problem-detector.
|
||||
A problem daemon is a sub-daemon of node-problem-detector. It monitors specific
|
||||
kinds of node problems and reports them to node-problem-detector.
|
||||
|
||||
A problem daemon could be:
|
||||
* A tiny daemon designed for dedicated usecase of Kubernetes.
|
||||
* A tiny daemon designed for dedicated Kubernetes use-cases.
|
||||
* An existing node health monitoring daemon integrated with node-problem-detector.
|
||||
|
||||
Currently, a problem daemon is running as a goroutine in the node-problem-detector
|
||||
|
@ -57,24 +57,24 @@ corresponding build tags. If they are disabled at compilation time, then all the
|
|||
build dependencies, global variables and background goroutines will be trimmed out
|
||||
of the compiled executable.
|
||||
|
||||
List of supported problem daemons:
|
||||
List of supported problem daemon types:
|
||||
|
||||
| Problem Daemon | NodeCondition | Description | Disabling Build Tag |
|
||||
|----------------|:---------------:|:------------|:--------------------|
|
||||
| [KernelMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json) | KernelDeadlock | A system log monitor monitors kernel log and reports problems and metrics according to predefined rules. | disable_system_log_monitor
|
||||
| [AbrtAdaptor](https://github.com/kubernetes/node-problem-detector/blob/master/config/abrt-adaptor.json) | None | Monitor ABRT log messages and report them further. ABRT (Automatic Bug Report Tool) is health monitoring daemon able to catch kernel problems as well as application crashes of various kinds occurred on the host. For more information visit the [link](https://github.com/abrt). | disable_system_log_monitor
|
||||
| [CustomPluginMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json) | On-demand(According to users configuration) | A custom plugin monitor for node-problem-detector to invoke and check various node problems with user defined check scripts. See proposal [here](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#). | disable_custom_plugin_monitor
|
||||
| [SystemStatsMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json) | None(Could be added in the future) | A system stats monitor for node-problem-detector to collect various health-related system stats as metrics. See proposal [here](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit). | disable_system_stats_monitor
|
||||
| Problem Daemon Types | NodeCondition | Description | Configs | Disabling Build Tag |
|
||||
|----------------|:---------------:|:------------|:--------|:--------------------|
|
||||
| [SystemLogMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemlogmonitor) | KernelDeadlock ReadonlyFilesystem FrequentKubeletRestart FrequentDockerRestart FrequentContainerdRestart | A system log monitor monitors system log and reports problems and metrics according to predefined rules. | [filelog](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor-filelog.json), [kmsg](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json), [kernel](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor-counter.json) [abrt](https://github.com/kubernetes/node-problem-detector/blob/master/config/abrt-adaptor.json) [systemd](https://github.com/kubernetes/node-problem-detector/blob/master/config/systemd-monitor-counter.json) | disable_system_log_monitor
|
||||
| [SystemStatsMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor) | None(Could be added in the future) | A system stats monitor for node-problem-detector to collect various health-related system stats as metrics. See the proposal [here](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit). | [system-stats-monitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json) | disable_system_stats_monitor
|
||||
| [CustomPluginMonitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/custompluginmonitor) | On-demand(According to users configuration), existing example: NTPProblem | A custom plugin monitor for node-problem-detector to invoke and check various node problems with user-defined check scripts. See the proposal [here](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#). | [example](https://github.com/kubernetes/node-problem-detector/blob/4ad49bbd84b8ced45ac825eac01ec93d9235935e/config/custom-plugin-monitor.json) | disable_custom_plugin_monitor
|
||||
| [HealthChecker](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/healthchecker) | KubeletUnhealthy ContainerRuntimeUnhealthy| A health checker for node-problem-detector to check kubelet and container runtime health. | [kubelet](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-kubelet.json) [docker](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-docker.json) [containerd](https://github.com/kubernetes/node-problem-detector/blob/master/config/health-checker-containerd.json) |
|
||||
|
||||
# Exporter
|
||||
|
||||
An exporter is a component of node-problem-detector. It reports node problems and/or metrics to
|
||||
certain back end. Some of them can be disable at compile time using a build tag. List of supported exporters:
|
||||
certain backends. Some of them can be disabled at compile-time using a build tag. List of supported exporters:
|
||||
|
||||
| Exporter |Description | Disabling Build Tag |
|
||||
|----------|:-----------|:--------------------|
|
||||
| Kubernetes exporter | Kubernetes exporter reports node problems to Kubernetes API server: temporary problems get reported as Events, and permanent problems get reported as Node Conditions. |
|
||||
| Prometheus exporter | Prometheus exporter reports node problems and metrics locally as Prometheus metrics |
|
||||
| Kubernetes exporter | Kubernetes exporter reports node problems to Kubernetes API server: temporary problems get reported as Events, and permanent problems get reported as Node Conditions. |
|
||||
| Prometheus exporter | Prometheus exporter reports node problems and metrics locally as Prometheus metrics |
|
||||
| [Stackdriver exporter](https://github.com/kubernetes/node-problem-detector/blob/master/config/exporter/stackdriver-exporter.json) | Stackdriver exporter reports node problems and metrics to Stackdriver Monitoring API. | disable_stackdriver_exporter
|
||||
|
||||
# Usage
|
||||
|
@ -86,37 +86,42 @@ certain back end. Some of them can be disable at compile time using a build tag.
|
|||
|
||||
#### For System Log Monitor
|
||||
|
||||
* `--config.system-log-monitor`: List of paths to system log monitor configuration files, comma separated, e.g.
|
||||
* `--config.system-log-monitor`: List of paths to system log monitor configuration files, comma-separated, e.g.
|
||||
[config/kernel-monitor.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json).
|
||||
Node problem detector will start a separate log monitor for each configuration. You can
|
||||
use different log monitors to monitor different system log.
|
||||
use different log monitors to monitor different system logs.
|
||||
|
||||
#### For System Stats Monitor
|
||||
|
||||
* `--config.system-stats-monitor`: List of paths to system stats monitor config files, comma separated, e.g.
|
||||
* `--config.system-stats-monitor`: List of paths to system stats monitor config files, comma-separated, e.g.
|
||||
[config/system-stats-monitor.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json).
|
||||
Node problem detector will start a separate system stats monitor for each configuration. You can
|
||||
use different system stats monitors to monitor different problem-related system stats.
|
||||
|
||||
#### For Custom Plugin Monitor
|
||||
|
||||
* `--config.custom-plugin-monitor`: List of paths to custom plugin monitor config files, comma separated, e.g.
|
||||
* `--config.custom-plugin-monitor`: List of paths to custom plugin monitor config files, comma-separated, e.g.
|
||||
[config/custom-plugin-monitor.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json).
|
||||
Node problem detector will start a separate custom plugin monitor for each configuration. You can
|
||||
Node problem detector will start a separate custom plugin monitor for each configuration. You can
|
||||
use different custom plugin monitors to monitor different node problems.
|
||||
|
||||
|
||||
#### For Health Checkers
|
||||
|
||||
Health checkers are configured as custom plugins, using the config/health-checker-*.json config files.
|
||||
|
||||
#### For Kubernetes exporter
|
||||
|
||||
* `--enable-k8s-exporter`: Enables reporting to Kubernetes API server, default to `true`.
|
||||
* `--apiserver-override`: A URI parameter used to customize how node-problem-detector
|
||||
connects the apiserver. This is ignored if `--enable-k8s-exporter` is `false`. The format is same as the
|
||||
connects the apiserver. This is ignored if `--enable-k8s-exporter` is `false`. The format is the same as the
|
||||
[`source`](https://github.com/kubernetes/heapster/blob/master/docs/source-configuration.md#kubernetes)
|
||||
flag of [Heapster](https://github.com/kubernetes/heapster).
|
||||
For example, to run without auth, use the following config:
|
||||
```
|
||||
http://APISERVER_IP:APISERVER_PORT?inClusterConfig=false
|
||||
```
|
||||
Refer [heapster docs](https://github.com/kubernetes/heapster/blob/master/docs/source-configuration.md#kubernetes) for a complete list of available options.
|
||||
Refer to [heapster docs](https://github.com/kubernetes/heapster/blob/master/docs/source-configuration.md#kubernetes) for a complete list of available options.
|
||||
* `--address`: The address to bind the node problem detector server.
|
||||
* `--port`: The port to bind the node problem detector server. Use 0 to disable.
|
||||
|
||||
|
@ -127,22 +132,22 @@ For example, to run without auth, use the following config:
|
|||
|
||||
#### For Stackdriver exporter
|
||||
|
||||
* `--exporter.stackdriver`: Path to a Stackdriver exporter config file, e.g. [config/exporter/stackdriver-exporter.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/exporter/stackdriver-exporter.json), default to empty string. Set to empty string to disable.
|
||||
* `--exporter.stackdriver`: Path to a Stackdriver exporter config file, e.g. [config/exporter/stackdriver-exporter.json](https://github.com/kubernetes/node-problem-detector/blob/master/config/exporter/stackdriver-exporter.json), defaults to empty string. Set to empty string to disable.
|
||||
|
||||
### Deprecated Flags
|
||||
|
||||
* `--system-log-monitors`: List of paths to system log monitor config files, comma separated. This option is deprecated, replaced by `--config.system-log-monitor`, and will be removed. NPD will panic if both `--system-log-monitors` and `--config.system-log-monitor` are set.
|
||||
* `--system-log-monitors`: List of paths to system log monitor config files, comma-separated. This option is deprecated, replaced by `--config.system-log-monitor`, and will be removed. NPD will panic if both `--system-log-monitors` and `--config.system-log-monitor` are set.
|
||||
|
||||
* `--custom-plugin-monitors`: List of paths to custom plugin monitor config files, comma separated. This option is deprecated, replaced by `--config.custom-plugin-monitor`, and will be removed. NPD will panic if both `--custom-plugin-monitors` and `--config.custom-plugin-monitor` are set.
|
||||
* `--custom-plugin-monitors`: List of paths to custom plugin monitor config files, comma-separated. This option is deprecated, replaced by `--config.custom-plugin-monitor`, and will be removed. NPD will panic if both `--custom-plugin-monitors` and `--config.custom-plugin-monitor` are set.
|
||||
|
||||
## Build Image
|
||||
|
||||
* `go get` or `git clone` node-problem-detector repo into `$GOPATH/src/k8s.io` or `$GOROOT/src/k8s.io`
|
||||
with one of the below directions:
|
||||
* `cd $GOPATH/src/k8s.io && git clone git@github.com:kubernetes/node-problem-detector.git`
|
||||
* `cd $GOPATH/src/k8s.io && go get k8s.io/node-problem-detector`
|
||||
* Install development dependencies for `libsystemd` and the ARM GCC toolchain
|
||||
* Debian/Ubuntu: `apt install libsystemd-dev gcc-aarch64-linux-gnu`
|
||||
|
||||
* run `make` in the top directory. It will:
|
||||
* `git clone git@github.com:kubernetes/node-problem-detector.git`
|
||||
|
||||
* Run `make` in the top directory. It will:
|
||||
* Build the binary.
|
||||
* Build the docker image. The binary and `config/` are copied into the docker image.
|
||||
|
||||
|
@ -153,39 +158,39 @@ before running `make`. For example:
|
|||
|
||||
`BUILD_TAGS="disable_custom_plugin_monitor disable_system_stats_monitor" make`
|
||||
|
||||
Above command will compile the node-problem-detector without [Custom Plugin Monitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/custompluginmonitor)
|
||||
The above command will compile the node-problem-detector without [Custom Plugin Monitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/custompluginmonitor)
|
||||
and [System Stats Monitor](https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor).
|
||||
Check out the [Problem Daemon](https://github.com/kubernetes/node-problem-detector#problem-daemon) section
|
||||
to see how to disable each problem daemon during compilation time.
|
||||
|
||||
**Note**:
|
||||
By default node-problem-detector will be built with systemd support with `make` command. This requires systemd develop files.
|
||||
You should download the systemd develop files first. For Ubuntu, `libsystemd-journal-dev` package should
|
||||
be installed. For Debian, `libsystemd-dev` package should be installed.
|
||||
|
||||
## Push Image
|
||||
|
||||
`make push` uploads the docker image to registry. By default, the image will be uploaded to
|
||||
`make push` uploads the docker image to a registry. By default, the image will be uploaded to
|
||||
`staging-k8s.gcr.io`. It's easy to modify the `Makefile` to push the image
|
||||
to another registry.
|
||||
|
||||
## Installation
|
||||
|
||||
The easiest way to install node-problem-detector into your cluster is to use the [Helm](https://helm.sh/) [chart](https://github.com/helm/charts/tree/master/stable/node-problem-detector):
|
||||
The easiest way to install node-problem-detector into your cluster is to use the [Helm](https://helm.sh/) [chart](https://github.com/deliveryhero/helm-charts/tree/master/stable/node-problem-detector):
|
||||
|
||||
```
|
||||
helm install stable/node-problem-detector
|
||||
helm repo add deliveryhero https://charts.deliveryhero.io/
|
||||
helm install --generate-name deliveryhero/node-problem-detector
|
||||
```
|
||||
|
||||
Or alternatively, to install node-problem-detector manually:
|
||||
Alternatively, to install node-problem-detector manually:
|
||||
|
||||
1. Edit [node-problem-detector.yaml](deployment/node-problem-detector.yaml) to fit your environment. Set `log` volume to your system log directory (used by SystemLogMonitor). You can use a ConfigMap to overwrite the `config` directory inside the pod.
|
||||
|
||||
2. Edit [node-problem-detector-config.yaml](deployment/node-problem-detector-config.yaml) to configure node-problem-detector.
|
||||
|
||||
3. Create the ConfigMap with `kubectl create -f node-problem-detector-config.yaml`.
|
||||
3. Edit [rbac.yaml](deployment/rbac.yaml) to fit your environment.
|
||||
|
||||
3. Create the DaemonSet with `kubectl create -f node-problem-detector.yaml`.
|
||||
4. Create the ServiceAccount and ClusterRoleBinding with `kubectl create -f rbac.yaml`.
|
||||
|
||||
4. Create the ConfigMap with `kubectl create -f node-problem-detector-config.yaml`.
|
||||
|
||||
5. Create the DaemonSet with `kubectl create -f node-problem-detector.yaml`.
|
||||
|
||||
## Start Standalone
|
||||
|
||||
|
@ -199,6 +204,42 @@ node-problem-detector --apiserver-override=http://APISERVER_IP:APISERVER_INSECUR
|
|||
|
||||
For more scenarios, see [here](https://github.com/kubernetes/heapster/blob/master/docs/source-configuration.md#kubernetes)
|
||||
|
||||
## Windows
|
||||
|
||||
Node Problem Detector has preliminary support Windows. Most of the functionality has not been tested but filelog plugin works.
|
||||
|
||||
Follow [Issue #461](https://github.com/kubernetes/node-problem-detector/issues/461) for development status of Windows support.
|
||||
|
||||
### Development
|
||||
|
||||
To develop NPD on Windows you'll need to setup your Windows machine for Go development. Install the following tools:
|
||||
|
||||
* [Git for Windows](https://git-scm.com/)
|
||||
* [Go](https://golang.org/)
|
||||
* [Visual Studio Code](https://code.visualstudio.com/)
|
||||
* [Make](http://gnuwin32.sourceforge.net/packages/make.htm)
|
||||
* [mingw-64 WinBuilds](http://mingw-w64.org/downloads)
|
||||
* Tested with x86-64 Windows Native mode.
|
||||
* Add the `$InstallDir\bin` to [Windows `PATH` variable](https://answers.microsoft.com/en-us/windows/forum/windows_10-other_settings-winpc/adding-path-variable/97300613-20cb-4d85-8d0e-cc9d3549ba23).
|
||||
|
||||
```powershell
|
||||
# Run these commands in the node-problem-detector directory.
|
||||
|
||||
# Build in MINGW64 Window
|
||||
make clean ENABLE_JOURNALD=0 build-binaries
|
||||
|
||||
# Test in MINGW64 Window
|
||||
make test
|
||||
|
||||
# Run with containerd log monitoring enabled in Command Prompt. (Assumes containerd is installed.)
|
||||
%CD%\output\windows_amd64\bin\node-problem-detector.exe --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=%CD%\config\windows-containerd-monitor-filelog.json --config.system-stats-monitor=config\windows-system-stats-monitor.json
|
||||
|
||||
# Configure NPD to run as a Windows Service
|
||||
sc.exe create NodeProblemDetector binpath= "%CD%\node-problem-detector.exe [FLAGS]" start= demand
|
||||
sc.exe failure NodeProblemDetector reset= 0 actions= restart/10000
|
||||
sc.exe start NodeProblemDetector
|
||||
```
|
||||
|
||||
## Try It Out
|
||||
|
||||
You can try node-problem-detector in a running cluster by injecting messages to the logs that node-problem-detector is watching. For example, Let's assume node-problem-detector is using [KernelMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json). On your workstation, run ```kubectl get events -w```. On the node, run ```sudo sh -c "echo 'kernel: BUG: unable to handle kernel NULL pointer dereference at TESTING' >> /dev/kmsg"```. Then you should see the ```KernelOops``` event.
|
||||
|
@ -215,7 +256,7 @@ For example, to test [KernelMonitor](https://github.com/kubernetes/node-problem-
|
|||
6. ```sudo sh -c "echo 'kernel: INFO: task docker:20744 blocked for more than 120 seconds.' >> /dev/kmsg"```
|
||||
7. You can see ```DockerHung``` event and condition in the node-problem-detector log.
|
||||
8. You can see ```DockerHung``` condition at [http://127.0.0.1:20256/conditions](http://127.0.0.1:20256/conditions).
|
||||
9. You can see disk related system metrics in Prometheus format at [http://127.0.0.1:20257/metrics](http://127.0.0.1:20257/metrics).
|
||||
9. You can see disk-related system metrics in Prometheus format at [http://127.0.0.1:20257/metrics](http://127.0.0.1:20257/metrics).
|
||||
|
||||
**Note**:
|
||||
- You can see more rule examples under [test/kernel_log_generator/problems](https://github.com/kubernetes/node-problem-detector/tree/master/test/kernel_log_generator/problems).
|
||||
|
@ -227,9 +268,9 @@ For example, to test [KernelMonitor](https://github.com/kubernetes/node-problem-
|
|||
node-problem-detector uses [go modules](https://github.com/golang/go/wiki/Modules)
|
||||
to manage dependencies. Therefore, building node-problem-detector requires
|
||||
golang 1.11+. It still uses vendoring. See the
|
||||
[Kubernetes go modules KEP](https://github.com/kubernetes/enhancements/blob/master/keps/sig-architecture/2019-03-19-go-modules.md#alternatives-to-vendoring-using-go-modules)
|
||||
[Kubernetes go modules KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-architecture/917-go-modules#alternatives-to-vendoring-using-go-modules)
|
||||
for the design decisions. To add a new dependency, update [go.mod](go.mod) and
|
||||
run `GO111MODULE=on go mod vendor`.
|
||||
run `go mod vendor`.
|
||||
|
||||
# Remedy Systems
|
||||
|
||||
|
@ -238,41 +279,41 @@ detected by the node-problem-detector. Remedy systems observe events and/or node
|
|||
conditions emitted by the node-problem-detector and take action to return the
|
||||
Kubernetes cluster to a healthy state. The following remedy systems exist:
|
||||
|
||||
* [**Draino**](https://github.com/planetlabs/draino) automatically drains Kubernetes
|
||||
nodes based on labels and node conditions. Nodes that match _all_ of the supplied
|
||||
labels and _any_ of the supplied node conditions will be prevented from accepting
|
||||
new pods (aka 'cordoned') immediately, and
|
||||
[drained](https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/)
|
||||
after a configurable time. Draino can be used in conjunction with the
|
||||
[Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler)
|
||||
to automatically terminate drained nodes. Refer to
|
||||
[this issue](https://github.com/kubernetes/node-problem-detector/issues/199)
|
||||
for an example production use case for Draino.
|
||||
* [**Descheduler**](https://github.com/kubernetes-sigs/descheduler) strategy RemovePodsViolatingNodeTaints
|
||||
evicts pods violating NoSchedule taints on nodes. The k8s scheduler's TaintNodesByCondition feature must
|
||||
be enabled. The [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler)
|
||||
can be used to automatically terminate drained nodes.
|
||||
* [**mediK8S**](https://github.com/medik8s) is an umbrella project for automatic remediation
|
||||
system build on [Node Health Check Operator (NHC)](https://github.com/medik8s/node-healthcheck-operator) that monitors
|
||||
node conditions and delegates remediation to external remediators using the Remediation API.[Poison-Pill](https://github.com/medik8s/poison-pill)
|
||||
is a remediator that will reboot the node and make sure all statefull workloads are rescheduled. NHC supports conditionally remediating if the cluster
|
||||
has enough healthy capacity, or manually pausing any action to minimze cluster disruption.
|
||||
* [**MachineHealthCheck**](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-health-check) of [Cluster API](https://cluster-api.sigs.k8s.io/) are responsible for remediating unhealthy Machines.
|
||||
|
||||
# Testing
|
||||
|
||||
NPD is tested via unit tests, [NPD e2e tests](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/README.md), Kubernetes e2e tests and Kubernetes nodes e2e tests. Prow handles the [pre-submit tests](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/node-problem-detector/node-problem-detector-presubmits.yaml) and [CI tests](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/node-problem-detector/node-problem-detector-ci.yaml).
|
||||
|
||||
CI test results can be found at below:
|
||||
1. [Unit tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-test)
|
||||
2. [NPD e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-test)
|
||||
3. [Kubernetes e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-kubernetes-gce-gci)
|
||||
4. [Kubernetes nodes e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-node)
|
||||
CI test results can be found below:
|
||||
1. [Unit tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-test)
|
||||
2. [NPD e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-test)
|
||||
3. [Kubernetes e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-kubernetes-gce-gci)
|
||||
4. [Kubernetes nodes e2e tests](https://testgrid.k8s.io/sig-node-node-problem-detector#ci-npd-e2e-node)
|
||||
|
||||
## Running tests
|
||||
|
||||
Unit test is ran via `make test`.
|
||||
Unit tests are run via `make test`.
|
||||
|
||||
See [NPD e2e test documentation](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/README.md) for how to setup and run NPD e2e tests.
|
||||
See [NPD e2e test documentation](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/README.md) for how to set up and run NPD e2e tests.
|
||||
|
||||
## Problem Maker
|
||||
|
||||
[Problem maker](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/problemmaker/README.md) is a program used in NPD e2e tests to generate/simulate node problems. It is ONLY intended to be used by NPD e2e tests. Please do NOT run it on your workstation, as it could cause real node problems.
|
||||
|
||||
# Compatibility
|
||||
|
||||
Node problem detector's architecture has been fairly stable. Recent versions (v0.8.13+) should be able to work with any supported kubernetes versions.
|
||||
|
||||
# Docs
|
||||
|
||||
* [Custom plugin monitor](docs/custom_plugin_monitor.md)
|
||||
|
|
|
@ -1,25 +0,0 @@
|
|||
# Copyright 2018 The Kubernetes Authors. All rights reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM golang:1.11.0
|
||||
LABEL maintainer="Andy Xie <andy.xning@gmail.com>"
|
||||
|
||||
ENV GOPATH /gopath/
|
||||
ENV PATH $GOPATH/bin:$PATH
|
||||
|
||||
RUN apt-get update && apt-get --yes install libsystemd-dev
|
||||
RUN go version
|
||||
RUN go get github.com/tools/godep
|
||||
RUN godep version
|
||||
CMD ["/bin/bash"]
|
|
@ -0,0 +1,26 @@
|
|||
# See https://cloud.google.com/cloud-build/docs/build-config
|
||||
|
||||
# this must be specified in seconds. If omitted, defaults to 600s (10 mins)
|
||||
timeout: 3600s
|
||||
options:
|
||||
# job builds a multi-arch docker image for amd64 and arm64
|
||||
machineType: E2_HIGHCPU_8
|
||||
steps:
|
||||
- name: 'gcr.io/k8s-staging-test-infra/gcb-docker-gcloud:v20230623-56e06d7c18'
|
||||
entrypoint: bash
|
||||
env:
|
||||
- PROW_GIT_TAG=$_GIT_TAG
|
||||
- PULL_BASE_REF=$_PULL_BASE_REF
|
||||
- VERSION=$_PULL_BASE_REF
|
||||
- DOCKER_CLI_EXPERIMENTAL=enabled
|
||||
args:
|
||||
- -c
|
||||
- |
|
||||
echo "Building/Pushing NPD containers"
|
||||
apk add musl-dev gcc
|
||||
make push-container
|
||||
substitutions:
|
||||
# _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and
|
||||
# can be used as a substitution
|
||||
_GIT_TAG: 'PLACE_HOLDER'
|
||||
_PULL_BASE_REF: 'master'
|
|
@ -23,17 +23,24 @@ import (
|
|||
|
||||
"github.com/spf13/pflag"
|
||||
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Set glog flag so that it does not log to files.
|
||||
if err := flag.Set("logtostderr", "true"); err != nil {
|
||||
fmt.Printf("Failed to set logtostderr=true: %v", err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
|
||||
klog.InitFlags(klogFlags)
|
||||
klogFlags.VisitAll(func(f *flag.Flag) {
|
||||
switch f.Name {
|
||||
case "v", "vmodule", "logtostderr":
|
||||
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
|
||||
}
|
||||
})
|
||||
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
|
||||
pflag.CommandLine.MarkHidden("vmodule")
|
||||
pflag.CommandLine.MarkHidden("logtostderr")
|
||||
|
||||
hco := options.NewHealthCheckerOptions()
|
||||
hco.AddFlags(pflag.CommandLine)
|
||||
|
@ -49,8 +56,13 @@ func main() {
|
|||
fmt.Println(err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
if !hc.CheckHealth() {
|
||||
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair)
|
||||
healthy, err := hc.CheckHealth()
|
||||
if err != nil {
|
||||
fmt.Printf("error checking %v health: %v\n", hco.Component, err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
if !healthy {
|
||||
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.Service, hco.EnableRepair)
|
||||
os.Exit(int(types.NonOK))
|
||||
}
|
||||
os.Exit(int(types.OK))
|
||||
|
|
|
@ -19,6 +19,7 @@ package options
|
|||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"time"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
|
@ -34,41 +35,57 @@ func NewHealthCheckerOptions() *HealthCheckerOptions {
|
|||
// HealthCheckerOptions are the options used to configure the health checker.
|
||||
type HealthCheckerOptions struct {
|
||||
Component string
|
||||
SystemdService string
|
||||
Service string
|
||||
EnableRepair bool
|
||||
CriCtlPath string
|
||||
CriSocketPath string
|
||||
CriTimeout time.Duration
|
||||
CoolDownTime time.Duration
|
||||
LoopBackTime time.Duration
|
||||
HealthCheckTimeout time.Duration
|
||||
LogPatterns types.LogPatternFlag
|
||||
}
|
||||
|
||||
// AddFlags adds health checker command line options to pflag.
|
||||
func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
|
||||
fs.StringVar(&hco.Component, "component", types.KubeletComponent,
|
||||
"The component to check health for. Supports kubelet, docker and cri")
|
||||
fs.StringVar(&hco.SystemdService, "systemd-service", "",
|
||||
"The underlying systemd service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
|
||||
"The component to check health for. Supports kubelet, docker, kube-proxy, and cri")
|
||||
// Deprecated: For backward compatibility on linux environment. Going forward "service" will be used instead of systemd-service
|
||||
if runtime.GOOS == "linux" {
|
||||
fs.MarkDeprecated("systemd-service", "please use --service flag instead")
|
||||
fs.StringVar(&hco.Service, "systemd-service", "",
|
||||
"The underlying service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
|
||||
}
|
||||
fs.StringVar(&hco.Service, "service", "",
|
||||
"The underlying service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
|
||||
fs.BoolVar(&hco.EnableRepair, "enable-repair", true, "Flag to enable/disable repair attempt for the component.")
|
||||
fs.StringVar(&hco.CriCtlPath, "crictl-path", types.DefaultCriCtl,
|
||||
"The path to the crictl binary. This is used to check health of cri component.")
|
||||
fs.StringVar(&hco.CriSocketPath, "cri-socket-path", types.DefaultCriSocketPath,
|
||||
"The path to the cri socket. Used with crictl to specify the socket path.")
|
||||
fs.DurationVar(&hco.CriTimeout, "cri-timeout", types.DefaultCriTimeout,
|
||||
"The duration to wait for crictl to run.")
|
||||
fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
|
||||
"The duration to wait for the service to be up before attempting repair.")
|
||||
fs.DurationVar(&hco.LoopBackTime, "loopback-time", types.DefaultLoopBackTime,
|
||||
"The duration to loop back, if it is 0, health-check will check from start time.")
|
||||
fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
|
||||
"The time to wait before marking the component as unhealthy.")
|
||||
fs.Var(&hco.LogPatterns, "log-pattern",
|
||||
"The log pattern to look for in service journald logs. The format for flag value <failureThresholdCount>:<logPattern>")
|
||||
}
|
||||
|
||||
// IsValid validates health checker command line options.
|
||||
// Returns error if invalid, nil otherwise.
|
||||
func (hco *HealthCheckerOptions) IsValid() error {
|
||||
// Make sure the component specified is valid.
|
||||
if hco.Component != types.KubeletComponent && hco.Component != types.DockerComponent && hco.Component != types.CRIComponent {
|
||||
return fmt.Errorf("the component specified is not supported. Supported components are : <kubelet/docker/cri>")
|
||||
if hco.Component != types.KubeletComponent && hco.Component != types.DockerComponent &&
|
||||
hco.Component != types.CRIComponent && hco.Component != types.KubeProxyComponent {
|
||||
return fmt.Errorf("the component specified is not supported. Supported components are : <kubelet/docker/cri/kube-proxy>")
|
||||
}
|
||||
// Make sure the systemd service is specified if repair is enabled.
|
||||
if hco.EnableRepair && hco.SystemdService == "" {
|
||||
return fmt.Errorf("systemd-service cannot be empty when repair is enabled")
|
||||
// Make sure the service is specified if repair is enabled.
|
||||
if hco.EnableRepair && hco.Service == "" {
|
||||
return fmt.Errorf("service cannot be empty when repair is enabled")
|
||||
}
|
||||
// Skip checking further if the component is not cri.
|
||||
if hco.Component != types.CRIComponent {
|
||||
|
@ -87,14 +104,14 @@ func (hco *HealthCheckerOptions) IsValid() error {
|
|||
|
||||
// SetDefaults sets the defaults values for the dependent flags.
|
||||
func (hco *HealthCheckerOptions) SetDefaults() {
|
||||
if hco.SystemdService != "" {
|
||||
if hco.Service != "" {
|
||||
return
|
||||
}
|
||||
if hco.Component != types.CRIComponent {
|
||||
hco.SystemdService = hco.Component
|
||||
hco.Service = hco.Component
|
||||
return
|
||||
}
|
||||
hco.SystemdService = types.ContainerdService
|
||||
hco.Service = types.ContainerdService
|
||||
}
|
||||
|
||||
func init() {
|
||||
|
|
|
@ -56,9 +56,9 @@ func TestIsValid(t *testing.T) {
|
|||
{
|
||||
name: "empty systemd-service and repair enabled",
|
||||
hco: HealthCheckerOptions{
|
||||
Component: types.KubeletComponent,
|
||||
EnableRepair: true,
|
||||
SystemdService: "",
|
||||
Component: types.KubeletComponent,
|
||||
EnableRepair: true,
|
||||
Service: "",
|
||||
},
|
||||
expectError: true,
|
||||
},
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build journald
|
||||
// +build journald
|
||||
|
||||
/*
|
||||
|
@ -25,17 +26,24 @@ import (
|
|||
|
||||
"github.com/spf13/pflag"
|
||||
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/logcounter/options"
|
||||
"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/logcounter"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Set glog flag so that it does not log to files.
|
||||
if err := flag.Set("logtostderr", "true"); err != nil {
|
||||
fmt.Printf("Failed to set logtostderr=true: %v", err)
|
||||
os.Exit(int(types.Unknown))
|
||||
}
|
||||
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
|
||||
klog.InitFlags(klogFlags)
|
||||
klogFlags.VisitAll(func(f *flag.Flag) {
|
||||
switch f.Name {
|
||||
case "v", "vmodule", "logtostderr":
|
||||
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
|
||||
}
|
||||
})
|
||||
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
|
||||
pflag.CommandLine.MarkHidden("vmodule")
|
||||
pflag.CommandLine.MarkHidden("logtostderr")
|
||||
|
||||
fedo := options.NewLogCounterOptions()
|
||||
fedo.AddFlags(pflag.CommandLine)
|
||||
|
|
|
@ -34,6 +34,7 @@ type LogCounterOptions struct {
|
|||
Lookback string
|
||||
Delay string
|
||||
Pattern string
|
||||
RevertPattern string
|
||||
Count int
|
||||
}
|
||||
|
||||
|
@ -46,6 +47,8 @@ func (fedo *LogCounterOptions) AddFlags(fs *pflag.FlagSet) {
|
|||
"The time duration log watcher delays after node boot time. This is useful when log watcher needs to wait for some time until the node is stable.")
|
||||
fs.StringVar(&fedo.Pattern, "pattern", "",
|
||||
"The regular expression to match the problem in log. The pattern must match to the end of the line.")
|
||||
fs.StringVar(&fedo.RevertPattern, "revert-pattern", "",
|
||||
"Similar to --pattern but conversely it decreases count value for every match. This is useful to discount a log when another log occurs.")
|
||||
fs.IntVar(&fedo.Count, "count", 1,
|
||||
"The number of times the pattern must be found to trigger the condition")
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_stackdriver_exporter
|
||||
// +build !disable_stackdriver_exporter
|
||||
|
||||
/*
|
||||
|
|
|
@ -17,10 +17,9 @@ limitations under the License.
|
|||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"context"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"github.com/spf13/pflag"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
_ "k8s.io/node-problem-detector/cmd/nodeproblemdetector/exporterplugins"
|
||||
_ "k8s.io/node-problem-detector/cmd/nodeproblemdetector/problemdaemonplugins"
|
||||
|
@ -34,15 +33,10 @@ import (
|
|||
"k8s.io/node-problem-detector/pkg/version"
|
||||
)
|
||||
|
||||
func main() {
|
||||
npdo := options.NewNodeProblemDetectorOptions()
|
||||
npdo.AddFlags(pflag.CommandLine)
|
||||
|
||||
pflag.Parse()
|
||||
|
||||
func npdMain(ctx context.Context, npdo *options.NodeProblemDetectorOptions) error {
|
||||
if npdo.PrintVersion {
|
||||
version.PrintVersion()
|
||||
os.Exit(0)
|
||||
return nil
|
||||
}
|
||||
|
||||
npdo.SetNodeNameOrDie()
|
||||
|
@ -52,18 +46,18 @@ func main() {
|
|||
// Initialize problem daemons.
|
||||
problemDaemons := problemdaemon.NewProblemDaemons(npdo.MonitorConfigPaths)
|
||||
if len(problemDaemons) == 0 {
|
||||
glog.Fatalf("No problem daemon is configured")
|
||||
klog.Fatalf("No problem daemon is configured")
|
||||
}
|
||||
|
||||
// Initialize exporters.
|
||||
defaultExporters := []types.Exporter{}
|
||||
if ke := k8sexporter.NewExporterOrDie(npdo); ke != nil {
|
||||
if ke := k8sexporter.NewExporterOrDie(ctx, npdo); ke != nil {
|
||||
defaultExporters = append(defaultExporters, ke)
|
||||
glog.Info("K8s exporter started.")
|
||||
klog.Info("K8s exporter started.")
|
||||
}
|
||||
if pe := prometheusexporter.NewExporterOrDie(npdo); pe != nil {
|
||||
defaultExporters = append(defaultExporters, pe)
|
||||
glog.Info("Prometheus exporter started.")
|
||||
klog.Info("Prometheus exporter started.")
|
||||
}
|
||||
|
||||
plugableExporters := exporters.NewExporters()
|
||||
|
@ -73,12 +67,10 @@ func main() {
|
|||
npdExporters = append(npdExporters, plugableExporters...)
|
||||
|
||||
if len(npdExporters) == 0 {
|
||||
glog.Fatalf("No exporter is successfully setup")
|
||||
klog.Fatalf("No exporter is successfully setup")
|
||||
}
|
||||
|
||||
// Initialize NPD core.
|
||||
p := problemdetector.NewProblemDetector(problemDaemons, npdExporters)
|
||||
if err := p.Run(); err != nil {
|
||||
glog.Fatalf("Problem detector failed with error: %v", err)
|
||||
}
|
||||
return p.Run(ctx)
|
||||
}
|
||||
|
|
|
@ -0,0 +1,133 @@
|
|||
//go:build !disable_system_log_monitor
|
||||
// +build !disable_system_log_monitor
|
||||
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
_ "k8s.io/node-problem-detector/cmd/nodeproblemdetector/exporterplugins"
|
||||
_ "k8s.io/node-problem-detector/cmd/nodeproblemdetector/problemdaemonplugins"
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
"k8s.io/node-problem-detector/pkg/exporters"
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
)
|
||||
|
||||
const (
|
||||
fakeConfigFilePattern = `
|
||||
{
|
||||
"plugin": "filelog",
|
||||
"pluginConfig": {
|
||||
"timestamp": "^time=\"(\\S*)\"",
|
||||
"message": "msg=\"([^\n]*)\"",
|
||||
"timestampFormat": "2006-01-02T15:04:05.999999999-07:00"
|
||||
},
|
||||
"logPath": "%s",
|
||||
"lookback": "5m",
|
||||
"bufferSize": 10,
|
||||
"source": "containerd",
|
||||
"conditions": [],
|
||||
"rules": [
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "MissingPigz",
|
||||
"pattern": "unpigz not found.*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "IncompatibleContainer",
|
||||
"pattern": ".*CreateComputeSystem.*"
|
||||
}
|
||||
]
|
||||
}
|
||||
`
|
||||
)
|
||||
|
||||
func init() {
|
||||
exporters.Register("nil", types.ExporterHandler{
|
||||
CreateExporterOrDie: func(types.CommandLineOptions) types.Exporter {
|
||||
return &nullExporter{}
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
type nullExporter struct {
|
||||
}
|
||||
|
||||
func (ne *nullExporter) ExportProblems(*types.Status) {
|
||||
}
|
||||
|
||||
func TestNPDMain(t *testing.T) {
|
||||
npdo, cleanup := setupNPD(t)
|
||||
defer cleanup()
|
||||
|
||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
cancelFunc()
|
||||
if err := npdMain(ctx, npdo); err != nil {
|
||||
t.Errorf("termination signal should not return error got, %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func writeTempFile(t *testing.T, ext string, contents string) (string, error) {
|
||||
f, err := os.CreateTemp("", "*."+ext)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("cannot create temp file, %v", err)
|
||||
}
|
||||
|
||||
fileName := f.Name()
|
||||
|
||||
if err := os.WriteFile(fileName, []byte(contents), 0644); err != nil {
|
||||
os.Remove(fileName)
|
||||
return "", fmt.Errorf("cannot write config to temp file %s, %v", fileName, err)
|
||||
}
|
||||
|
||||
return fileName, nil
|
||||
}
|
||||
|
||||
func setupNPD(t *testing.T) (*options.NodeProblemDetectorOptions, func()) {
|
||||
fakeLogFileName, err := writeTempFile(t, "log", "")
|
||||
if err != nil {
|
||||
os.Remove(fakeLogFileName)
|
||||
t.Fatalf("cannot create temp config file, %v", err)
|
||||
}
|
||||
|
||||
fakeConfigFileContents := fmt.Sprintf(fakeConfigFilePattern, strings.ReplaceAll(fakeLogFileName, "\\", "\\\\"))
|
||||
|
||||
fakeConfigFileName, err := writeTempFile(t, "json", fakeConfigFileContents)
|
||||
if err != nil {
|
||||
os.Remove(fakeLogFileName)
|
||||
os.Remove(fakeConfigFileName)
|
||||
t.Fatalf("cannot create temp config file, %v", err)
|
||||
}
|
||||
|
||||
return &options.NodeProblemDetectorOptions{
|
||||
MonitorConfigPaths: map[types.ProblemDaemonType]*[]string{
|
||||
"system-log-monitor": {
|
||||
fakeConfigFileName,
|
||||
},
|
||||
},
|
||||
}, func() {
|
||||
os.Remove(fakeLogFileName)
|
||||
os.Remove(fakeConfigFileName)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
//go:build unix
|
||||
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
)
|
||||
|
||||
func main() {
|
||||
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
|
||||
klog.InitFlags(klogFlags)
|
||||
klogFlags.VisitAll(func(f *flag.Flag) {
|
||||
switch f.Name {
|
||||
case "v", "vmodule", "logtostderr":
|
||||
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
|
||||
}
|
||||
})
|
||||
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
|
||||
pflag.CommandLine.MarkHidden("vmodule")
|
||||
pflag.CommandLine.MarkHidden("logtostderr")
|
||||
|
||||
npdo := options.NewNodeProblemDetectorOptions()
|
||||
npdo.AddFlags(pflag.CommandLine)
|
||||
|
||||
pflag.Parse()
|
||||
if err := npdMain(context.Background(), npdo); err != nil {
|
||||
klog.Fatalf("Problem detector failed with error: %v", err)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,180 @@
|
|||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/spf13/pflag"
|
||||
"golang.org/x/sys/windows/svc"
|
||||
"golang.org/x/sys/windows/svc/debug"
|
||||
"golang.org/x/sys/windows/svc/eventlog"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
)
|
||||
|
||||
const (
|
||||
svcName = "NodeProblemDetector"
|
||||
svcDescription = "Identifies problems that likely disrupt the operation of Kubernetes workloads."
|
||||
svcCommandsAccepted = svc.AcceptStop | svc.AcceptShutdown
|
||||
appEventLogName = svcName
|
||||
windowsEventLogID = 1
|
||||
)
|
||||
|
||||
var (
|
||||
elog debug.Log
|
||||
)
|
||||
|
||||
func main() {
|
||||
klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
|
||||
klog.InitFlags(klogFlags)
|
||||
klogFlags.VisitAll(func(f *flag.Flag) {
|
||||
switch f.Name {
|
||||
case "v", "vmodule", "logtostderr":
|
||||
flag.CommandLine.Var(f.Value, f.Name, f.Usage)
|
||||
}
|
||||
})
|
||||
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
|
||||
pflag.CommandLine.MarkHidden("vmodule")
|
||||
pflag.CommandLine.MarkHidden("logtostderr")
|
||||
|
||||
npdo := options.NewNodeProblemDetectorOptions()
|
||||
npdo.AddFlags(pflag.CommandLine)
|
||||
|
||||
pflag.Parse()
|
||||
|
||||
handler := &npdService{
|
||||
options: npdo,
|
||||
}
|
||||
|
||||
runFunc := initializeRun()
|
||||
|
||||
if err := runFunc(svcName, handler); err != nil {
|
||||
elog.Error(windowsEventLogID, err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func isRunningAsWindowsService() bool {
|
||||
runningAsService, err := svc.IsWindowsService()
|
||||
if err != nil {
|
||||
klog.Errorf("cannot determine if running as Windows Service assuming standalone, %v", err)
|
||||
return false
|
||||
}
|
||||
return runningAsService
|
||||
}
|
||||
|
||||
func setupLogging(runningAsService bool) {
|
||||
if runningAsService {
|
||||
var err error
|
||||
|
||||
elog, err = eventlog.Open(appEventLogName)
|
||||
|
||||
// If the event log is unavailable then at least print out to standard output.
|
||||
if err != nil {
|
||||
elog = debug.New(appEventLogName)
|
||||
elog.Info(windowsEventLogID, fmt.Sprintf("cannot connect to event log using standard out, %v", err))
|
||||
}
|
||||
} else {
|
||||
elog = debug.New(appEventLogName)
|
||||
}
|
||||
}
|
||||
|
||||
func initializeRun() func(string, svc.Handler) error {
|
||||
runningAsService := isRunningAsWindowsService()
|
||||
|
||||
setupLogging(runningAsService)
|
||||
|
||||
if runningAsService {
|
||||
return svc.Run
|
||||
}
|
||||
|
||||
return debug.Run
|
||||
}
|
||||
|
||||
type npdService struct {
|
||||
sync.Mutex
|
||||
options *options.NodeProblemDetectorOptions
|
||||
}
|
||||
|
||||
func (s *npdService) Execute(args []string, r <-chan svc.ChangeRequest, changes chan<- svc.Status) (bool, uint32) {
|
||||
changes <- svc.Status{State: svc.StartPending}
|
||||
changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
|
||||
var appWG sync.WaitGroup
|
||||
var svcWG sync.WaitGroup
|
||||
|
||||
options := s.options
|
||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
|
||||
// NPD application goroutine.
|
||||
appWG.Add(1)
|
||||
go func() {
|
||||
defer appWG.Done()
|
||||
|
||||
if err := npdMain(ctx, options); err != nil {
|
||||
elog.Warning(windowsEventLogID, err.Error())
|
||||
}
|
||||
|
||||
changes <- svc.Status{State: svc.StopPending}
|
||||
}()
|
||||
|
||||
// Windows service control goroutine.
|
||||
svcWG.Add(1)
|
||||
go func() {
|
||||
defer svcWG.Done()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case c := <-r:
|
||||
switch c.Cmd {
|
||||
case svc.Interrogate:
|
||||
changes <- c.CurrentStatus
|
||||
// Testing deadlock from https://code.google.com/p/winsvc/issues/detail?id=4
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
changes <- c.CurrentStatus
|
||||
case svc.Stop, svc.Shutdown:
|
||||
elog.Info(windowsEventLogID, fmt.Sprintf("Stopping %s service, %v", svcName, c.Context))
|
||||
cancelFunc()
|
||||
case svc.Pause:
|
||||
elog.Info(windowsEventLogID, "ignoring pause command from Windows service control, not supported")
|
||||
changes <- svc.Status{State: svc.Paused, Accepts: svcCommandsAccepted}
|
||||
case svc.Continue:
|
||||
elog.Info(windowsEventLogID, "ignoring continue command from Windows service control, not supported")
|
||||
changes <- svc.Status{State: svc.Running, Accepts: svcCommandsAccepted}
|
||||
default:
|
||||
elog.Error(windowsEventLogID, fmt.Sprintf("unexpected control request #%d", c))
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait for the application go routine to die.
|
||||
appWG.Wait()
|
||||
|
||||
// Wait for the service control loop to terminate.
|
||||
// Otherwise it's possible that the channel closures cause the application to panic.
|
||||
svcWG.Wait()
|
||||
|
||||
// Send a signal to the Windows service control that the application has stopped.
|
||||
changes <- svc.Status{State: svc.Stopped, Accepts: svcCommandsAccepted}
|
||||
|
||||
return false, uint32(0)
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
//go:build !disable_system_log_monitor
|
||||
// +build !disable_system_log_monitor
|
||||
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"golang.org/x/sys/windows/svc"
|
||||
)
|
||||
|
||||
func TestWindowsServiceLoop(t *testing.T) {
|
||||
npdo, cleanup := setupNPD(t)
|
||||
defer cleanup()
|
||||
|
||||
setupLogging(false)
|
||||
|
||||
s := &npdService{
|
||||
options: npdo,
|
||||
}
|
||||
|
||||
r := make(chan svc.ChangeRequest, 2)
|
||||
changes := make(chan svc.Status, 4)
|
||||
defer func() {
|
||||
close(r)
|
||||
close(changes)
|
||||
}()
|
||||
|
||||
r <- svc.ChangeRequest{
|
||||
Cmd: svc.Shutdown,
|
||||
}
|
||||
r <- svc.ChangeRequest{
|
||||
Cmd: svc.Shutdown,
|
||||
}
|
||||
|
||||
ssec, errno := s.Execute([]string{}, r, changes)
|
||||
if ssec != false {
|
||||
t.Error("ssec should be false")
|
||||
}
|
||||
if errno != 0 {
|
||||
t.Error("errno should be 0")
|
||||
}
|
||||
}
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_custom_plugin_monitor
|
||||
// +build !disable_custom_plugin_monitor
|
||||
|
||||
/*
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_system_log_monitor
|
||||
// +build !disable_system_log_monitor
|
||||
|
||||
/*
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_system_stats_monitor
|
||||
// +build !disable_system_stats_monitor
|
||||
|
||||
/*
|
||||
|
|
|
@ -43,6 +43,10 @@ type NodeProblemDetectorOptions struct {
|
|||
ServerPort int
|
||||
// ServerAddress is the address to bind the node problem detector server.
|
||||
ServerAddress string
|
||||
// QPS is the maximum QPS to the master from client.
|
||||
QPS float32
|
||||
// Burst is the maximum burst for throttle.
|
||||
Burst int
|
||||
|
||||
// exporter options
|
||||
|
||||
|
@ -61,6 +65,10 @@ type NodeProblemDetectorOptions struct {
|
|||
APIServerWaitInterval time.Duration
|
||||
// K8sExporterHeartbeatPeriod is the period at which the k8s exporter does forcibly sync with apiserver.
|
||||
K8sExporterHeartbeatPeriod time.Duration
|
||||
// K8sExporterWriteEvents determines whether to write Kubernetes Events for problems.
|
||||
K8sExporterWriteEvents bool
|
||||
// K8sExporterUpdateNodeConditions determines whether to update Kubernetes Node Conditions for problems.
|
||||
K8sExporterUpdateNodeConditions bool
|
||||
|
||||
// prometheusExporter options
|
||||
// PrometheusServerPort is the port to bind the Prometheus scrape endpoint. Use 0 to disable.
|
||||
|
@ -113,6 +121,8 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) {
|
|||
fs.DurationVar(&npdo.APIServerWaitTimeout, "apiserver-wait-timeout", time.Duration(5)*time.Minute, "The timeout on waiting for kube-apiserver to be ready. This is ignored if --enable-k8s-exporter is false.")
|
||||
fs.DurationVar(&npdo.APIServerWaitInterval, "apiserver-wait-interval", time.Duration(5)*time.Second, "The interval between the checks on the readiness of kube-apiserver. This is ignored if --enable-k8s-exporter is false.")
|
||||
fs.DurationVar(&npdo.K8sExporterHeartbeatPeriod, "k8s-exporter-heartbeat-period", 5*time.Minute, "The period at which k8s-exporter does forcibly sync with apiserver.")
|
||||
fs.BoolVar(&npdo.K8sExporterWriteEvents, "k8s-exporter-write-events", true, "Whether to write Kubernetes Event objects with event details.")
|
||||
fs.BoolVar(&npdo.K8sExporterUpdateNodeConditions, "k8s-exporter-update-node-conditions", true, "Whether to update Kubernetes Node conditions with event details.")
|
||||
fs.BoolVar(&npdo.PrintVersion, "version", false, "Print version information and quit")
|
||||
fs.StringVar(&npdo.HostnameOverride, "hostname-override",
|
||||
"", "Custom node name used to override hostname")
|
||||
|
@ -125,7 +135,8 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) {
|
|||
20257, "The port to bind the Prometheus scrape endpoint. Prometheus exporter is enabled by default at port 20257. Use 0 to disable.")
|
||||
fs.StringVar(&npdo.PrometheusServerAddress, "prometheus-address",
|
||||
"127.0.0.1", "The address to bind the Prometheus scrape endpoint.")
|
||||
|
||||
fs.Float32Var(&npdo.QPS, "kube-api-qps", 500, "Maximum QPS to use while talking with Kubernetes API")
|
||||
fs.IntVar(&npdo.Burst, "kube-api-burst", 500, "Maximum burst for throttle while talking with Kubernetes API")
|
||||
for _, exporterName := range exporters.GetExporterNames() {
|
||||
exporterHandler := exporters.GetExporterHandlerOrDie(exporterName)
|
||||
exporterHandler.Options.SetFlags(fs)
|
||||
|
|
|
@ -31,7 +31,7 @@
|
|||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "Kerneloops",
|
||||
"reason": "KernelOops",
|
||||
"pattern": "System encountered a non-fatal error in \\S+"
|
||||
}
|
||||
]
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"plugin": "filelog",
|
||||
"pluginConfig": {
|
||||
"timestamp": "^.{15}",
|
||||
"message": "(?i)Currently unreadable.*sectors|(?i)Offline uncorrectable sectors",
|
||||
"timestampFormat": "Jan _2 15:04:05"
|
||||
},
|
||||
"logPath": "/var/log/messages",
|
||||
"lookback": "10h",
|
||||
"bufferSize": 1,
|
||||
"source": "disk-monitor",
|
||||
"skipList": [ " audit:", " audit[" ],
|
||||
"conditions": [
|
||||
{
|
||||
"type": "DiskBadBlock",
|
||||
"reason": "DiskBadBlock",
|
||||
"message": "Disk no bad block"
|
||||
},
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "DiskBadBlock",
|
||||
"reason": "DiskBadBlock",
|
||||
"pattern": ".*([1-9]\\d{2,}) (Currently unreadable.*sectors|Offline uncorrectable sectors).*"
|
||||
},
|
||||
]
|
||||
}
|
|
@ -0,0 +1,69 @@
|
|||
[
|
||||
{ "moduleName": "xt_MASQUERADE"},
|
||||
{ "moduleName": "xt_addrtype"},
|
||||
{ "moduleName": "iptable_nat"},
|
||||
{ "moduleName": "nf_nat"},
|
||||
{ "moduleName": "br_netfilter"},
|
||||
{ "moduleName": "ip6table_filter"},
|
||||
{ "moduleName": "ip6_tables"},
|
||||
{ "moduleName": "aesni_intel"},
|
||||
{ "moduleName": "glue_helper"},
|
||||
{ "moduleName": "crypto_simd"},
|
||||
{ "moduleName": "cryptd"},
|
||||
{ "moduleName": "virtio_balloon"},
|
||||
{ "moduleName": "loadpin_trigger"},
|
||||
{ "moduleName":"ip6table_filter"},
|
||||
{ "moduleName":"ip6_tables"},
|
||||
{ "moduleName":"iptable_filter"},
|
||||
{ "moduleName":"bpfilter"},
|
||||
{ "moduleName":"nls_iso8859_1"},
|
||||
{ "moduleName":"intel_rapl_msr"},
|
||||
{ "moduleName":"intel_rapl_common"},
|
||||
{ "moduleName":"sb_edac"},
|
||||
{ "moduleName":"rapl"},
|
||||
{ "moduleName":"input_leds"},
|
||||
{ "moduleName":"serio_raw"},
|
||||
{ "moduleName":"pvpanic"},
|
||||
{ "moduleName":"mac_hid"},
|
||||
{ "moduleName":"sch_fq_codel"},
|
||||
{ "moduleName":"ib_iser"},
|
||||
{ "moduleName":"rdma_cm"},
|
||||
{ "moduleName":"iw_cm"},
|
||||
{ "moduleName":"ib_cm"},
|
||||
{ "moduleName":"ib_core"},
|
||||
{ "moduleName":"iscsi_tcp"},
|
||||
{ "moduleName":"libiscsi_tcp"},
|
||||
{ "moduleName":"libiscsi"},
|
||||
{ "moduleName":"scsi_transport_iscsi"},
|
||||
{ "moduleName":"virtio_rng"},
|
||||
{ "moduleName":"ip_tables"},
|
||||
{ "moduleName":"x_tables"},
|
||||
{ "moduleName":"autofs4"},
|
||||
{ "moduleName":"btrfs"},
|
||||
{ "moduleName":"zstd_compress"},
|
||||
{ "moduleName":"raid10"},
|
||||
{ "moduleName":"raid456"},
|
||||
{ "moduleName":"async_raid6_recov"},
|
||||
{ "moduleName":"async_memcpy"},
|
||||
{ "moduleName":"async_pq"},
|
||||
{ "moduleName":"async_xor"},
|
||||
{ "moduleName":"async_tx"},
|
||||
{ "moduleName":"xor"},
|
||||
{ "moduleName":"raid6_pq"},
|
||||
{ "moduleName":"raid1"},
|
||||
{ "moduleName":"raid0"},
|
||||
{ "moduleName":"multipath"},
|
||||
{ "moduleName":"linear"},
|
||||
{ "moduleName":"crct10dif_pclmul"},
|
||||
{ "moduleName":"crc32_pclmul"},
|
||||
{ "moduleName":"ghash_clmulni_intel"},
|
||||
{ "moduleName":"aesni_intel"},
|
||||
{ "moduleName":"crypto_simd"},
|
||||
{ "moduleName":"cryptd"},
|
||||
{ "moduleName":"glue_helper"},
|
||||
{ "moduleName":"psmouse"},
|
||||
{ "moduleName":"virtio_net"},
|
||||
{ "moduleName":"net_failover"},
|
||||
{ "moduleName": "failover"},
|
||||
{ "moduleName":"i2c_piix4"}
|
||||
]
|
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "10s",
|
||||
"timeout": "3m",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 1
|
||||
},
|
||||
"source": "health-checker",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "ContainerRuntimeUnhealthy",
|
||||
"reason": "ContainerRuntimeIsHealthy",
|
||||
"message": "Container runtime on the node is functioning properly"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ContainerRuntimeUnhealthy",
|
||||
"reason": "ContainerdUnhealthy",
|
||||
"path": "/home/kubernetes/bin/health-checker",
|
||||
"args": [
|
||||
"--component=cri",
|
||||
"--enable-repair=true",
|
||||
"--cooldown-time=2m",
|
||||
"--health-check-timeout=60s"
|
||||
],
|
||||
"timeout": "3m"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -25,6 +25,7 @@
|
|||
"--component=kubelet",
|
||||
"--enable-repair=true",
|
||||
"--cooldown-time=1m",
|
||||
"--loopback-time=0",
|
||||
"--health-check-timeout=10s"
|
||||
],
|
||||
"timeout": "3m"
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "86400s",
|
||||
"timeout": "5s",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 1
|
||||
},
|
||||
"source": "iptables-mode-monitor",
|
||||
"metricsReporting": true,
|
||||
"conditions": [],
|
||||
"rules": [
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "IPTablesVersionsMismatch",
|
||||
"path": "./config/plugin/iptables_mode.sh",
|
||||
"timeout": "5s"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -20,7 +20,7 @@
|
|||
{
|
||||
"type": "temporary",
|
||||
"reason": "OOMKilling",
|
||||
"pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
|
||||
"pattern": "Killed process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
|
@ -42,12 +42,6 @@
|
|||
"reason": "KernelOops",
|
||||
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "AUFSUmountHung",
|
||||
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
|
|
|
@ -12,21 +12,26 @@
|
|||
"message": "kernel has no deadlock"
|
||||
},
|
||||
{
|
||||
"type": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsNotReadOnly",
|
||||
"message": "Filesystem is not read-only"
|
||||
"type": "XfsShutdown",
|
||||
"reason": "XfsHasNotShutDown",
|
||||
"message": "XFS has not shutdown"
|
||||
},
|
||||
{
|
||||
"type": "CperHardwareErrorFatal",
|
||||
"reason": "CperHardwareHasNoFatalError",
|
||||
"message": "UEFI CPER has no fatal error"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "OOMKilling",
|
||||
"pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
|
||||
"pattern": "Killed process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "TaskHung",
|
||||
"pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
|
||||
"pattern": "task [\\S ]+:\\w+ blocked for more than \\w+ seconds\\."
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
|
@ -53,23 +58,43 @@
|
|||
"reason": "Ext4Warning",
|
||||
"pattern": "EXT4-fs warning .*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "IOError",
|
||||
"pattern": "Buffer I/O error .*"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "AUFSUmountHung",
|
||||
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
|
||||
"condition": "XfsShutdown",
|
||||
"reason": "XfsHasShutdown",
|
||||
"pattern": "XFS .* Shutting down filesystem.?"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "MemoryReadError",
|
||||
"pattern": "CE memory read error .*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "CperHardwareErrorCorrected",
|
||||
"pattern": ".*\\[Hardware Error\\]: event severity: corrected$"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "CperHardwareErrorRecoverable",
|
||||
"pattern": ".*\\[Hardware Error\\]: event severity: recoverable$"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "CperHardwareErrorFatal",
|
||||
"reason": "CperHardwareErrorFatal",
|
||||
"pattern": ".*\\[Hardware Error\\]: event severity: fatal$"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "DockerHung",
|
||||
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsReadOnly",
|
||||
"pattern": "Remounting filesystem read-only"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
{
|
||||
"net": {
|
||||
"excludeInterfaceRegexp": "^(cali|tunl|veth)",
|
||||
"metricsConfigs": {
|
||||
"net/rx_bytes": {
|
||||
"displayName": "net/rx_bytes"
|
||||
},
|
||||
"net/rx_packets": {
|
||||
"displayName": "net/rx_packets"
|
||||
},
|
||||
"net/rx_errors": {
|
||||
"displayName": "net/rx_errors"
|
||||
},
|
||||
"net/rx_dropped": {
|
||||
"displayName": "net/rx_dropped"
|
||||
},
|
||||
"net/rx_fifo": {
|
||||
"displayName": "net/rx_fifo"
|
||||
},
|
||||
"net/rx_frame": {
|
||||
"displayName": "net/rx_frame"
|
||||
},
|
||||
"net/rx_compressed": {
|
||||
"displayName": "net/rx_compressed"
|
||||
},
|
||||
"net/rx_multicast": {
|
||||
"displayName": "net/rx_multicast"
|
||||
},
|
||||
"net/tx_bytes": {
|
||||
"displayName": "net/tx_bytes"
|
||||
},
|
||||
"net/tx_packets": {
|
||||
"displayName": "net/tx_packets"
|
||||
},
|
||||
"net/tx_errors": {
|
||||
"displayName": "net/tx_errors"
|
||||
},
|
||||
"net/tx_dropped": {
|
||||
"displayName": "net/tx_dropped"
|
||||
},
|
||||
"net/tx_fifo": {
|
||||
"displayName": "net/tx_fifo"
|
||||
},
|
||||
"net/tx_collisions": {
|
||||
"displayName": "net/tx_collisions"
|
||||
},
|
||||
"net/tx_carrier": {
|
||||
"displayName": "net/tx_carrier"
|
||||
},
|
||||
"net/tx_compressed": {
|
||||
"displayName": "net/tx_compressed"
|
||||
}
|
||||
}
|
||||
},
|
||||
"invokeInterval": "120s"
|
||||
}
|
|
@ -20,8 +20,7 @@ if systemctl -q is-active "$SERVICE"; then
|
|||
echo "$SERVICE is running"
|
||||
exit $OK
|
||||
else
|
||||
# Does not differenciate stopped/failed service from non-existent
|
||||
# Does not differentiate stopped/failed service from non-existent
|
||||
echo "$SERVICE is not running"
|
||||
exit $NONOK
|
||||
fi
|
||||
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
|
||||
|
||||
# As of iptables 1.8, the iptables command line clients come in two different versions/modes: "legacy",
|
||||
# which uses the kernel iptables API just like iptables 1.6 and earlier did, and "nft", which translates
|
||||
# the iptables command-line API into the kernel nftables API.
|
||||
# Because they connect to two different subsystems in the kernel, you cannot mix rules from different versions.
|
||||
# Ref: https://github.com/kubernetes-sigs/iptables-wrappers
|
||||
|
||||
readonly OK=0
|
||||
readonly NONOK=1
|
||||
readonly UNKNOWN=2
|
||||
|
||||
# based on: https://github.com/kubernetes-sigs/iptables-wrappers/blob/97b01f43a8e8db07840fc4b95e833a37c0d36b12/iptables-wrapper-installer.sh
|
||||
readonly num_legacy_lines=$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep -c '^-' || true)
|
||||
readonly num_nft_lines=$( (timeout 5 sh -c "iptables-nft-save; ip6tables-nft-save" || true) 2>/dev/null | grep -c '^-' || true)
|
||||
|
||||
|
||||
if [ "$num_legacy_lines" -gt 0 ] && [ "$num_nft_lines" -gt 0 ]; then
|
||||
echo "Found rules from both versions, iptables-legacy: ${num_legacy_lines} iptables-nft: ${num_nft_lines}"
|
||||
echo $NONOK
|
||||
elif [ "$num_legacy_lines" -gt 0 ] && [ "$num_nft_lines" -eq 0 ]; then
|
||||
echo "Using iptables-legacy: ${num_legacy_lines} rules"
|
||||
echo $OK
|
||||
elif [ "$num_legacy_lines" -eq 0 ] && [ "$num_nft_lines" -gt 0 ]; then
|
||||
echo "Using iptables-nft: ${num_nft_lines} rules"
|
||||
echo $OK
|
||||
else
|
||||
echo "No iptables rules found"
|
||||
echo $UNKNOWN
|
||||
fi
|
|
@ -0,0 +1,10 @@
|
|||
# This plugin checks to see if windows defender detects any threats to the node.
|
||||
|
||||
$windowsDefenderThreats = (Get-MpThreat | Where-Object {$_.IsActive -or $_.DidThreatExecute})
|
||||
|
||||
if ($windowsDefenderThreats.length -ne 0) {
|
||||
Write-Host $windowsDefenderThreats
|
||||
exit 1
|
||||
} else {
|
||||
exit 0
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
"plugin": "kmsg",
|
||||
"logPath": "/dev/kmsg",
|
||||
"lookback": "5m",
|
||||
"bufferSize": 10,
|
||||
"source": "readonly-monitor",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsNotReadOnly",
|
||||
"message": "Filesystem is not read-only"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsReadOnly",
|
||||
"pattern": "Remounting filesystem read-only"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -1,70 +1,108 @@
|
|||
{
|
||||
"cpu": {
|
||||
"metricsConfigs": {
|
||||
"cpu/runnable_task_count": {
|
||||
"displayName": "cpu/runnable_task_count"
|
||||
},
|
||||
"cpu/usage_time": {
|
||||
"displayName": "cpu/usage_time"
|
||||
}
|
||||
}
|
||||
},
|
||||
"disk": {
|
||||
"metricsConfigs": {
|
||||
"disk/io_time": {
|
||||
"displayName": "disk/io_time"
|
||||
},
|
||||
"disk/weighted_io": {
|
||||
"displayName": "disk/weighted_io"
|
||||
},
|
||||
"disk/avg_queue_len": {
|
||||
"displayName": "disk/avg_queue_len"
|
||||
},
|
||||
"disk/operation_count": {
|
||||
"displayName": "disk/operation_count"
|
||||
},
|
||||
"disk/merged_operation_count": {
|
||||
"displayName": "disk/merged_operation_count"
|
||||
},
|
||||
"disk/operation_bytes_count": {
|
||||
"displayName": "disk/operation_bytes_count"
|
||||
},
|
||||
"disk/operation_time": {
|
||||
"displayName": "disk/operation_time"
|
||||
},
|
||||
"disk/bytes_used": {
|
||||
"displayName": "disk/bytes_used"
|
||||
}
|
||||
},
|
||||
"includeRootBlk": true,
|
||||
"includeAllAttachedBlk": true,
|
||||
"lsblkTimeout": "5s"
|
||||
},
|
||||
"host": {
|
||||
"metricsConfigs": {
|
||||
"host/uptime": {
|
||||
"displayName": "host/uptime"
|
||||
}
|
||||
}
|
||||
},
|
||||
"memory": {
|
||||
"metricsConfigs": {
|
||||
"memory/bytes_used": {
|
||||
"displayName": "memory/bytes_used"
|
||||
},
|
||||
"memory/anonymous_used": {
|
||||
"displayName": "memory/anonymous_used"
|
||||
},
|
||||
"memory/page_cache_used": {
|
||||
"displayName": "memory/page_cache_used"
|
||||
},
|
||||
"memory/unevictable_used": {
|
||||
"displayName": "memory/unevictable_used"
|
||||
},
|
||||
"memory/dirty_used": {
|
||||
"displayName": "memory/dirty_used"
|
||||
}
|
||||
}
|
||||
},
|
||||
"invokeInterval": "60s"
|
||||
"cpu": {
|
||||
"metricsConfigs": {
|
||||
"cpu/load_15m": {
|
||||
"displayName": "cpu/load_15m"
|
||||
},
|
||||
"cpu/load_1m": {
|
||||
"displayName": "cpu/load_1m"
|
||||
},
|
||||
"cpu/load_5m": {
|
||||
"displayName": "cpu/load_5m"
|
||||
},
|
||||
"cpu/runnable_task_count": {
|
||||
"displayName": "cpu/runnable_task_count"
|
||||
},
|
||||
"cpu/usage_time": {
|
||||
"displayName": "cpu/usage_time"
|
||||
},
|
||||
"system/cpu_stat": {
|
||||
"displayName": "system/cpu_stat"
|
||||
},
|
||||
"system/interrupts_total": {
|
||||
"displayName": "system/interrupts_total"
|
||||
},
|
||||
"system/processes_total": {
|
||||
"displayName": "system/processes_total"
|
||||
},
|
||||
"system/procs_blocked": {
|
||||
"displayName": "system/procs_blocked"
|
||||
},
|
||||
"system/procs_running": {
|
||||
"displayName": "system/procs_running"
|
||||
}
|
||||
}
|
||||
},
|
||||
"disk": {
|
||||
"includeAllAttachedBlk": true,
|
||||
"includeRootBlk": true,
|
||||
"lsblkTimeout": "5s",
|
||||
"metricsConfigs": {
|
||||
"disk/avg_queue_len": {
|
||||
"displayName": "disk/avg_queue_len"
|
||||
},
|
||||
"disk/bytes_used": {
|
||||
"displayName": "disk/bytes_used"
|
||||
},
|
||||
"disk/percent_used": {
|
||||
"displayName": "disk/percent_used"
|
||||
},
|
||||
"disk/io_time": {
|
||||
"displayName": "disk/io_time"
|
||||
},
|
||||
"disk/merged_operation_count": {
|
||||
"displayName": "disk/merged_operation_count"
|
||||
},
|
||||
"disk/operation_bytes_count": {
|
||||
"displayName": "disk/operation_bytes_count"
|
||||
},
|
||||
"disk/operation_count": {
|
||||
"displayName": "disk/operation_count"
|
||||
},
|
||||
"disk/operation_time": {
|
||||
"displayName": "disk/operation_time"
|
||||
},
|
||||
"disk/weighted_io": {
|
||||
"displayName": "disk/weighted_io"
|
||||
}
|
||||
}
|
||||
},
|
||||
"host": {
|
||||
"metricsConfigs": {
|
||||
"host/uptime": {
|
||||
"displayName": "host/uptime"
|
||||
}
|
||||
}
|
||||
},
|
||||
"invokeInterval": "60s",
|
||||
"memory": {
|
||||
"metricsConfigs": {
|
||||
"memory/anonymous_used": {
|
||||
"displayName": "memory/anonymous_used"
|
||||
},
|
||||
"memory/bytes_used": {
|
||||
"displayName": "memory/bytes_used"
|
||||
},
|
||||
"memory/dirty_used": {
|
||||
"displayName": "memory/dirty_used"
|
||||
},
|
||||
"memory/page_cache_used": {
|
||||
"displayName": "memory/page_cache_used"
|
||||
},
|
||||
"memory/unevictable_used": {
|
||||
"displayName": "memory/unevictable_used"
|
||||
},
|
||||
"memory/percent_used": {
|
||||
"displayName": "memory/percent_used"
|
||||
}
|
||||
}
|
||||
},
|
||||
"osFeature": {
|
||||
"KnownModulesConfigPath": "guestosconfig/known-modules.json",
|
||||
"metricsConfigs": {
|
||||
"system/os_feature": {
|
||||
"displayName": "system/os_feature"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -37,7 +37,8 @@
|
|||
"--lookback=20m",
|
||||
"--delay=5m",
|
||||
"--count=5",
|
||||
"--pattern=Started Kubernetes kubelet."
|
||||
"--pattern=Started (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet).",
|
||||
"--revert-pattern=Stopping (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet)..."
|
||||
],
|
||||
"timeout": "1m"
|
||||
},
|
||||
|
@ -51,7 +52,8 @@
|
|||
"--log-path=/var/log/journal",
|
||||
"--lookback=20m",
|
||||
"--count=5",
|
||||
"--pattern=Starting Docker Application Container Engine..."
|
||||
"--pattern=Starting (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)...",
|
||||
"--revert-pattern=Stopping (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)..."
|
||||
],
|
||||
"timeout": "1m"
|
||||
},
|
||||
|
@ -65,7 +67,8 @@
|
|||
"--log-path=/var/log/journal",
|
||||
"--lookback=20m",
|
||||
"--count=5",
|
||||
"--pattern=Starting containerd container runtime..."
|
||||
"--pattern=Starting (containerd container runtime|containerd.service|containerd.service - containerd container runtime)...",
|
||||
"--revert-pattern=Stopping (containerd container runtime|containerd.service|containerd.service - containerd container runtime)..."
|
||||
],
|
||||
"timeout": "1m"
|
||||
}
|
||||
|
|
|
@ -13,17 +13,17 @@
|
|||
{
|
||||
"type": "temporary",
|
||||
"reason": "KubeletStart",
|
||||
"pattern": "Started Kubernetes kubelet."
|
||||
"pattern": "Started (Kubernetes kubelet|kubelet.service|kubelet.service - Kubernetes kubelet)."
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "DockerStart",
|
||||
"pattern": "Starting Docker Application Container Engine..."
|
||||
"pattern": "Starting (Docker Application Container Engine|docker.service|docker.service - Docker Application Container Engine)..."
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "ContainerdStart",
|
||||
"pattern": "Starting containerd container runtime..."
|
||||
"pattern": "Starting (containerd container runtime|containerd.service|containerd.service - containerd container runtime)..."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -8,9 +8,9 @@ Restart=always
|
|||
RestartSec=10
|
||||
ExecStart=/home/kubernetes/bin/node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false \
|
||||
--exporter.stackdriver=/home/kubernetes/node-problem-detector/config/exporter/stackdriver-exporter.json \
|
||||
--config.system-log-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor.json,/home/kubernetes/node-problem-detector/config/docker-monitor.json,/home/kubernetes/node-problem-detector/config/systemd-monitor.json \
|
||||
--config.system-log-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor.json,/home/kubernetes/node-problem-detector/config/readonly-monitor.json,/home/kubernetes/node-problem-detector/config/docker-monitor.json,/home/kubernetes/node-problem-detector/config/systemd-monitor.json \
|
||||
--config.custom-plugin-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor-counter.json,/home/kubernetes/node-problem-detector/config/systemd-monitor-counter.json \
|
||||
--config.system-stats-monitor=/home/kubernetes/node-problem-detector/config/system-stats-monitor.json
|
||||
--config.system-stats-monitor=/home/kubernetes/node-problem-detector/config/system-stats-monitor.json,/home/kubernetes/node-problem-detector/config/net-cgroup-system-stats-monitor.json
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"plugin": "filelog",
|
||||
"pluginConfig": {
|
||||
"timestamp": "^time=\"(\\S*)\"",
|
||||
"message": "msg=\"?([^\n]*)\"?",
|
||||
"timestampFormat": "2006-01-02T15:04:05.999999999Z"
|
||||
},
|
||||
"logPath": "C:\\etc\\kubernetes\\logs\\containerd.log",
|
||||
"lookback": "5m",
|
||||
"bufferSize": 10,
|
||||
"source": "containerd",
|
||||
"conditions": [],
|
||||
"rules": [
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "ContainerCreationFailed",
|
||||
"pattern": ".*failed to create containerd container.*error unpacking image.*wrong diff id calculated on extraction.*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "CorruptContainerImageLayer",
|
||||
"pattern": ".*failed to pull and unpack image.*failed to extract layer.*archive/tar: invalid tar header.*"
|
||||
},
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "HCSEmptyLayerchain",
|
||||
"pattern": ".*Failed to unmarshall layerchain json - invalid character '\\x00' looking for beginning of value*"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "10m",
|
||||
"timeout": "5s",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 3
|
||||
},
|
||||
"source": "windows-defender-custom-plugin-monitor",
|
||||
"metricsReporting": true,
|
||||
"conditions": [],
|
||||
"rules": [
|
||||
{
|
||||
"type": "temporary",
|
||||
"reason": "WindowsDefenderThreatsDetected",
|
||||
"path": "C:\\etc\\kubernetes\\node-problem-detector\\config\\plugin\\windows_defender_problem.ps1",
|
||||
"timeout": "3s"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "10s",
|
||||
"timeout": "3m",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 1
|
||||
},
|
||||
"source": "health-checker",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "ContainerRuntimeUnhealthy",
|
||||
"reason": "ContainerRuntimeIsHealthy",
|
||||
"message": "Container runtime on the node is functioning properly"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ContainerRuntimeUnhealthy",
|
||||
"reason": "ContainerdUnhealthy",
|
||||
"path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe",
|
||||
"args": [
|
||||
"--component=cri",
|
||||
"--enable-repair=true",
|
||||
"--cooldown-time=2m",
|
||||
"--health-check-timeout=60s"
|
||||
],
|
||||
"timeout": "3m"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "10s",
|
||||
"timeout": "3m",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 1
|
||||
},
|
||||
"source": "health-checker",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "ContainerRuntimeUnhealthy",
|
||||
"reason": "ContainerRuntimeIsHealthy",
|
||||
"message": "Container runtime on the node is functioning properly"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ContainerRuntimeUnhealthy",
|
||||
"reason": "DockerUnhealthy",
|
||||
"path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe",
|
||||
"args": [
|
||||
"--component=docker",
|
||||
"--enable-repair=true",
|
||||
"--cooldown-time=2m",
|
||||
"--health-check-timeout=60s"
|
||||
],
|
||||
"timeout": "3m"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "10s",
|
||||
"timeout": "3m",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 1
|
||||
},
|
||||
"source": "health-checker",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "KubeletUnhealthy",
|
||||
"reason": "KubeletIsHealthy",
|
||||
"message": "kubelet on the node is functioning properly"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KubeletUnhealthy",
|
||||
"reason": "KubeletUnhealthy",
|
||||
"path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe",
|
||||
"args": [
|
||||
"--component=kubelet",
|
||||
"--enable-repair=true",
|
||||
"--cooldown-time=1m",
|
||||
"--health-check-timeout=10s"
|
||||
],
|
||||
"timeout": "3m"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "10s",
|
||||
"timeout": "3m",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 1
|
||||
},
|
||||
"source": "health-checker",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "KubeProxyUnhealthy",
|
||||
"reason": "KubeProxyIsHealthy",
|
||||
"message": "kube-proxy on the node is functioning properly"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KubeProxyUnhealthy",
|
||||
"reason": "KubeProxyUnhealthy",
|
||||
"path": "C:\\etc\\kubernetes\\node\\bin\\health-checker.exe",
|
||||
"args": [
|
||||
"--component=kube-proxy",
|
||||
"--enable-repair=true",
|
||||
"--cooldown-time=1m",
|
||||
"--health-check-timeout=10s"
|
||||
],
|
||||
"timeout": "3m"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
{
|
||||
"cpu": {
|
||||
"metricsConfigs": {
|
||||
"cpu/load_15m": {
|
||||
"displayName": "cpu/load_15m"
|
||||
},
|
||||
"cpu/load_1m": {
|
||||
"displayName": "cpu/load_1m"
|
||||
},
|
||||
"cpu/load_5m": {
|
||||
"displayName": "cpu/load_5m"
|
||||
},
|
||||
"cpu/runnable_task_count": {
|
||||
"displayName": "cpu/runnable_task_count"
|
||||
},
|
||||
"cpu/usage_time": {
|
||||
"displayName": "cpu/usage_time"
|
||||
},
|
||||
"system/cpu_stat": {
|
||||
"displayName": "system/cpu_stat"
|
||||
},
|
||||
"system/interrupts_total": {
|
||||
"displayName": "system/interrupts_total"
|
||||
},
|
||||
"system/processes_total": {
|
||||
"displayName": "system/processes_total"
|
||||
},
|
||||
"system/procs_blocked": {
|
||||
"displayName": "system/procs_blocked"
|
||||
},
|
||||
"system/procs_running": {
|
||||
"displayName": "system/procs_running"
|
||||
}
|
||||
}
|
||||
},
|
||||
"disk": {
|
||||
"includeAllAttachedBlk": false,
|
||||
"includeRootBlk": false,
|
||||
"lsblkTimeout": "60s",
|
||||
"metricsConfigs": {
|
||||
"disk/avg_queue_len": {
|
||||
"displayName": "disk/avg_queue_len"
|
||||
},
|
||||
"disk/bytes_used": {
|
||||
"displayName": "disk/bytes_used"
|
||||
},
|
||||
"disk/percent_used": {
|
||||
"displayName": "disk/percent_used"
|
||||
},
|
||||
"disk/io_time": {
|
||||
"displayName": "disk/io_time"
|
||||
},
|
||||
"disk/merged_operation_count": {
|
||||
"displayName": "disk/merged_operation_count"
|
||||
},
|
||||
"disk/operation_bytes_count": {
|
||||
"displayName": "disk/operation_bytes_count"
|
||||
},
|
||||
"disk/operation_count": {
|
||||
"displayName": "disk/operation_count"
|
||||
},
|
||||
"disk/operation_time": {
|
||||
"displayName": "disk/operation_time"
|
||||
},
|
||||
"disk/weighted_io": {
|
||||
"displayName": "disk/weighted_io"
|
||||
}
|
||||
}
|
||||
},
|
||||
"host": {
|
||||
"metricsConfigs": {
|
||||
"host/uptime": {
|
||||
"displayName": "host/uptime"
|
||||
}
|
||||
}
|
||||
},
|
||||
"invokeInterval": "60s",
|
||||
"memory": {
|
||||
"metricsConfigs": {
|
||||
"memory/anonymous_used": {
|
||||
"displayName": "memory/anonymous_used"
|
||||
},
|
||||
"memory/bytes_used": {
|
||||
"displayName": "memory/bytes_used"
|
||||
},
|
||||
"memory/dirty_used": {
|
||||
"displayName": "memory/dirty_used"
|
||||
},
|
||||
"memory/page_cache_used": {
|
||||
"displayName": "memory/page_cache_used"
|
||||
},
|
||||
"memory/unevictable_used": {
|
||||
"displayName": "memory/unevictable_used"
|
||||
},
|
||||
"memory/percent_used": {
|
||||
"displayName": "memory/percent_used"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -46,10 +46,9 @@ data:
|
|||
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "KernelDeadlock",
|
||||
"reason": "AUFSUmountHung",
|
||||
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
|
||||
"type": "temporary",
|
||||
"reason": "MemoryReadError",
|
||||
"pattern": "CE memory read error .*"
|
||||
},
|
||||
{
|
||||
"type": "permanent",
|
||||
|
@ -65,6 +64,30 @@ data:
|
|||
}
|
||||
]
|
||||
}
|
||||
readonly-monitor.json: |
|
||||
{
|
||||
"plugin": "kmsg",
|
||||
"logPath": "/dev/kmsg",
|
||||
"lookback": "5m",
|
||||
"bufferSize": 10,
|
||||
"source": "readonly-monitor",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsNotReadOnly",
|
||||
"message": "Filesystem is not read-only"
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "permanent",
|
||||
"condition": "ReadonlyFilesystem",
|
||||
"reason": "FilesystemIsReadOnly",
|
||||
"pattern": "Remounting filesystem read-only"
|
||||
}
|
||||
]
|
||||
}
|
||||
docker-monitor.json: |
|
||||
{
|
||||
"plugin": "journald",
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app: node-problem-detector
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: node-problem-detector
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: node-problem-detector
|
||||
spec:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/os
|
||||
operator: In
|
||||
values:
|
||||
- linux
|
||||
containers:
|
||||
- name: node-problem-detector
|
||||
command:
|
||||
- /node-problem-detector
|
||||
- --logtostderr
|
||||
- --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json,/config/docker-monitor.json
|
||||
- --config.custom-plugin-monitor=/config/health-checker-kubelet.json
|
||||
image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19
|
||||
resources:
|
||||
limits:
|
||||
cpu: 10m
|
||||
memory: 80Mi
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 80Mi
|
||||
imagePullPolicy: Always
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
volumeMounts:
|
||||
- name: log
|
||||
mountPath: /var/log
|
||||
readOnly: true
|
||||
- name: kmsg
|
||||
mountPath: /dev/kmsg
|
||||
readOnly: true
|
||||
# Make sure node problem detector is in the same timezone
|
||||
# with the host.
|
||||
- name: localtime
|
||||
mountPath: /etc/localtime
|
||||
readOnly: true
|
||||
- name: config
|
||||
mountPath: /config
|
||||
readOnly: true
|
||||
- mountPath: /etc/machine-id
|
||||
name: machine-id
|
||||
readOnly: true
|
||||
- mountPath: /run/systemd/system
|
||||
name: systemd
|
||||
- mountPath: /var/run/dbus/
|
||||
name: dbus
|
||||
mountPropagation: Bidirectional
|
||||
volumes:
|
||||
- name: log
|
||||
# Config `log` to your system log directory
|
||||
hostPath:
|
||||
path: /var/log/
|
||||
- name: kmsg
|
||||
hostPath:
|
||||
path: /dev/kmsg
|
||||
- name: localtime
|
||||
hostPath:
|
||||
path: /etc/localtime
|
||||
- name: config
|
||||
configMap:
|
||||
name: node-problem-detector-config
|
||||
items:
|
||||
- key: kernel-monitor.json
|
||||
path: kernel-monitor.json
|
||||
- key: readonly-monitor.json
|
||||
path: readonly-monitor.json
|
||||
- key: docker-monitor.json
|
||||
path: docker-monitor.json
|
||||
- name: machine-id
|
||||
hostPath:
|
||||
path: /etc/machine-id
|
||||
type: "File"
|
||||
- name: systemd
|
||||
hostPath:
|
||||
path: /run/systemd/system/
|
||||
type: ""
|
||||
- name: dbus
|
||||
hostPath:
|
||||
path: /var/run/dbus/
|
||||
type: ""
|
|
@ -14,13 +14,22 @@ spec:
|
|||
labels:
|
||||
app: node-problem-detector
|
||||
spec:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/os
|
||||
operator: In
|
||||
values:
|
||||
- linux
|
||||
containers:
|
||||
- name: node-problem-detector
|
||||
command:
|
||||
- /node-problem-detector
|
||||
- --logtostderr
|
||||
- --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json
|
||||
image: k8s.gcr.io/node-problem-detector:v0.8.1
|
||||
- --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json,/config/docker-monitor.json
|
||||
image: registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.19
|
||||
resources:
|
||||
limits:
|
||||
cpu: 10m
|
||||
|
@ -51,6 +60,7 @@ spec:
|
|||
- name: config
|
||||
mountPath: /config
|
||||
readOnly: true
|
||||
serviceAccountName: node-problem-detector
|
||||
volumes:
|
||||
- name: log
|
||||
# Config `log` to your system log directory
|
||||
|
@ -68,5 +78,12 @@ spec:
|
|||
items:
|
||||
- key: kernel-monitor.json
|
||||
path: kernel-monitor.json
|
||||
- key: readonly-monitor.json
|
||||
path: readonly-monitor.json
|
||||
- key: docker-monitor.json
|
||||
path: docker-monitor.json
|
||||
tolerations:
|
||||
- effect: NoSchedule
|
||||
operator: Exists
|
||||
- effect: NoExecute
|
||||
operator: Exists
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: npd-binding
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: system:node-problem-detector
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
|
@ -1,9 +1,62 @@
|
|||
# Custom Plugin Monitor
|
||||
|
||||
## Configuration
|
||||
|
||||
### Plugin Config
|
||||
|
||||
* `invoke_interval`: Interval at which custom plugins will be invoked.
|
||||
* `timeout`: Time after which custom plugins invokation will be terminated and considered timeout.
|
||||
* `timeout`: Time after which custom plugins invocation will be terminated and considered timeout.
|
||||
* `max_output_length`: The maximum standard output size from custom plugins that NPD will be cut and use for condition status message.
|
||||
* `concurrency`: The plugin worker number, i.e., how many custom plugins will be invoked concurrently.
|
||||
* `enable_message_change_based_condition_update`: Flag controls whether message change should result in a condition update.
|
||||
* `enable_message_change_based_condition_update`: Flag controls whether message change should result in a condition update.
|
||||
* `skip_initial_status`: Flag controls whether condition will be emitted during plugin initialization.
|
||||
|
||||
### Annotated Plugin Configuration Example
|
||||
|
||||
```
|
||||
{
|
||||
"plugin": "custom",
|
||||
"pluginConfig": {
|
||||
"invoke_interval": "30s",
|
||||
"timeout": "5s",
|
||||
"max_output_length": 80,
|
||||
"concurrency": 3,
|
||||
"enable_message_change_based_condition_update": false
|
||||
},
|
||||
"source": "ntp-custom-plugin-monitor",
|
||||
"metricsReporting": true,
|
||||
"conditions": [
|
||||
{
|
||||
"type": "NTPProblem",
|
||||
"reason": "NTPIsUp", // This is the default reason shown when healthy
|
||||
"message": "ntp service is up" // This is the default message shown when healthy
|
||||
}
|
||||
],
|
||||
"rules": [
|
||||
{
|
||||
"type": "temporary", // These are not shown unless there's an
|
||||
// event so they always relate to a problem.
|
||||
// There are no defaults since there is nothing
|
||||
// to show unless there's a problem.
|
||||
"reason": "NTPIsDown", // This is the reason shown for this event
|
||||
// and the message shown comes from stdout.
|
||||
"path": "./config/plugin/check_ntp.sh",
|
||||
"timeout": "3s"
|
||||
},
|
||||
{
|
||||
"type": "permanent", // These are permanent and are shown in the Conditions section
|
||||
// when running `kubectl describe node ...`
|
||||
// They have default values shown above in the conditions section
|
||||
// and also a reason for each specific trigger listed in this rules section.
|
||||
// Message will come from default for healthy times
|
||||
// and during unhealthy time message comes from stdout of the check.
|
||||
|
||||
"condition": "NTPProblem", // This is the key to connect to the corresponding condition listed above
|
||||
"reason": "NTPIsDown", // and the reason shown for failures detected in this rule
|
||||
// and message will be from stdout of the check.
|
||||
"path": "./config/plugin/check_ntp.sh",
|
||||
"timeout": "3s"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
|
|
@ -0,0 +1,116 @@
|
|||
# Release Process
|
||||
|
||||
These are notes to help follow a consistent release process. See something
|
||||
important missing? Please submit a pull request to add anything else that would
|
||||
be useful!
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Ensure access to the container image [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector).
|
||||
Add email to `k8s-infra-staging-npd` group in sig-node [groups.yaml](https://github.com/kubernetes/k8s.io/blob/main/groups/sig-node/groups.yaml).
|
||||
See example https://github.com/kubernetes/k8s.io/pull/1599.
|
||||
|
||||
## Preparing for a release
|
||||
|
||||
There are a few steps that should be taken prior to creating the actual release
|
||||
itself.
|
||||
|
||||
1. Collect changes since last release. This can be done by looking directly at
|
||||
merged commit messages (``git log [last_release_tag]...HEAD``), or by
|
||||
viewing the changes on GitHub (example: https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...master).
|
||||
|
||||
2. Based on the changes to be included in the release, determine what the next
|
||||
release number should be. We strive to follow [SemVer](https://semver.org/)
|
||||
as much as possible.
|
||||
|
||||
3. Update [CHANGELOG](https://github.com/kubernetes/node-problem-detector/blob/master/CHANGELOG.md)
|
||||
with all significant changes.
|
||||
|
||||
## Create release
|
||||
|
||||
### Create the new version tag
|
||||
|
||||
#### Option 1
|
||||
```
|
||||
# Use v0.8.17 as an example.
|
||||
git clone git@github.com:kubernetes/node-problem-detector.git
|
||||
cd node-problem-detector/
|
||||
git tag v0.8.17
|
||||
git push origin v0.8.17
|
||||
```
|
||||
|
||||
#### Option 2
|
||||
Update [version.txt](https://github.com/kubernetes/node-problem-detector/blob/master/version.txt)
|
||||
(example https://github.com/kubernetes/node-problem-detector/pull/869).
|
||||
|
||||
### Build and push artifacts
|
||||
This step builds the NPD into container files and tar files.
|
||||
- The container file is pushed to the [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector).
|
||||
You will promote the new image to registry.k8s.io later.
|
||||
- The tar files are generated locally. You will upload those to github in the
|
||||
release note later.
|
||||
|
||||
**Note: You need the access mentioned in the [prerequisites](#prerequisites)
|
||||
section to perform steps in this section.**
|
||||
|
||||
```
|
||||
# One-time setup
|
||||
sudo apt-get install libsystemd-dev gcc-aarch64-linux-gnu
|
||||
|
||||
cd node-problem-detector
|
||||
make release
|
||||
|
||||
# Get SHA256 of the tar files. For example
|
||||
sha256sum node-problem-detector-v0.8.17-linux_amd64.tar.gz
|
||||
sha256sum node-problem-detector-v0.8.17-linux_arm64.tar.gz
|
||||
sha256sum node-problem-detector-v0.8.17-windows_amd64.tar.gz
|
||||
|
||||
# Get MD5 of the tar files. For example
|
||||
md5sum node-problem-detector-v0.8.17-linux_amd64.tar.gz
|
||||
md5sum node-problem-detector-v0.8.17-linux_arm64.tar.gz
|
||||
md5sum node-problem-detector-v0.8.17-windows_amd64.tar.gz
|
||||
|
||||
# Verify container image in staging registry and get SHA256.
|
||||
docker pull gcr.io/k8s-staging-npd/node-problem-detector:v0.8.17
|
||||
docker image ls gcr.io/k8s-staging-npd/node-problem-detector --digests
|
||||
```
|
||||
|
||||
### Promote new NPD image to registry.k8s.io
|
||||
1. Get the SHA256 from the new NPD image from the [staging registry](https://console.cloud.google.com/gcr/images/k8s-staging-npd/global/node-problem-detector)
|
||||
or previous step.
|
||||
2. Promote the NPD image to registry.k8s.io ([images.yaml](https://github.com/kubernetes/k8s.io/blob/main/registry.k8s.io/images/k8s-staging-npd/images.yaml), example https://github.com/kubernetes/k8s.io/pull/6523).
|
||||
3. Verify the container image.
|
||||
```
|
||||
docker pull registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17
|
||||
docker image ls registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17
|
||||
```
|
||||
|
||||
### Create the release note
|
||||
|
||||
Go to https://github.com/kubernetes/node-problem-detector/releases, draft a new
|
||||
release note and publish. Make sure to include the following in the body of the
|
||||
release note:
|
||||
|
||||
1. For convenience, add a link to easily view the changes since the last
|
||||
release (e.g.
|
||||
[https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...v0.8.17](https://github.com/kubernetes/node-problem-detector/compare/v0.8.15...v0.8.17)).
|
||||
|
||||
2. There is no need to duplicate everything from the CHANGELOG, but include the
|
||||
most significant things so someone just viewing the release entry will have
|
||||
an idea of what it includes.
|
||||
|
||||
3. Provide a link to the new image release (e.g. `Image:
|
||||
registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.17`)
|
||||
|
||||
4. Upload the tar files built from [pevious step](#build-and-push-artifacts),
|
||||
and include the SHA and MD5.
|
||||
|
||||
## Post release steps
|
||||
|
||||
1. Update image version in node-problem-detector repo, so anyone deploying
|
||||
directly from the repo deployment file will get the newest image deployed.
|
||||
Example https://github.com/kubernetes/node-problem-detector/pull/897.
|
||||
|
||||
2. Update the NPD version in [kubernetes/kubernetes](https://github.com/kubernetes/kubernetes)
|
||||
repo, so that kubernetes clusters use the new NPD version. Example
|
||||
https://github.com/kubernetes/kubernetes/pull/123740.
|
139
go.mod
139
go.mod
|
@ -1,41 +1,110 @@
|
|||
module k8s.io/node-problem-detector
|
||||
|
||||
go 1.11
|
||||
go 1.24.2
|
||||
|
||||
require (
|
||||
cloud.google.com/go v0.43.0
|
||||
code.cloudfoundry.org/clock v0.0.0-20180518195852-02e53af36e6c
|
||||
contrib.go.opencensus.io/exporter/prometheus v0.0.0-20190427222117-f6cda26f80a3
|
||||
contrib.go.opencensus.io/exporter/stackdriver v0.12.5
|
||||
github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705 // indirect
|
||||
github.com/avast/retry-go v2.4.1+incompatible
|
||||
github.com/cobaugh/osrelease v0.0.0-20181218015638-a93a0a55a249
|
||||
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e
|
||||
github.com/euank/go-kmsg-parser v2.0.1+incompatible
|
||||
github.com/go-ole/go-ole v1.2.4 // indirect
|
||||
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
|
||||
github.com/google/cadvisor v0.33.0
|
||||
github.com/onsi/ginkgo v1.8.0
|
||||
github.com/onsi/gomega v1.7.0
|
||||
github.com/pborman/uuid v1.2.0
|
||||
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90
|
||||
github.com/prometheus/common v0.4.1
|
||||
github.com/prometheus/procfs v0.0.8
|
||||
github.com/shirou/gopsutil v2.19.12+incompatible
|
||||
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 // indirect
|
||||
github.com/sigma/go-inotify v0.0.0-20181102212354-c87b6cf5033d // indirect
|
||||
github.com/spf13/pflag v1.0.3
|
||||
github.com/stretchr/testify v1.4.0
|
||||
github.com/tedsuo/ifrit v0.0.0-20180802180643-bea94bb476cc // indirect
|
||||
go.opencensus.io v0.22.0
|
||||
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45
|
||||
google.golang.org/api v0.7.0
|
||||
k8s.io/api v0.0.0-20190816222004-e3a6b8045b0b
|
||||
k8s.io/apimachinery v0.0.0-20190816221834-a9f1d8a9c101
|
||||
k8s.io/client-go v11.0.1-0.20190805182717-6502b5e7b1b5+incompatible
|
||||
k8s.io/heapster v0.0.0-20180704153620-b25f8a16208f
|
||||
k8s.io/kubernetes v1.14.6
|
||||
k8s.io/test-infra v0.0.0-20190914015041-e1cbc3ccd91c
|
||||
cloud.google.com/go/compute/metadata v0.6.0
|
||||
contrib.go.opencensus.io/exporter/prometheus v0.4.2
|
||||
contrib.go.opencensus.io/exporter/stackdriver v0.13.14
|
||||
github.com/acobaugh/osrelease v0.1.0
|
||||
github.com/avast/retry-go/v4 v4.6.1
|
||||
github.com/coreos/go-systemd/v22 v22.5.0
|
||||
github.com/euank/go-kmsg-parser v2.0.0+incompatible
|
||||
github.com/hpcloud/tail v1.0.0
|
||||
github.com/prometheus/client_model v0.6.2
|
||||
github.com/prometheus/common v0.63.0
|
||||
github.com/prometheus/procfs v0.16.1
|
||||
github.com/shirou/gopsutil/v3 v3.24.5
|
||||
github.com/spf13/pflag v1.0.6
|
||||
github.com/stretchr/testify v1.10.0
|
||||
go.opencensus.io v0.24.0
|
||||
golang.org/x/sys v0.32.0
|
||||
google.golang.org/api v0.230.0
|
||||
k8s.io/api v0.33.0
|
||||
k8s.io/apimachinery v0.33.0
|
||||
k8s.io/client-go v0.33.0
|
||||
k8s.io/klog/v2 v2.130.1
|
||||
k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e
|
||||
)
|
||||
|
||||
replace git.apache.org/thrift.git => github.com/apache/thrift v0.0.0-20180902110319-2566ecd5d999
|
||||
require (
|
||||
cloud.google.com/go/auth v0.16.0 // indirect
|
||||
cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
|
||||
cloud.google.com/go/monitoring v1.20.3 // indirect
|
||||
cloud.google.com/go/trace v1.10.11 // indirect
|
||||
github.com/aws/aws-sdk-go v1.44.72 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/fsnotify/fsnotify v1.6.0 // indirect
|
||||
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
|
||||
github.com/go-kit/log v0.2.1 // indirect
|
||||
github.com/go-logfmt/logfmt v0.5.1 // indirect
|
||||
github.com/go-logr/logr v1.4.2 // indirect
|
||||
github.com/go-logr/stdr v1.2.2 // indirect
|
||||
github.com/go-ole/go-ole v1.2.6 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.21.0 // indirect
|
||||
github.com/go-openapi/jsonreference v0.20.2 // indirect
|
||||
github.com/go-openapi/swag v0.23.0 // indirect
|
||||
github.com/gogo/protobuf v1.3.2 // indirect
|
||||
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
|
||||
github.com/golang/protobuf v1.5.4 // indirect
|
||||
github.com/google/gnostic-models v0.6.9 // indirect
|
||||
github.com/google/go-cmp v0.7.0 // indirect
|
||||
github.com/google/s2a-go v0.1.9 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect
|
||||
github.com/googleapis/gax-go/v2 v2.14.1 // indirect
|
||||
github.com/jmespath/go-jmespath v0.4.0 // indirect
|
||||
github.com/josharian/intern v1.0.0 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/klauspost/compress v1.17.9 // indirect
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
|
||||
github.com/mailru/easyjson v0.7.7 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
|
||||
github.com/prometheus/client_golang v1.20.4 // indirect
|
||||
github.com/prometheus/prometheus v0.35.0 // indirect
|
||||
github.com/prometheus/statsd_exporter v0.22.7 // indirect
|
||||
github.com/shoenig/go-m1cpu v0.1.6 // indirect
|
||||
github.com/tklauser/go-sysconf v0.3.12 // indirect
|
||||
github.com/tklauser/numcpus v0.6.1 // indirect
|
||||
github.com/x448/float16 v0.8.4 // indirect
|
||||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
|
||||
go.opentelemetry.io/otel v1.35.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.35.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.35.0 // indirect
|
||||
golang.org/x/crypto v0.37.0 // indirect
|
||||
golang.org/x/net v0.39.0 // indirect
|
||||
golang.org/x/oauth2 v0.29.0 // indirect
|
||||
golang.org/x/sync v0.13.0 // indirect
|
||||
golang.org/x/term v0.31.0 // indirect
|
||||
golang.org/x/text v0.24.0 // indirect
|
||||
golang.org/x/time v0.11.0 // indirect
|
||||
google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf // indirect
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250218202821-56aae31c358a // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e // indirect
|
||||
google.golang.org/grpc v1.72.0 // indirect
|
||||
google.golang.org/protobuf v1.36.6 // indirect
|
||||
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
|
||||
gopkg.in/fsnotify.v1 v1.4.7 // indirect
|
||||
gopkg.in/inf.v0 v0.9.1 // indirect
|
||||
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
|
||||
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
|
||||
sigs.k8s.io/randfill v1.0.0 // indirect
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect
|
||||
sigs.k8s.io/yaml v1.4.0 // indirect
|
||||
)
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
#!/bin/bash

# Copyright 2024 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Prints SHA256 and MD5 checksums (markdown-formatted) for the release
# tarballs of a given node-problem-detector version.
#
# Usage: <script> VERSION
# The tarballs are expected to exist in the current working directory.

set -o errexit
set -o nounset
set -o pipefail

VERSION="$1"

NPD_LINUX_AMD64="node-problem-detector-${VERSION}-linux_amd64.tar.gz"
NPD_LINUX_ARM64="node-problem-detector-${VERSION}-linux_arm64.tar.gz"
NPD_WINDOWS_AMD64="node-problem-detector-${VERSION}-windows_amd64.tar.gz"

# Quote the file names so word splitting/globbing cannot corrupt the
# checksum invocations.
SHA_NPD_LINUX_AMD64=$(sha256sum "${NPD_LINUX_AMD64}" | cut -d' ' -f1)
SHA_NPD_LINUX_ARM64=$(sha256sum "${NPD_LINUX_ARM64}" | cut -d' ' -f1)
SHA_NPD_WINDOWS_AMD64=$(sha256sum "${NPD_WINDOWS_AMD64}" | cut -d' ' -f1)

MD5_NPD_LINUX_AMD64=$(md5sum "${NPD_LINUX_AMD64}" | cut -d' ' -f1)
MD5_NPD_LINUX_ARM64=$(md5sum "${NPD_LINUX_ARM64}" | cut -d' ' -f1)
MD5_NPD_WINDOWS_AMD64=$(md5sum "${NPD_WINDOWS_AMD64}" | cut -d' ' -f1)

# The echo arguments must be quoted: an unquoted ** is a glob pattern and
# could be expanded against files in the working directory before echo runs.
echo
echo "**${NPD_LINUX_AMD64}**:"
echo "**SHA**: ${SHA_NPD_LINUX_AMD64}"
echo "**MD5**: ${MD5_NPD_LINUX_AMD64}"
echo
echo "**${NPD_LINUX_ARM64}**:"
echo "**SHA**: ${SHA_NPD_LINUX_ARM64}"
echo "**MD5**: ${MD5_NPD_LINUX_ARM64}"
echo
echo "**${NPD_WINDOWS_AMD64}**:"
echo "**SHA**: ${SHA_NPD_WINDOWS_AMD64}"
echo "**MD5**: ${MD5_NPD_WINDOWS_AMD64}"
|
|
@ -0,0 +1,32 @@
|
|||
#!/bin/bash -xe

# Copyright 2023 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Creates and pushes an annotated git tag for the version recorded in
# version.txt, then exports the tag ref as a GitHub Actions step output.

VERSION=$(cat version.txt)

# The regex requires a leading "v"; keep the error message consistent with it.
if [[ ! "${VERSION}" =~ ^v([0-9]+[.][0-9]+)[.]([0-9]+)(-(alpha|beta)[.]([0-9]+))?$ ]]; then
  echo "Version ${VERSION} must be 'vX.Y.Z', 'vX.Y.Z-alpha.N', or 'vX.Y.Z-beta.N'"
  exit 1
fi

# Refuse to overwrite an existing tag.
if [ "$(git tag -l "${VERSION}")" ]; then
  echo "Tag ${VERSION} already exists"
  exit 1
fi

git tag -a -m "Release ${VERSION}" "${VERSION}"
git push origin "${VERSION}"

# Quote the output file path; GITHUB_OUTPUT is supplied by the Actions runner.
echo "release_tag=refs/tags/${VERSION}" >> "${GITHUB_OUTPUT}"
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash

# Copyright 2024 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CI check: regenerates the Go module files and fails if the regeneration
# produced any uncommitted changes (i.e. go.mod/go.sum/vendor are stale).

set -o errexit
set -o nounset
set -o pipefail

make gomod
changes=$(git status --porcelain go.mod go.sum vendor/ tests/e2e/go.mod tests/e2e/go.sum || true)
if [ -n "${changes}" ]; then
  echo "ERROR: go modules are not up to date; please run: make gomod"
  echo "changed files:"
  # The newline escape belongs in the format string, not the argument,
  # otherwise printf emits a literal "\n".
  printf "%s\n" "${changes}"
  echo "git diff:"
  git --no-pager diff
  exit 1
fi
|
|
@ -18,10 +18,10 @@ package custompluginmonitor
|
|||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/custompluginmonitor/plugin"
|
||||
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
|
||||
|
@ -47,7 +47,6 @@ type customPluginMonitor struct {
|
|||
config cpmtypes.CustomPluginConfig
|
||||
conditions []types.Condition
|
||||
plugin *plugin.Plugin
|
||||
resultChan <-chan cpmtypes.Result
|
||||
statusChan chan *types.Status
|
||||
tomb *tomb.Tomb
|
||||
}
|
||||
|
@ -58,27 +57,27 @@ func NewCustomPluginMonitorOrDie(configPath string) types.Monitor {
|
|||
configPath: configPath,
|
||||
tomb: tomb.NewTomb(),
|
||||
}
|
||||
f, err := ioutil.ReadFile(configPath)
|
||||
f, err := os.ReadFile(configPath)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
|
||||
klog.Fatalf("Failed to read configuration file %q: %v", configPath, err)
|
||||
}
|
||||
err = json.Unmarshal(f, &c.config)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
|
||||
klog.Fatalf("Failed to unmarshal configuration file %q: %v", configPath, err)
|
||||
}
|
||||
// Apply configurations
|
||||
err = (&c.config).ApplyConfiguration()
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
|
||||
klog.Fatalf("Failed to apply configuration for %q: %v", configPath, err)
|
||||
}
|
||||
|
||||
// Validate configurations
|
||||
err = c.config.Validate()
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to validate custom plugin config %+v: %v", c.config, err)
|
||||
klog.Fatalf("Failed to validate custom plugin config %+v: %v", c.config, err)
|
||||
}
|
||||
|
||||
glog.Infof("Finish parsing custom plugin monitor config file %s: %+v", c.configPath, c.config)
|
||||
klog.Infof("Finish parsing custom plugin monitor config file %s: %+v", c.configPath, c.config)
|
||||
|
||||
c.plugin = plugin.NewPlugin(c.config)
|
||||
// A 1000 size channel should be big enough.
|
||||
|
@ -97,32 +96,39 @@ func initializeProblemMetricsOrDie(rules []*cpmtypes.CustomRule) {
|
|||
if rule.Type == types.Perm {
|
||||
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(rule.Condition, rule.Reason, false)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
|
||||
klog.Fatalf("Failed to initialize problem gauge metrics for problem %q, reason %q: %v",
|
||||
rule.Condition, rule.Reason, err)
|
||||
}
|
||||
}
|
||||
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(rule.Reason, 0)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
|
||||
klog.Fatalf("Failed to initialize problem counter metrics for %q: %v", rule.Reason, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (c *customPluginMonitor) Start() (<-chan *types.Status, error) {
|
||||
glog.Infof("Start custom plugin monitor %s", c.configPath)
|
||||
klog.Infof("Start custom plugin monitor %s", c.configPath)
|
||||
go c.plugin.Run()
|
||||
go c.monitorLoop()
|
||||
return c.statusChan, nil
|
||||
}
|
||||
|
||||
func (c *customPluginMonitor) Stop() {
|
||||
glog.Infof("Stop custom plugin monitor %s", c.configPath)
|
||||
klog.Infof("Stop custom plugin monitor %s", c.configPath)
|
||||
c.tomb.Stop()
|
||||
}
|
||||
|
||||
// monitorLoop is the main loop of customPluginMonitor.
|
||||
// there is one customPluginMonitor, one plugin instance for each configPath.
|
||||
// each runs rules in parallel at pre-configured concurrency, and interval.
|
||||
func (c *customPluginMonitor) monitorLoop() {
|
||||
c.initializeStatus()
|
||||
c.initializeConditions()
|
||||
if *c.config.PluginGlobalConfig.SkipInitialStatus {
|
||||
klog.Infof("Skipping sending initial status. Using default conditions: %+v", c.conditions)
|
||||
} else {
|
||||
c.sendInitialStatus()
|
||||
}
|
||||
|
||||
resultChan := c.plugin.GetResultChan()
|
||||
|
||||
|
@ -130,16 +136,16 @@ func (c *customPluginMonitor) monitorLoop() {
|
|||
select {
|
||||
case result, ok := <-resultChan:
|
||||
if !ok {
|
||||
glog.Errorf("Result channel closed: %s", c.configPath)
|
||||
klog.Errorf("Result channel closed: %s", c.configPath)
|
||||
return
|
||||
}
|
||||
glog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
|
||||
klog.V(3).Infof("Receive new plugin result for %s: %+v", c.configPath, result)
|
||||
status := c.generateStatus(result)
|
||||
glog.V(3).Infof("New status generated: %+v", status)
|
||||
klog.V(3).Infof("New status generated: %+v", status)
|
||||
c.statusChan <- status
|
||||
case <-c.tomb.Stopping():
|
||||
c.plugin.Stop()
|
||||
glog.Infof("Custom plugin monitor stopped: %s", c.configPath)
|
||||
klog.Infof("Custom plugin monitor stopped: %s", c.configPath)
|
||||
c.tomb.Done()
|
||||
return
|
||||
}
|
||||
|
@ -188,9 +194,10 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
|
|||
if condition.Status == types.True && status != types.True {
|
||||
// Scenario 1: Condition status changes from True to False/Unknown
|
||||
newReason = defaultConditionReason
|
||||
if newMessage == "" {
|
||||
if status == types.False {
|
||||
newMessage = defaultConditionMessage
|
||||
} else {
|
||||
// When status unknown, the result's message is important for debug
|
||||
newMessage = result.Message
|
||||
}
|
||||
} else if condition.Status != types.True && status == types.True {
|
||||
|
@ -200,9 +207,10 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
|
|||
} else if condition.Status != status {
|
||||
// Scenario 3: Condition status changes from False to Unknown or vice versa
|
||||
newReason = defaultConditionReason
|
||||
if newMessage == "" {
|
||||
if status == types.False {
|
||||
newMessage = defaultConditionMessage
|
||||
} else {
|
||||
// When status unknown, the result's message is important for debug
|
||||
newMessage = result.Message
|
||||
}
|
||||
} else if condition.Status == types.True && status == types.True &&
|
||||
|
@ -230,6 +238,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
|
|||
condition.Type,
|
||||
status,
|
||||
newReason,
|
||||
newMessage,
|
||||
timestamp,
|
||||
)
|
||||
|
||||
|
@ -250,7 +259,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
|
|||
err := problemmetrics.GlobalProblemMetricsManager.IncrementProblemCounter(
|
||||
event.Reason, 1)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to update problem counter metrics for %q: %v",
|
||||
klog.Errorf("Failed to update problem counter metrics for %q: %v",
|
||||
event.Reason, err)
|
||||
}
|
||||
}
|
||||
|
@ -258,7 +267,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
|
|||
err := problemmetrics.GlobalProblemMetricsManager.SetProblemGauge(
|
||||
condition.Type, condition.Reason, condition.Status == types.True)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
|
||||
klog.Errorf("Failed to update problem gauge metrics for problem %q, reason %q: %v",
|
||||
condition.Type, condition.Reason, err)
|
||||
}
|
||||
}
|
||||
|
@ -271,7 +280,7 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat
|
|||
}
|
||||
// Log only if condition has changed
|
||||
if len(activeProblemEvents) != 0 || len(inactiveProblemEvents) != 0 {
|
||||
glog.V(0).Infof("New status generated: %+v", status)
|
||||
klog.V(0).Infof("New status generated: %+v", status)
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
@ -287,11 +296,9 @@ func toConditionStatus(s cpmtypes.Status) types.ConditionStatus {
|
|||
}
|
||||
}
|
||||
|
||||
// initializeStatus initializes the internal condition and also reports it to the node problem detector.
|
||||
func (c *customPluginMonitor) initializeStatus() {
|
||||
// Initialize the default node conditions
|
||||
c.conditions = initialConditions(c.config.DefaultConditions)
|
||||
glog.Infof("Initialize condition generated: %+v", c.conditions)
|
||||
// sendInitialStatus sends the initial status to the node problem detector.
|
||||
func (c *customPluginMonitor) sendInitialStatus() {
|
||||
klog.Infof("Sending initial status for %s with conditions: %+v", c.config.Source, c.conditions)
|
||||
// Update the initial status
|
||||
c.statusChan <- &types.Status{
|
||||
Source: c.config.Source,
|
||||
|
@ -299,6 +306,12 @@ func (c *customPluginMonitor) initializeStatus() {
|
|||
}
|
||||
}
|
||||
|
||||
// initializeConditions initializes the internal node conditions.
|
||||
func (c *customPluginMonitor) initializeConditions() {
|
||||
c.conditions = initialConditions(c.config.DefaultConditions)
|
||||
klog.Infof("Initialized conditions for %s: %+v", c.configPath, c.conditions)
|
||||
}
|
||||
|
||||
func initialConditions(defaults []types.Condition) []types.Condition {
|
||||
conditions := make([]types.Condition, len(defaults))
|
||||
copy(conditions, defaults)
|
||||
|
|
|
@ -20,15 +20,15 @@ import (
|
|||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
cpmtypes "k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
|
||||
"k8s.io/node-problem-detector/pkg/util"
|
||||
"k8s.io/node-problem-detector/pkg/util/tomb"
|
||||
)
|
||||
|
||||
|
@ -60,7 +60,7 @@ func (p *Plugin) GetResultChan() <-chan cpmtypes.Result {
|
|||
|
||||
func (p *Plugin) Run() {
|
||||
defer func() {
|
||||
glog.Info("Stopping plugin execution")
|
||||
klog.Info("Stopping plugin execution")
|
||||
close(p.resultChan)
|
||||
p.tomb.Done()
|
||||
}()
|
||||
|
@ -89,9 +89,10 @@ func (p *Plugin) Run() {
|
|||
|
||||
// run each rule in parallel and wait for them to complete
|
||||
func (p *Plugin) runRules() {
|
||||
glog.V(3).Info("Start to run custom plugins")
|
||||
klog.V(3).Info("Start to run custom plugins")
|
||||
|
||||
for _, rule := range p.config.Rules {
|
||||
// syncChan limits concurrent goroutines to configured PluginGlobalConfig.Concurrency value
|
||||
p.syncChan <- struct{}{}
|
||||
p.Add(1)
|
||||
go func(rule *cpmtypes.CustomRule) {
|
||||
|
@ -102,8 +103,12 @@ func (p *Plugin) runRules() {
|
|||
|
||||
start := time.Now()
|
||||
exitStatus, message := p.run(*rule)
|
||||
level := klog.Level(3)
|
||||
if exitStatus != 0 {
|
||||
level = klog.Level(2)
|
||||
}
|
||||
|
||||
glog.V(3).Infof("Rule: %+v. Start time: %v. End time: %v. Duration: %v", rule, start, time.Now(), time.Since(start))
|
||||
klog.V(level).Infof("Rule: %+v. Start time: %v. End time: %v. Duration: %v", rule, start, time.Now(), time.Since(start))
|
||||
|
||||
result := cpmtypes.Result{
|
||||
Rule: rule,
|
||||
|
@ -111,26 +116,27 @@ func (p *Plugin) runRules() {
|
|||
Message: message,
|
||||
}
|
||||
|
||||
// pipes result into resultChan which customPluginMonitor instance generates status from
|
||||
p.resultChan <- result
|
||||
|
||||
// Let the result be logged at a higher verbosity level. If there is a change in status it is logged later.
|
||||
glog.V(3).Infof("Add check result %+v for rule %+v", result, rule)
|
||||
klog.V(level).Infof("Add check result %+v for rule %+v", result, rule)
|
||||
}(rule)
|
||||
}
|
||||
|
||||
p.Wait()
|
||||
glog.V(3).Info("Finish running custom plugins")
|
||||
klog.V(3).Info("Finish running custom plugins")
|
||||
}
|
||||
|
||||
// readFromReader reads the maxBytes from the reader and drains the rest.
|
||||
func readFromReader(reader io.ReadCloser, maxBytes int64) ([]byte, error) {
|
||||
limitReader := io.LimitReader(reader, maxBytes)
|
||||
data, err := ioutil.ReadAll(limitReader)
|
||||
data, err := io.ReadAll(limitReader)
|
||||
if err != nil {
|
||||
return []byte{}, err
|
||||
}
|
||||
// Drain the reader
|
||||
if _, err := io.Copy(ioutil.Discard, reader); err != nil {
|
||||
if _, err := io.Copy(io.Discard, reader); err != nil {
|
||||
return []byte{}, err
|
||||
}
|
||||
return data, nil
|
||||
|
@ -147,23 +153,54 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||
}
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, rule.Path, rule.Args...)
|
||||
cmd := util.Exec(rule.Path, rule.Args...)
|
||||
|
||||
stdoutPipe, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
glog.Errorf("Error creating stdout pipe for plugin %q: error - %v", rule.Path, err)
|
||||
klog.Errorf("Error creating stdout pipe for plugin %q: error - %v", rule.Path, err)
|
||||
return cpmtypes.Unknown, "Error creating stdout pipe for plugin. Please check the error log"
|
||||
}
|
||||
stderrPipe, err := cmd.StderrPipe()
|
||||
if err != nil {
|
||||
glog.Errorf("Error creating stderr pipe for plugin %q: error - %v", rule.Path, err)
|
||||
klog.Errorf("Error creating stderr pipe for plugin %q: error - %v", rule.Path, err)
|
||||
return cpmtypes.Unknown, "Error creating stderr pipe for plugin. Please check the error log"
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
glog.Errorf("Error in starting plugin %q: error - %v", rule.Path, err)
|
||||
klog.Errorf("Error in starting plugin %q: error - %v", rule.Path, err)
|
||||
return cpmtypes.Unknown, "Error in starting plugin. Please check the error log"
|
||||
}
|
||||
|
||||
waitChan := make(chan struct{})
|
||||
defer close(waitChan)
|
||||
|
||||
var m sync.Mutex
|
||||
timeout := false
|
||||
|
||||
go func() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
if ctx.Err() == context.Canceled {
|
||||
return
|
||||
}
|
||||
klog.Errorf("Error in running plugin timeout %q", rule.Path)
|
||||
if cmd.Process == nil || cmd.Process.Pid == 0 {
|
||||
klog.Errorf("Error in cmd.Process check %q", rule.Path)
|
||||
break
|
||||
}
|
||||
|
||||
m.Lock()
|
||||
timeout = true
|
||||
m.Unlock()
|
||||
|
||||
err := util.Kill(cmd)
|
||||
if err != nil {
|
||||
klog.Errorf("Error in kill process %d, %v", cmd.Process.Pid, err)
|
||||
}
|
||||
case <-waitChan:
|
||||
return
|
||||
}
|
||||
}()
|
||||
|
||||
var (
|
||||
wg sync.WaitGroup
|
||||
stdout []byte
|
||||
|
@ -174,30 +211,30 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||
|
||||
wg.Add(2)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
stdout, stdoutErr = readFromReader(stdoutPipe, maxCustomPluginBufferBytes)
|
||||
wg.Done()
|
||||
}()
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
stderr, stderrErr = readFromReader(stderrPipe, maxCustomPluginBufferBytes)
|
||||
wg.Done()
|
||||
}()
|
||||
// This will wait for the reads to complete. If the execution times out, the pipes
|
||||
// will be closed and the wait group unblocks.
|
||||
wg.Wait()
|
||||
|
||||
if stdoutErr != nil {
|
||||
glog.Errorf("Error reading stdout for plugin %q: error - %v", rule.Path, err)
|
||||
klog.Errorf("Error reading stdout for plugin %q: error - %v", rule.Path, err)
|
||||
return cpmtypes.Unknown, "Error reading stdout for plugin. Please check the error log"
|
||||
}
|
||||
|
||||
if stderrErr != nil {
|
||||
glog.Errorf("Error reading stderr for plugin %q: error - %v", rule.Path, err)
|
||||
klog.Errorf("Error reading stderr for plugin %q: error - %v", rule.Path, err)
|
||||
return cpmtypes.Unknown, "Error reading stderr for plugin. Please check the error log"
|
||||
}
|
||||
|
||||
if err := cmd.Wait(); err != nil {
|
||||
if _, ok := err.(*exec.ExitError); !ok {
|
||||
glog.Errorf("Error in waiting for plugin %q: error - %v. output - %q", rule.Path, err, string(stdout))
|
||||
klog.Errorf("Error in waiting for plugin %q: error - %v. output - %q", rule.Path, err, string(stdout))
|
||||
return cpmtypes.Unknown, "Error in waiting for plugin. Please check the error log"
|
||||
}
|
||||
}
|
||||
|
@ -206,7 +243,11 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||
output = string(stdout)
|
||||
output = strings.TrimSpace(output)
|
||||
|
||||
if cmd.ProcessState.Sys().(syscall.WaitStatus).Signaled() {
|
||||
m.Lock()
|
||||
cmdKilled := timeout
|
||||
m.Unlock()
|
||||
|
||||
if cmdKilled {
|
||||
output = fmt.Sprintf("Timeout when running plugin %q: state - %s. output - %q", rule.Path, cmd.ProcessState.String(), output)
|
||||
}
|
||||
|
||||
|
@ -229,14 +270,15 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
|
|||
}
|
||||
}
|
||||
|
||||
// Stop the plugin.
|
||||
func (p *Plugin) Stop() {
|
||||
p.tomb.Stop()
|
||||
glog.Info("Stop plugin execution")
|
||||
klog.Info("Stop plugin execution")
|
||||
}
|
||||
|
||||
func logPluginStderr(rule cpmtypes.CustomRule, logs string, logLevel glog.Level) {
|
||||
func logPluginStderr(rule cpmtypes.CustomRule, logs string, logLevel klog.Level) {
|
||||
if len(logs) != 0 {
|
||||
glog.V(logLevel).Infof("Start logs from plugin %+v \n %s", rule, logs)
|
||||
glog.V(logLevel).Infof("End logs from plugin %+v", rule)
|
||||
klog.V(logLevel).Infof("Start logs from plugin %+v \n %s", rule, logs)
|
||||
klog.V(logLevel).Infof("End logs from plugin %+v", rule)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||
package plugin
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
|
@ -25,6 +26,13 @@ import (
|
|||
|
||||
func TestNewPluginRun(t *testing.T) {
|
||||
ruleTimeout := 1 * time.Second
|
||||
timeoutExitStatus := cpmtypes.Unknown
|
||||
ext := "sh"
|
||||
|
||||
if runtime.GOOS == "windows" {
|
||||
ext = "cmd"
|
||||
timeoutExitStatus = cpmtypes.NonOK
|
||||
}
|
||||
|
||||
utMetas := map[string]struct {
|
||||
Rule cpmtypes.CustomRule
|
||||
|
@ -33,7 +41,7 @@ func TestNewPluginRun(t *testing.T) {
|
|||
}{
|
||||
"ok": {
|
||||
Rule: cpmtypes.CustomRule{
|
||||
Path: "./test-data/ok.sh",
|
||||
Path: "./test-data/ok." + ext,
|
||||
Timeout: &ruleTimeout,
|
||||
},
|
||||
ExitStatus: cpmtypes.OK,
|
||||
|
@ -41,7 +49,7 @@ func TestNewPluginRun(t *testing.T) {
|
|||
},
|
||||
"non-ok": {
|
||||
Rule: cpmtypes.CustomRule{
|
||||
Path: "./test-data/non-ok.sh",
|
||||
Path: "./test-data/non-ok." + ext,
|
||||
Timeout: &ruleTimeout,
|
||||
},
|
||||
ExitStatus: cpmtypes.NonOK,
|
||||
|
@ -49,7 +57,7 @@ func TestNewPluginRun(t *testing.T) {
|
|||
},
|
||||
"unknown": {
|
||||
Rule: cpmtypes.CustomRule{
|
||||
Path: "./test-data/unknown.sh",
|
||||
Path: "./test-data/unknown." + ext,
|
||||
Timeout: &ruleTimeout,
|
||||
},
|
||||
ExitStatus: cpmtypes.Unknown,
|
||||
|
@ -57,6 +65,7 @@ func TestNewPluginRun(t *testing.T) {
|
|||
},
|
||||
"non executable": {
|
||||
Rule: cpmtypes.CustomRule{
|
||||
// Intentionally run .sh for Windows, this is meant to be not executable.
|
||||
Path: "./test-data/non-executable.sh",
|
||||
Timeout: &ruleTimeout,
|
||||
},
|
||||
|
@ -65,7 +74,7 @@ func TestNewPluginRun(t *testing.T) {
|
|||
},
|
||||
"longer than 80 stdout with ok exit status": {
|
||||
Rule: cpmtypes.CustomRule{
|
||||
Path: "./test-data/longer-than-80-stdout-with-ok-exit-status.sh",
|
||||
Path: "./test-data/longer-than-80-stdout-with-ok-exit-status." + ext,
|
||||
Timeout: &ruleTimeout,
|
||||
},
|
||||
ExitStatus: cpmtypes.OK,
|
||||
|
@ -73,7 +82,7 @@ func TestNewPluginRun(t *testing.T) {
|
|||
},
|
||||
"non defined exit status": {
|
||||
Rule: cpmtypes.CustomRule{
|
||||
Path: "./test-data/non-defined-exit-status.sh",
|
||||
Path: "./test-data/non-defined-exit-status." + ext,
|
||||
Timeout: &ruleTimeout,
|
||||
},
|
||||
ExitStatus: cpmtypes.Unknown,
|
||||
|
@ -81,29 +90,32 @@ func TestNewPluginRun(t *testing.T) {
|
|||
},
|
||||
"sleep 3 second with ok exit status": {
|
||||
Rule: cpmtypes.CustomRule{
|
||||
Path: "./test-data/sleep-3-second-with-ok-exit-status.sh",
|
||||
Path: "./test-data/sleep-3-second-with-ok-exit-status." + ext,
|
||||
Timeout: &ruleTimeout,
|
||||
},
|
||||
ExitStatus: cpmtypes.Unknown,
|
||||
Output: `Timeout when running plugin "./test-data/sleep-3-second-with-ok-exit-status.sh": state - signal: killed. output - ""`,
|
||||
ExitStatus: timeoutExitStatus,
|
||||
Output: `Timeout when running plugin "./test-data/sleep-3-second-with-ok-exit-status.` + ext + `": state - signal: killed. output - ""`,
|
||||
},
|
||||
}
|
||||
|
||||
conf := cpmtypes.CustomPluginConfig{}
|
||||
(&conf).ApplyConfiguration()
|
||||
p := Plugin{config: conf}
|
||||
for desp, utMeta := range utMetas {
|
||||
gotExitStatus, gotOutput := p.run(utMeta.Rule)
|
||||
// cut at position max_output_length if expected output is longer than max_output_length bytes
|
||||
if len(utMeta.Output) > *p.config.PluginGlobalConfig.MaxOutputLength {
|
||||
utMeta.Output = utMeta.Output[:*p.config.PluginGlobalConfig.MaxOutputLength]
|
||||
}
|
||||
if gotExitStatus != utMeta.ExitStatus || gotOutput != utMeta.Output {
|
||||
t.Errorf("%s", desp)
|
||||
t.Errorf("Error in run plugin and get exit status and output for %q. "+
|
||||
"Got exit status: %v, Expected exit status: %v. "+
|
||||
"Got output: %q, Expected output: %q",
|
||||
utMeta.Rule.Path, gotExitStatus, utMeta.ExitStatus, gotOutput, utMeta.Output)
|
||||
}
|
||||
for k, v := range utMetas {
|
||||
desp := k
|
||||
utMeta := v
|
||||
t.Run(desp, func(t *testing.T) {
|
||||
conf := cpmtypes.CustomPluginConfig{}
|
||||
(&conf).ApplyConfiguration()
|
||||
p := Plugin{config: conf}
|
||||
gotExitStatus, gotOutput := p.run(utMeta.Rule)
|
||||
// cut at position max_output_length if expected output is longer than max_output_length bytes
|
||||
if len(utMeta.Output) > *p.config.PluginGlobalConfig.MaxOutputLength {
|
||||
utMeta.Output = utMeta.Output[:*p.config.PluginGlobalConfig.MaxOutputLength]
|
||||
}
|
||||
if gotExitStatus != utMeta.ExitStatus || gotOutput != utMeta.Output {
|
||||
t.Errorf("Error in run plugin and get exit status and output for %q. "+
|
||||
"Got exit status: %v, Expected exit status: %v. "+
|
||||
"Got output: %q, Expected output: %q",
|
||||
utMeta.Rule.Path, gotExitStatus, utMeta.ExitStatus, gotOutput, utMeta.Output)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
@echo off

REM Test fixture for the custom plugin monitor: emits a stdout line longer
REM than 80 characters and exits with status 0 (the "ok" exit status).
echo 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
exit 0
|
|
@ -0,0 +1,4 @@
|
|||
@echo off

REM Test fixture for the custom plugin monitor: exits with status 100,
REM a value outside the plugin protocol's defined statuses (0/1/3), so the
REM monitor should treat the result as "unknown".
echo NON-DEFINED-EXIT-STATUS
exit 100
|
|
@ -0,0 +1,4 @@
|
|||
@echo off

REM Test fixture for the custom plugin monitor: prints "NonOK" and exits
REM with status 1 (the "non-ok" exit status).
echo NonOK
exit 1
|
|
@ -0,0 +1,4 @@
|
|||
@echo off

REM Test fixture for the custom plugin monitor: prints "OK" and exits with
REM status 0 (the "ok" exit status).
echo OK
exit 0
|
|
@ -0,0 +1,5 @@
|
|||
@echo off

REM Test fixture for the custom plugin monitor's timeout handling: the ping
REM loop approximates "sleep" on Windows (3 echo requests ~1s apart), so the
REM plugin runner's 1-second rule timeout fires before the script finishes.
ping 127.0.0.1 -n 3 > nul
echo SLEEP 3S SECOND
exit 0
|
|
@ -0,0 +1,4 @@
|
|||
@echo off
|
||||
|
||||
echo UNKNOWN
|
||||
exit 3
|
|
@ -33,6 +33,7 @@ var (
|
|||
defaultConcurrency = 3
|
||||
defaultMessageChangeBasedConditionUpdate = false
|
||||
defaultEnableMetricsReporting = true
|
||||
defaultSkipInitialStatus = false
|
||||
|
||||
customPluginName = "custom"
|
||||
)
|
||||
|
@ -52,9 +53,11 @@ type pluginGlobalConfig struct {
|
|||
Concurrency *int `json:"concurrency,omitempty"`
|
||||
// EnableMessageChangeBasedConditionUpdate indicates whether NPD should enable message change based condition update.
|
||||
EnableMessageChangeBasedConditionUpdate *bool `json:"enable_message_change_based_condition_update,omitempty"`
|
||||
// SkipInitialStatus prevents the first status update with default conditions
|
||||
SkipInitialStatus *bool `json:"skip_initial_status,omitempty"`
|
||||
}
|
||||
|
||||
// Custom plugin config is the configuration of custom plugin monitor.
|
||||
// CustomPluginConfig is the configuration of custom plugin monitor.
|
||||
type CustomPluginConfig struct {
|
||||
// Plugin is the name of plugin which is currently used.
|
||||
// Currently supported: custom.
|
||||
|
@ -105,6 +108,10 @@ func (cpc *CustomPluginConfig) ApplyConfiguration() error {
|
|||
cpc.PluginGlobalConfig.EnableMessageChangeBasedConditionUpdate = &defaultMessageChangeBasedConditionUpdate
|
||||
}
|
||||
|
||||
if cpc.PluginGlobalConfig.SkipInitialStatus == nil {
|
||||
cpc.PluginGlobalConfig.SkipInitialStatus = &defaultSkipInitialStatus
|
||||
}
|
||||
|
||||
for _, rule := range cpc.Rules {
|
||||
if rule.TimeoutString != nil {
|
||||
timeout, err := time.ParseDuration(*rule.TimeoutString)
|
||||
|
|
|
@ -33,6 +33,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
concurrency := 2
|
||||
messageChangeBasedConditionUpdate := true
|
||||
disableMetricsReporting := false
|
||||
disableInitialStatusUpdate := true
|
||||
|
||||
ruleTimeout := 1 * time.Second
|
||||
ruleTimeoutString := ruleTimeout.String()
|
||||
|
@ -62,6 +63,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
Rules: []*CustomRule{
|
||||
|
@ -91,6 +93,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
|
@ -110,6 +113,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
|
@ -129,6 +133,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &maxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
|
@ -148,6 +153,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &concurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
|
@ -167,6 +173,7 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &messageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
|
@ -184,10 +191,30 @@ func TestCustomPluginConfigApplyConfiguration(t *testing.T) {
|
|||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &defaultSkipInitialStatus,
|
||||
},
|
||||
EnableMetricsReporting: &disableMetricsReporting,
|
||||
},
|
||||
},
|
||||
"disable status update during initialization": {
|
||||
Orig: CustomPluginConfig{PluginGlobalConfig: pluginGlobalConfig{
|
||||
SkipInitialStatus: &disableInitialStatusUpdate,
|
||||
},
|
||||
},
|
||||
Wanted: CustomPluginConfig{
|
||||
PluginGlobalConfig: pluginGlobalConfig{
|
||||
InvokeIntervalString: &defaultInvokeIntervalString,
|
||||
InvokeInterval: &defaultInvokeInterval,
|
||||
TimeoutString: &defaultGlobalTimeoutString,
|
||||
Timeout: &defaultGlobalTimeout,
|
||||
MaxOutputLength: &defaultMaxOutputLength,
|
||||
Concurrency: &defaultConcurrency,
|
||||
EnableMessageChangeBasedConditionUpdate: &defaultMessageChangeBasedConditionUpdate,
|
||||
SkipInitialStatus: &disableInitialStatusUpdate,
|
||||
},
|
||||
EnableMetricsReporting: &defaultEnableMetricsReporting,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for desp, utMeta := range utMetas {
|
||||
|
|
|
@ -17,8 +17,9 @@ limitations under the License.
|
|||
package types
|
||||
|
||||
import (
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
"time"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
)
|
||||
|
||||
type Status int
|
||||
|
|
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||
package condition
|
||||
|
||||
import (
|
||||
"context"
|
||||
"reflect"
|
||||
"sync"
|
||||
"time"
|
||||
|
@ -25,10 +26,10 @@ import (
|
|||
"k8s.io/node-problem-detector/pkg/types"
|
||||
problemutil "k8s.io/node-problem-detector/pkg/util"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/utils/clock"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -49,7 +50,7 @@ const (
|
|||
// not. This addresses 3).
|
||||
type ConditionManager interface {
|
||||
// Start starts the condition manager.
|
||||
Start()
|
||||
Start(ctx context.Context)
|
||||
// UpdateCondition updates a specific condition.
|
||||
UpdateCondition(types.Condition)
|
||||
// GetConditions returns all current conditions.
|
||||
|
@ -67,7 +68,7 @@ type conditionManager struct {
|
|||
// No lock is needed in `sync`, because it is in the same goroutine with the
|
||||
// write operation.
|
||||
sync.RWMutex
|
||||
clock clock.Clock
|
||||
clock clock.WithTicker
|
||||
latestTry time.Time
|
||||
resyncNeeded bool
|
||||
client problemclient.Client
|
||||
|
@ -78,18 +79,18 @@ type conditionManager struct {
|
|||
}
|
||||
|
||||
// NewConditionManager creates a condition manager.
|
||||
func NewConditionManager(client problemclient.Client, clock clock.Clock, heartbeatPeriod time.Duration) ConditionManager {
|
||||
func NewConditionManager(client problemclient.Client, clockInUse clock.WithTicker, heartbeatPeriod time.Duration) ConditionManager {
|
||||
return &conditionManager{
|
||||
client: client,
|
||||
clock: clock,
|
||||
clock: clockInUse,
|
||||
updates: make(map[string]types.Condition),
|
||||
conditions: make(map[string]types.Condition),
|
||||
heartbeatPeriod: heartbeatPeriod,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *conditionManager) Start() {
|
||||
go c.syncLoop()
|
||||
func (c *conditionManager) Start(ctx context.Context) {
|
||||
go c.syncLoop(ctx)
|
||||
}
|
||||
|
||||
func (c *conditionManager) UpdateCondition(condition types.Condition) {
|
||||
|
@ -110,15 +111,17 @@ func (c *conditionManager) GetConditions() []types.Condition {
|
|||
return conditions
|
||||
}
|
||||
|
||||
func (c *conditionManager) syncLoop() {
|
||||
func (c *conditionManager) syncLoop(ctx context.Context) {
|
||||
ticker := c.clock.NewTicker(updatePeriod)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C():
|
||||
if c.needUpdates() || c.needResync() || c.needHeartbeat() {
|
||||
c.sync()
|
||||
c.sync(ctx)
|
||||
}
|
||||
case <-ctx.Done():
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -150,16 +153,16 @@ func (c *conditionManager) needHeartbeat() bool {
|
|||
}
|
||||
|
||||
// sync synchronizes node conditions with the apiserver.
|
||||
func (c *conditionManager) sync() {
|
||||
func (c *conditionManager) sync(ctx context.Context) {
|
||||
c.latestTry = c.clock.Now()
|
||||
c.resyncNeeded = false
|
||||
conditions := []v1.NodeCondition{}
|
||||
for i := range c.conditions {
|
||||
conditions = append(conditions, problemutil.ConvertToAPICondition(c.conditions[i]))
|
||||
}
|
||||
if err := c.client.SetConditions(conditions); err != nil {
|
||||
if err := c.client.SetConditions(ctx, conditions); err != nil {
|
||||
// The conditions will be updated again in future sync
|
||||
glog.Errorf("failed to update node conditions: %v", err)
|
||||
klog.Errorf("failed to update node conditions: %v", err)
|
||||
c.resyncNeeded = true
|
||||
return
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||
package condition
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
@ -27,15 +28,15 @@ import (
|
|||
"k8s.io/node-problem-detector/pkg/types"
|
||||
problemutil "k8s.io/node-problem-detector/pkg/util"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
testclock "k8s.io/utils/clock/testing"
|
||||
)
|
||||
|
||||
const heartbeatPeriod = 1 * time.Minute
|
||||
|
||||
func newTestManager() (*conditionManager, *problemclient.FakeProblemClient, *clock.FakeClock) {
|
||||
func newTestManager() (*conditionManager, *problemclient.FakeProblemClient, *testclock.FakeClock) {
|
||||
fakeClient := problemclient.NewFakeProblemClient()
|
||||
fakeClock := clock.NewFakeClock(time.Now())
|
||||
fakeClock := testclock.NewFakeClock(time.Now())
|
||||
manager := NewConditionManager(fakeClient, fakeClock, heartbeatPeriod)
|
||||
return manager.(*conditionManager), fakeClient, fakeClock
|
||||
}
|
||||
|
@ -53,33 +54,43 @@ func newTestCondition(condition string) types.Condition {
|
|||
func TestNeedUpdates(t *testing.T) {
|
||||
m, _, _ := newTestManager()
|
||||
var c types.Condition
|
||||
for desc, test := range map[string]struct {
|
||||
for _, testCase := range []struct {
|
||||
name string
|
||||
condition string
|
||||
update bool
|
||||
}{
|
||||
"Init condition needs update": {
|
||||
{
|
||||
name: "Init condition needs update",
|
||||
condition: "TestCondition",
|
||||
update: true,
|
||||
},
|
||||
"Same condition doesn't need update": {
|
||||
{
|
||||
name: "Same condition doesn't need update",
|
||||
// not set condition, the test will reuse the condition in last case.
|
||||
update: false,
|
||||
},
|
||||
"Same condition with different timestamp need update": {
|
||||
{
|
||||
name: "Same condition with different timestamp need update",
|
||||
condition: "TestCondition",
|
||||
update: true,
|
||||
},
|
||||
"New condition needs update": {
|
||||
{
|
||||
name: "New condition needs update",
|
||||
condition: "TestConditionNew",
|
||||
update: true,
|
||||
},
|
||||
} {
|
||||
if test.condition != "" {
|
||||
c = newTestCondition(test.condition)
|
||||
tc := testCase
|
||||
t.Log(tc.name)
|
||||
if tc.condition != "" {
|
||||
// Guarantee that the time advances before creating a new condition.
|
||||
for now := time.Now(); now == time.Now(); {
|
||||
}
|
||||
c = newTestCondition(tc.condition)
|
||||
}
|
||||
m.UpdateCondition(c)
|
||||
assert.Equal(t, test.update, m.needUpdates(), desc)
|
||||
assert.Equal(t, c, m.conditions[c.Type], desc)
|
||||
assert.Equal(t, tc.update, m.needUpdates(), tc.name)
|
||||
assert.Equal(t, c, m.conditions[c.Type], tc.name)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -99,7 +110,7 @@ func TestResync(t *testing.T) {
|
|||
m, fakeClient, fakeClock := newTestManager()
|
||||
condition := newTestCondition("TestCondition")
|
||||
m.conditions = map[string]types.Condition{condition.Type: condition}
|
||||
m.sync()
|
||||
m.sync(context.Background())
|
||||
expected := []v1.NodeCondition{problemutil.ConvertToAPICondition(condition)}
|
||||
assert.Nil(t, fakeClient.AssertConditions(expected), "Condition should be updated via client")
|
||||
|
||||
|
@ -108,7 +119,7 @@ func TestResync(t *testing.T) {
|
|||
assert.False(t, m.needResync(), "Should not resync after resync period without resync needed")
|
||||
|
||||
fakeClient.InjectError("SetConditions", fmt.Errorf("injected error"))
|
||||
m.sync()
|
||||
m.sync(context.Background())
|
||||
|
||||
assert.False(t, m.needResync(), "Should not resync before resync period")
|
||||
fakeClock.Step(resyncPeriod)
|
||||
|
@ -119,7 +130,7 @@ func TestHeartbeat(t *testing.T) {
|
|||
m, fakeClient, fakeClock := newTestManager()
|
||||
condition := newTestCondition("TestCondition")
|
||||
m.conditions = map[string]types.Condition{condition.Type: condition}
|
||||
m.sync()
|
||||
m.sync(context.Background())
|
||||
expected := []v1.NodeCondition{problemutil.ConvertToAPICondition(condition)}
|
||||
assert.Nil(t, fakeClient.AssertConditions(expected), "Condition should be updated via client")
|
||||
|
||||
|
|
|
@ -17,15 +17,16 @@ limitations under the License.
|
|||
package k8sexporter
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net"
|
||||
"net/http"
|
||||
_ "net/http/pprof"
|
||||
"net/http/pprof"
|
||||
"strconv"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
"k8s.io/utils/clock"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
"k8s.io/node-problem-detector/pkg/exporters/k8sexporter/condition"
|
||||
|
@ -37,6 +38,8 @@ import (
|
|||
type k8sExporter struct {
|
||||
client problemclient.Client
|
||||
conditionManager condition.ConditionManager
|
||||
writeEvents bool
|
||||
updateConditions bool
|
||||
}
|
||||
|
||||
// NewExporterOrDie creates a exporter for Kubernetes apiserver exporting,
|
||||
|
@ -44,35 +47,41 @@ type k8sExporter struct {
|
|||
//
|
||||
// Note that this function may be blocked (until a timeout occurs) before
|
||||
// kube-apiserver becomes ready.
|
||||
func NewExporterOrDie(npdo *options.NodeProblemDetectorOptions) types.Exporter {
|
||||
func NewExporterOrDie(ctx context.Context, npdo *options.NodeProblemDetectorOptions) types.Exporter {
|
||||
if !npdo.EnableK8sExporter {
|
||||
return nil
|
||||
}
|
||||
|
||||
c := problemclient.NewClientOrDie(npdo)
|
||||
|
||||
glog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout)
|
||||
if err := waitForAPIServerReadyWithTimeout(c, npdo); err != nil {
|
||||
glog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err)
|
||||
klog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout)
|
||||
if err := waitForAPIServerReadyWithTimeout(ctx, c, npdo); err != nil {
|
||||
klog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err)
|
||||
}
|
||||
|
||||
ke := k8sExporter{
|
||||
client: c,
|
||||
conditionManager: condition.NewConditionManager(c, clock.RealClock{}, npdo.K8sExporterHeartbeatPeriod),
|
||||
writeEvents: npdo.K8sExporterWriteEvents,
|
||||
updateConditions: npdo.K8sExporterUpdateNodeConditions,
|
||||
}
|
||||
|
||||
ke.startHTTPReporting(npdo)
|
||||
ke.conditionManager.Start()
|
||||
ke.conditionManager.Start(ctx)
|
||||
|
||||
return &ke
|
||||
}
|
||||
|
||||
func (ke *k8sExporter) ExportProblems(status *types.Status) {
|
||||
for _, event := range status.Events {
|
||||
ke.client.Eventf(util.ConvertToAPIEventType(event.Severity), status.Source, event.Reason, event.Message)
|
||||
if ke.writeEvents {
|
||||
for _, event := range status.Events {
|
||||
ke.client.Eventf(util.ConvertToAPIEventType(event.Severity), status.Source, event.Reason, event.Message)
|
||||
}
|
||||
}
|
||||
for _, cdt := range status.Conditions {
|
||||
ke.conditionManager.UpdateCondition(cdt)
|
||||
if ke.updateConditions {
|
||||
for _, cdt := range status.Conditions {
|
||||
ke.conditionManager.UpdateCondition(cdt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -94,22 +103,30 @@ func (ke *k8sExporter) startHTTPReporting(npdo *options.NodeProblemDetectorOptio
|
|||
util.ReturnHTTPJson(w, ke.conditionManager.GetConditions())
|
||||
})
|
||||
|
||||
// register pprof
|
||||
mux.HandleFunc("/debug/pprof/", pprof.Index)
|
||||
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
|
||||
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
|
||||
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
|
||||
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
|
||||
|
||||
addr := net.JoinHostPort(npdo.ServerAddress, strconv.Itoa(npdo.ServerPort))
|
||||
go func() {
|
||||
err := http.ListenAndServe(addr, mux)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to start server: %v", err)
|
||||
klog.Fatalf("Failed to start server: %v", err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func waitForAPIServerReadyWithTimeout(c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error {
|
||||
return wait.PollImmediate(npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, func() (done bool, err error) {
|
||||
func waitForAPIServerReadyWithTimeout(ctx context.Context, c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error {
|
||||
return wait.PollUntilContextTimeout(ctx, npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, true, func(ctx context.Context) (done bool, err error) {
|
||||
// If NPD can get the node object from kube-apiserver, the server is
|
||||
// ready and the RBAC permission is set correctly.
|
||||
if _, err := c.GetNode(); err == nil {
|
||||
return true, nil
|
||||
if _, err := c.GetNode(ctx); err != nil {
|
||||
klog.Errorf("Can't get node object: %v", err)
|
||||
return false, err
|
||||
}
|
||||
return false, nil
|
||||
return true, nil
|
||||
})
|
||||
}
|
||||
|
|
|
@ -12,12 +12,12 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package kubernetes
|
||||
package problemclient
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/url"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"k8s.io/apimachinery/pkg/runtime/schema"
|
||||
|
@ -57,7 +57,7 @@ func getConfigOverrides(uri *url.URL) (*kubeClientCmd.ConfigOverrides, error) {
|
|||
return &kubeConfigOverride, nil
|
||||
}
|
||||
|
||||
func GetKubeClientConfig(uri *url.URL) (*kube_rest.Config, error) {
|
||||
func getKubeClientConfig(uri *url.URL) (*kube_rest.Config, error) {
|
||||
var (
|
||||
kubeConfig *kube_rest.Config
|
||||
err error
|
||||
|
@ -137,7 +137,7 @@ func GetKubeClientConfig(uri *url.URL) (*kube_rest.Config, error) {
|
|||
|
||||
if useServiceAccount {
|
||||
// If a readable service account token exists, then use it
|
||||
if contents, err := ioutil.ReadFile(defaultServiceAccountFile); err == nil {
|
||||
if contents, err := os.ReadFile(defaultServiceAccountFile); err == nil {
|
||||
kubeConfig.BearerToken = string(contents)
|
||||
}
|
||||
}
|
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||
package problemclient
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sync"
|
||||
|
@ -60,7 +61,7 @@ func (f *FakeProblemClient) AssertConditions(expected []v1.NodeCondition) error
|
|||
}
|
||||
|
||||
// SetConditions is a fake mimic of SetConditions, it only update the internal condition cache.
|
||||
func (f *FakeProblemClient) SetConditions(conditions []v1.NodeCondition) error {
|
||||
func (f *FakeProblemClient) SetConditions(ctx context.Context, conditions []v1.NodeCondition) error {
|
||||
f.Lock()
|
||||
defer f.Unlock()
|
||||
if err, ok := f.errors["SetConditions"]; ok {
|
||||
|
@ -73,7 +74,7 @@ func (f *FakeProblemClient) SetConditions(conditions []v1.NodeCondition) error {
|
|||
}
|
||||
|
||||
// GetConditions is a fake mimic of GetConditions, it returns the conditions cached internally.
|
||||
func (f *FakeProblemClient) GetConditions(types []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
|
||||
func (f *FakeProblemClient) GetConditions(ctx context.Context, types []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
|
||||
f.Lock()
|
||||
defer f.Unlock()
|
||||
if err, ok := f.errors["GetConditions"]; ok {
|
||||
|
@ -93,6 +94,6 @@ func (f *FakeProblemClient) GetConditions(types []v1.NodeConditionType) ([]*v1.N
|
|||
func (f *FakeProblemClient) Eventf(eventType string, source, reason, messageFmt string, args ...interface{}) {
|
||||
}
|
||||
|
||||
func (f *FakeProblemClient) GetNode() (*v1.Node, error) {
|
||||
func (f *FakeProblemClient) GetNode(ctx context.Context) (*v1.Node, error) {
|
||||
return nil, fmt.Errorf("GetNode() not implemented")
|
||||
}
|
||||
|
|
|
@ -17,24 +17,24 @@ limitations under the License.
|
|||
package problemclient
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
|
||||
"k8s.io/kubernetes/pkg/api/legacyscheme"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
clientset "k8s.io/client-go/kubernetes"
|
||||
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
|
||||
"k8s.io/client-go/tools/record"
|
||||
"k8s.io/client-go/util/retry"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/utils/clock"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/heapster/common/kubernetes"
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
"k8s.io/node-problem-detector/pkg/version"
|
||||
)
|
||||
|
@ -42,14 +42,14 @@ import (
|
|||
// Client is the interface of problem client
|
||||
type Client interface {
|
||||
// GetConditions get all specific conditions of current node.
|
||||
GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error)
|
||||
GetConditions(ctx context.Context, conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error)
|
||||
// SetConditions set or update conditions of current node.
|
||||
SetConditions(conditions []v1.NodeCondition) error
|
||||
SetConditions(ctx context.Context, conditionTypes []v1.NodeCondition) error
|
||||
// Eventf reports the event.
|
||||
Eventf(eventType string, source, reason, messageFmt string, args ...interface{})
|
||||
// GetNode returns the Node object of the node on which the
|
||||
// node-problem-detector runs.
|
||||
GetNode() (*v1.Node, error)
|
||||
GetNode(ctx context.Context) (*v1.Node, error)
|
||||
}
|
||||
|
||||
type nodeProblemClient struct {
|
||||
|
@ -68,13 +68,14 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client {
|
|||
// we have checked it is a valid URI after command line argument is parsed.:)
|
||||
uri, _ := url.Parse(npdo.ApiServerOverride)
|
||||
|
||||
cfg, err := kubernetes.GetKubeClientConfig(uri)
|
||||
cfg, err := getKubeClientConfig(uri)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
cfg.UserAgent = fmt.Sprintf("%s/%s", filepath.Base(os.Args[0]), version.Version())
|
||||
// TODO(random-liu): Set QPS Limit
|
||||
cfg.QPS = npdo.QPS
|
||||
cfg.Burst = npdo.Burst
|
||||
c.client = clientset.NewForConfigOrDie(cfg).CoreV1()
|
||||
c.nodeName = npdo.NodeName
|
||||
c.eventNamespace = npdo.EventNamespace
|
||||
|
@ -83,8 +84,8 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client {
|
|||
return c
|
||||
}
|
||||
|
||||
func (c *nodeProblemClient) GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
|
||||
node, err := c.GetNode()
|
||||
func (c *nodeProblemClient) GetConditions(ctx context.Context, conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) {
|
||||
node, err := c.GetNode(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -99,7 +100,7 @@ func (c *nodeProblemClient) GetConditions(conditionTypes []v1.NodeConditionType)
|
|||
return conditions, nil
|
||||
}
|
||||
|
||||
func (c *nodeProblemClient) SetConditions(newConditions []v1.NodeCondition) error {
|
||||
func (c *nodeProblemClient) SetConditions(ctx context.Context, newConditions []v1.NodeCondition) error {
|
||||
for i := range newConditions {
|
||||
// Each time we update the conditions, we update the heart beat time
|
||||
newConditions[i].LastHeartbeatTime = metav1.NewTime(c.clock.Now())
|
||||
|
@ -108,7 +109,15 @@ func (c *nodeProblemClient) SetConditions(newConditions []v1.NodeCondition) erro
|
|||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return c.client.RESTClient().Patch(types.StrategicMergePatchType).Resource("nodes").Name(c.nodeName).SubResource("status").Body(patch).Do().Error()
|
||||
return retry.OnError(retry.DefaultRetry,
|
||||
func(error) bool {
|
||||
return true
|
||||
},
|
||||
func() error {
|
||||
_, err := c.client.Nodes().PatchStatus(ctx, c.nodeName, patch)
|
||||
return err
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string, args ...interface{}) {
|
||||
|
@ -121,8 +130,10 @@ func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string,
|
|||
recorder.Eventf(c.nodeRef, eventType, reason, messageFmt, args...)
|
||||
}
|
||||
|
||||
func (c *nodeProblemClient) GetNode() (*v1.Node, error) {
|
||||
return c.client.Nodes().Get(c.nodeName, metav1.GetOptions{})
|
||||
func (c *nodeProblemClient) GetNode(ctx context.Context) (*v1.Node, error) {
|
||||
// To reduce the load on APIServer & etcd, we are serving GET operations from
|
||||
// apiserver cache (the data might be slightly delayed).
|
||||
return c.client.Nodes().Get(ctx, c.nodeName, metav1.GetOptions{ResourceVersion: "0"})
|
||||
}
|
||||
|
||||
// generatePatch generates condition patch
|
||||
|
@ -137,8 +148,8 @@ func generatePatch(conditions []v1.NodeCondition) ([]byte, error) {
|
|||
// getEventRecorder generates a recorder for specific node name and source.
|
||||
func getEventRecorder(c typedcorev1.CoreV1Interface, namespace, nodeName, source string) record.EventRecorder {
|
||||
eventBroadcaster := record.NewBroadcaster()
|
||||
eventBroadcaster.StartLogging(glog.V(4).Infof)
|
||||
recorder := eventBroadcaster.NewRecorder(legacyscheme.Scheme, v1.EventSource{Component: source, Host: nodeName})
|
||||
eventBroadcaster.StartLogging(klog.V(4).Infof)
|
||||
recorder := eventBroadcaster.NewRecorder(runtime.NewScheme(), v1.EventSource{Component: source, Host: nodeName})
|
||||
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: c.Events(namespace)})
|
||||
return recorder
|
||||
}
|
||||
|
|
|
@ -22,10 +22,10 @@ import (
|
|||
"testing"
|
||||
"time"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
"k8s.io/client-go/tools/record"
|
||||
testclock "k8s.io/utils/clock/testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
@ -40,7 +40,7 @@ func newFakeProblemClient() *nodeProblemClient {
|
|||
nodeName: testNode,
|
||||
// There is no proper fake for *client.Client for now
|
||||
// TODO(random-liu): Add test for SetConditions when we have good fake for *client.Client
|
||||
clock: &clock.FakeClock{},
|
||||
clock: testclock.NewFakeClock(time.Now()),
|
||||
recorders: make(map[string]record.EventRecorder),
|
||||
nodeRef: getNodeRef("", testNode),
|
||||
}
|
||||
|
|
|
@ -22,8 +22,8 @@ import (
|
|||
"strconv"
|
||||
|
||||
"contrib.go.opencensus.io/exporter/prometheus"
|
||||
"github.com/golang/glog"
|
||||
"go.opencensus.io/stats/view"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/options"
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
|
@ -40,13 +40,13 @@ func NewExporterOrDie(npdo *options.NodeProblemDetectorOptions) types.Exporter {
|
|||
addr := net.JoinHostPort(npdo.PrometheusServerAddress, strconv.Itoa(npdo.PrometheusServerPort))
|
||||
pe, err := prometheus.NewExporter(prometheus.Options{})
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to create Prometheus exporter: %v", err)
|
||||
klog.Fatalf("Failed to create Prometheus exporter: %v", err)
|
||||
}
|
||||
go func() {
|
||||
mux := http.NewServeMux()
|
||||
mux.Handle("/metrics", pe)
|
||||
if err := http.ListenAndServe(addr, mux); err != nil {
|
||||
glog.Fatalf("Failed to start Prometheus scrape endpoint: %v", err)
|
||||
klog.Fatalf("Failed to start Prometheus scrape endpoint: %v", err)
|
||||
}
|
||||
}()
|
||||
view.RegisterExporter(pe)
|
||||
|
|
|
@ -18,7 +18,7 @@ package gce
|
|||
|
||||
import (
|
||||
"cloud.google.com/go/compute/metadata"
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
|
||||
type Metadata struct {
|
||||
|
@ -37,7 +37,7 @@ func (md *Metadata) HasMissingField() bool {
|
|||
|
||||
func (md *Metadata) PopulateFromGCE() error {
|
||||
var err error
|
||||
glog.Info("Fetching GCE metadata from metadata server")
|
||||
klog.Info("Fetching GCE metadata from metadata server")
|
||||
if md.ProjectID == "" {
|
||||
md.ProjectID, err = metadata.ProjectID()
|
||||
if err != nil {
|
||||
|
|
|
@ -18,19 +18,19 @@ package stackdriverexporter
|
|||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"time"
|
||||
|
||||
"contrib.go.opencensus.io/exporter/stackdriver"
|
||||
monitoredres "contrib.go.opencensus.io/exporter/stackdriver/monitoredresource"
|
||||
"github.com/golang/glog"
|
||||
"github.com/spf13/pflag"
|
||||
"go.opencensus.io/stats/view"
|
||||
"google.golang.org/api/option"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"github.com/avast/retry-go"
|
||||
"github.com/avast/retry-go/v4"
|
||||
"k8s.io/node-problem-detector/pkg/exporters"
|
||||
seconfig "k8s.io/node-problem-detector/pkg/exporters/stackdriver/config"
|
||||
"k8s.io/node-problem-detector/pkg/types"
|
||||
|
@ -49,8 +49,12 @@ const exporterName = "stackdriver"
|
|||
var NPDMetricToSDMetric = map[metrics.MetricID]string{
|
||||
metrics.CPURunnableTaskCountID: "compute.googleapis.com/guest/cpu/runnable_task_count",
|
||||
metrics.CPUUsageTimeID: "compute.googleapis.com/guest/cpu/usage_time",
|
||||
metrics.CPULoad1m: "compute.googleapis.com/guest/cpu/load_1m",
|
||||
metrics.CPULoad5m: "compute.googleapis.com/guest/cpu/load_5m",
|
||||
metrics.CPULoad15m: "compute.googleapis.com/guest/cpu/load_15m",
|
||||
metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length",
|
||||
metrics.DiskBytesUsedID: "compute.googleapis.com/guest/disk/bytes_used",
|
||||
metrics.DiskPercentUsedID: "compute.googleapis.com/guest/disk/percent_used",
|
||||
metrics.DiskIOTimeID: "compute.googleapis.com/guest/disk/io_time",
|
||||
metrics.DiskMergedOpsCountID: "compute.googleapis.com/guest/disk/merged_operation_count",
|
||||
metrics.DiskOpsBytesID: "compute.googleapis.com/guest/disk/operation_bytes_count",
|
||||
|
@ -63,8 +67,31 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
|
|||
metrics.MemoryDirtyUsedID: "compute.googleapis.com/guest/memory/dirty_used",
|
||||
metrics.MemoryPageCacheUsedID: "compute.googleapis.com/guest/memory/page_cache_used",
|
||||
metrics.MemoryUnevictableUsedID: "compute.googleapis.com/guest/memory/unevictable_used",
|
||||
metrics.MemoryPercentUsedID: "compute.googleapis.com/guest/memory/percent_used",
|
||||
metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count",
|
||||
metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state",
|
||||
metrics.OSFeatureID: "compute.googleapis.com/guest/system/os_feature_enabled",
|
||||
metrics.SystemProcessesTotal: "kubernetes.io/internal/node/guest/system/processes_total",
|
||||
metrics.SystemProcsRunning: "kubernetes.io/internal/node/guest/system/procs_running",
|
||||
metrics.SystemProcsBlocked: "kubernetes.io/internal/node/guest/system/procs_blocked",
|
||||
metrics.SystemInterruptsTotal: "kubernetes.io/internal/node/guest/system/interrupts_total",
|
||||
metrics.SystemCPUStat: "kubernetes.io/internal/node/guest/system/cpu_stat",
|
||||
metrics.NetDevRxBytes: "kubernetes.io/internal/node/guest/net/rx_bytes",
|
||||
metrics.NetDevRxPackets: "kubernetes.io/internal/node/guest/net/rx_packets",
|
||||
metrics.NetDevRxErrors: "kubernetes.io/internal/node/guest/net/rx_errors",
|
||||
metrics.NetDevRxDropped: "kubernetes.io/internal/node/guest/net/rx_dropped",
|
||||
metrics.NetDevRxFifo: "kubernetes.io/internal/node/guest/net/rx_fifo",
|
||||
metrics.NetDevRxFrame: "kubernetes.io/internal/node/guest/net/rx_frame",
|
||||
metrics.NetDevRxCompressed: "kubernetes.io/internal/node/guest/net/rx_compressed",
|
||||
metrics.NetDevRxMulticast: "kubernetes.io/internal/node/guest/net/rx_multicast",
|
||||
metrics.NetDevTxBytes: "kubernetes.io/internal/node/guest/net/tx_bytes",
|
||||
metrics.NetDevTxPackets: "kubernetes.io/internal/node/guest/net/tx_packets",
|
||||
metrics.NetDevTxErrors: "kubernetes.io/internal/node/guest/net/tx_errors",
|
||||
metrics.NetDevTxDropped: "kubernetes.io/internal/node/guest/net/tx_dropped",
|
||||
metrics.NetDevTxFifo: "kubernetes.io/internal/node/guest/net/tx_fifo",
|
||||
metrics.NetDevTxCollisions: "kubernetes.io/internal/node/guest/net/tx_collisions",
|
||||
metrics.NetDevTxCarrier: "kubernetes.io/internal/node/guest/net/tx_carrier",
|
||||
metrics.NetDevTxCompressed: "kubernetes.io/internal/node/guest/net/tx_compressed",
|
||||
}
|
||||
|
||||
func getMetricTypeConversionFunction(customMetricPrefix string) func(*view.View) string {
|
||||
|
@ -112,12 +139,12 @@ func (se *stackdriverExporter) setupOpenCensusViewExporterOrDie() {
|
|||
DefaultMonitoringLabels: &globalLabels,
|
||||
})
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to create Stackdriver OpenCensus view exporter: %v", err)
|
||||
klog.Fatalf("Failed to create Stackdriver OpenCensus view exporter: %v", err)
|
||||
}
|
||||
|
||||
exportPeriod, err := time.ParseDuration(se.config.ExportPeriod)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to parse ExportPeriod %q: %v", se.config.ExportPeriod, err)
|
||||
klog.Fatalf("Failed to parse ExportPeriod %q: %v", se.config.ExportPeriod, err)
|
||||
}
|
||||
|
||||
view.SetReportingPeriod(exportPeriod)
|
||||
|
@ -126,33 +153,33 @@ func (se *stackdriverExporter) setupOpenCensusViewExporterOrDie() {
|
|||
|
||||
func (se *stackdriverExporter) populateMetadataOrDie() {
|
||||
if !se.config.GCEMetadata.HasMissingField() {
|
||||
glog.Infof("Using GCE metadata specified in the config file: %+v", se.config.GCEMetadata)
|
||||
klog.Infof("Using GCE metadata specified in the config file: %+v", se.config.GCEMetadata)
|
||||
return
|
||||
}
|
||||
|
||||
metadataFetchTimeout, err := time.ParseDuration(se.config.MetadataFetchTimeout)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to parse MetadataFetchTimeout %q: %v", se.config.MetadataFetchTimeout, err)
|
||||
klog.Fatalf("Failed to parse MetadataFetchTimeout %q: %v", se.config.MetadataFetchTimeout, err)
|
||||
}
|
||||
|
||||
metadataFetchInterval, err := time.ParseDuration(se.config.MetadataFetchInterval)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to parse MetadataFetchInterval %q: %v", se.config.MetadataFetchInterval, err)
|
||||
klog.Fatalf("Failed to parse MetadataFetchInterval %q: %v", se.config.MetadataFetchInterval, err)
|
||||
}
|
||||
|
||||
glog.Infof("Populating GCE metadata by querying GCE metadata server.")
|
||||
klog.Infof("Populating GCE metadata by querying GCE metadata server.")
|
||||
err = retry.Do(se.config.GCEMetadata.PopulateFromGCE,
|
||||
retry.Delay(metadataFetchInterval),
|
||||
retry.Attempts(uint(metadataFetchTimeout/metadataFetchInterval)),
|
||||
retry.DelayType(retry.FixedDelay))
|
||||
if err == nil {
|
||||
glog.Infof("Using GCE metadata: %+v", se.config.GCEMetadata)
|
||||
klog.Infof("Using GCE metadata: %+v", se.config.GCEMetadata)
|
||||
return
|
||||
}
|
||||
if se.config.PanicOnMetadataFetchFailure {
|
||||
glog.Fatalf("Failed to populate GCE metadata: %v", err)
|
||||
klog.Fatalf("Failed to populate GCE metadata: %v", err)
|
||||
} else {
|
||||
glog.Errorf("Failed to populate GCE metadata: %v", err)
|
||||
klog.Errorf("Failed to populate GCE metadata: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -175,7 +202,7 @@ func (clo *commandLineOptions) SetFlags(fs *pflag.FlagSet) {
|
|||
func NewExporterOrDie(clo types.CommandLineOptions) types.Exporter {
|
||||
options, ok := clo.(*commandLineOptions)
|
||||
if !ok {
|
||||
glog.Fatalf("Wrong type for the command line options of Stackdriver Exporter: %s.", reflect.TypeOf(clo))
|
||||
klog.Fatalf("Wrong type for the command line options of Stackdriver Exporter: %s.", reflect.TypeOf(clo))
|
||||
}
|
||||
if options.configPath == "" {
|
||||
return nil
|
||||
|
@ -184,17 +211,17 @@ func NewExporterOrDie(clo types.CommandLineOptions) types.Exporter {
|
|||
se := stackdriverExporter{}
|
||||
|
||||
// Apply configurations.
|
||||
f, err := ioutil.ReadFile(options.configPath)
|
||||
f, err := os.ReadFile(options.configPath)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to read configuration file %q: %v", options.configPath, err)
|
||||
klog.Fatalf("Failed to read configuration file %q: %v", options.configPath, err)
|
||||
}
|
||||
err = json.Unmarshal(f, &se.config)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to unmarshal configuration file %q: %v", options.configPath, err)
|
||||
klog.Fatalf("Failed to unmarshal configuration file %q: %v", options.configPath, err)
|
||||
}
|
||||
se.config.ApplyConfiguration()
|
||||
|
||||
glog.Infof("Starting Stackdriver exporter %s", options.configPath)
|
||||
klog.Infof("Starting Stackdriver exporter %s", options.configPath)
|
||||
|
||||
se.populateMetadataOrDie()
|
||||
se.setupOpenCensusViewExporterOrDie()
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !disable_stackdriver_exporter
|
||||
// +build !disable_stackdriver_exporter
|
||||
|
||||
/*
|
||||
|
|
|
@ -18,22 +18,21 @@ package healthchecker
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
)
|
||||
|
||||
type healthChecker struct {
|
||||
component string
|
||||
service string
|
||||
enableRepair bool
|
||||
healthCheckFunc func() bool
|
||||
healthCheckFunc func() (bool, error)
|
||||
// The repair is "best-effort" and ignores the error from the underlying actions.
|
||||
// The bash commands to kill the process will fail if the service is down and hence ignore.
|
||||
repairFunc func()
|
||||
|
@ -41,6 +40,8 @@ type healthChecker struct {
|
|||
crictlPath string
|
||||
healthCheckTimeout time.Duration
|
||||
coolDownTime time.Duration
|
||||
loopBackTime time.Duration
|
||||
logPatternsToCheck map[string]int
|
||||
}
|
||||
|
||||
// NewHealthChecker returns a new health checker configured with the given options.
|
||||
|
@ -51,120 +52,134 @@ func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, e
|
|||
crictlPath: hco.CriCtlPath,
|
||||
healthCheckTimeout: hco.HealthCheckTimeout,
|
||||
coolDownTime: hco.CoolDownTime,
|
||||
service: hco.Service,
|
||||
loopBackTime: hco.LoopBackTime,
|
||||
logPatternsToCheck: hco.LogPatterns.GetLogPatternCountMap(),
|
||||
}
|
||||
hc.healthCheckFunc = getHealthCheckFunc(hco)
|
||||
hc.repairFunc = getRepairFunc(hco)
|
||||
hc.uptimeFunc = getUptimeFunc(hco.SystemdService)
|
||||
hc.uptimeFunc = getUptimeFunc(hco.Service)
|
||||
return hc, nil
|
||||
}
|
||||
|
||||
// getUptimeFunc returns the time for which the given service has been running.
|
||||
func getUptimeFunc(service string) func() (time.Duration, error) {
|
||||
return func() (time.Duration, error) {
|
||||
// Using InactiveExitTimestamp to capture the exact time when systemd tried starting the service. The service will
|
||||
// transition from inactive -> activating and the timestamp is captured.
|
||||
// Source : https://www.freedesktop.org/wiki/Software/systemd/dbus/
|
||||
// Using ActiveEnterTimestamp resulted in race condition where the service was repeatedly killed by plugin when
|
||||
// RestartSec of systemd and invoke interval of plugin got in sync. The service was repeatedly killed in
|
||||
// activating state and hence ActiveEnterTimestamp was never updated.
|
||||
out, err := execCommand(types.CmdTimeout, "systemctl", "show", service, "--property=InactiveExitTimestamp")
|
||||
if err != nil {
|
||||
return time.Duration(0), err
|
||||
}
|
||||
val := strings.Split(out, "=")
|
||||
if len(val) < 2 {
|
||||
return time.Duration(0), errors.New("could not parse the service uptime time correctly")
|
||||
}
|
||||
t, err := time.Parse(types.UptimeTimeLayout, val[1])
|
||||
if err != nil {
|
||||
return time.Duration(0), err
|
||||
}
|
||||
return time.Since(t), nil
|
||||
}
|
||||
}
|
||||
|
||||
// getRepairFunc returns the repair function based on the component.
|
||||
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
|
||||
switch hco.Component {
|
||||
case types.DockerComponent:
|
||||
// Use "docker ps" for docker health check. Not using crictl for docker to remove
|
||||
// dependency on the kubelet.
|
||||
return func() {
|
||||
execCommand(types.CmdTimeout, "pkill", "-SIGUSR1", "dockerd")
|
||||
execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService)
|
||||
}
|
||||
default:
|
||||
// Just kill the service for all other components
|
||||
return func() {
|
||||
execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// getHealthCheckFunc returns the health check function based on the component.
|
||||
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() bool {
|
||||
switch hco.Component {
|
||||
case types.KubeletComponent:
|
||||
return func() bool {
|
||||
httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
|
||||
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
|
||||
if err != nil || response.StatusCode != http.StatusOK {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
case types.DockerComponent:
|
||||
return func() bool {
|
||||
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
case types.CRIComponent:
|
||||
return func() bool {
|
||||
if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// CheckHealth checks for the health of the component and tries to repair if enabled.
|
||||
// Returns true if healthy, false otherwise.
|
||||
func (hc *healthChecker) CheckHealth() bool {
|
||||
healthy := hc.healthCheckFunc()
|
||||
if healthy {
|
||||
return true
|
||||
func (hc *healthChecker) CheckHealth() (bool, error) {
|
||||
healthy, err := hc.healthCheckFunc()
|
||||
if err != nil {
|
||||
return healthy, err
|
||||
}
|
||||
logPatternHealthy, err := logPatternHealthCheck(hc.service, hc.loopBackTime, hc.logPatternsToCheck)
|
||||
if err != nil {
|
||||
return logPatternHealthy, err
|
||||
}
|
||||
if healthy && logPatternHealthy {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// The service is unhealthy.
|
||||
// Attempt repair based on flag.
|
||||
if hc.enableRepair {
|
||||
// repair if the service has been up for the cool down period.
|
||||
uptime, err := hc.uptimeFunc()
|
||||
if err != nil {
|
||||
glog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
|
||||
klog.Infof("error in getting uptime for %v: %v\n", hc.component, err)
|
||||
return false, nil
|
||||
}
|
||||
glog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
|
||||
klog.Infof("%v is unhealthy, component uptime: %v\n", hc.component, uptime)
|
||||
if uptime > hc.coolDownTime {
|
||||
glog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
|
||||
klog.Infof("%v cooldown period of %v exceeded, repairing", hc.component, hc.coolDownTime)
|
||||
hc.repairFunc()
|
||||
}
|
||||
}
|
||||
return false
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// logPatternHealthCheck checks for the provided logPattern occurrences in the service logs.
|
||||
// Returns true if the pattern is empty or does not exist logThresholdCount times since start of service, false otherwise.
|
||||
func logPatternHealthCheck(service string, loopBackTime time.Duration, logPatternsToCheck map[string]int) (bool, error) {
|
||||
if len(logPatternsToCheck) == 0 {
|
||||
return true, nil
|
||||
}
|
||||
uptimeFunc := getUptimeFunc(service)
|
||||
klog.Infof("Getting uptime for service: %v\n", service)
|
||||
uptime, err := uptimeFunc()
|
||||
if err != nil {
|
||||
klog.Warningf("Failed to get the uptime: %+v", err)
|
||||
return true, err
|
||||
}
|
||||
|
||||
logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
|
||||
if loopBackTime > 0 && uptime > loopBackTime {
|
||||
logStartTime = time.Now().Add(-loopBackTime).Format(types.LogParsingTimeLayout)
|
||||
}
|
||||
for pattern, count := range logPatternsToCheck {
|
||||
healthy, err := checkForPattern(service, logStartTime, pattern, count)
|
||||
if err != nil || !healthy {
|
||||
return healthy, err
|
||||
}
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// healthCheckEndpointOKFunc returns a function to check the status of an http endpoint
|
||||
func healthCheckEndpointOKFunc(endpoint string, timeout time.Duration) func() (bool, error) {
|
||||
return func() (bool, error) {
|
||||
httpClient := http.Client{Timeout: timeout}
|
||||
response, err := httpClient.Get(endpoint)
|
||||
if err != nil || response.StatusCode != http.StatusOK {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
// getHealthCheckFunc returns the health check function based on the component.
|
||||
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
|
||||
switch hco.Component {
|
||||
case types.KubeletComponent:
|
||||
return healthCheckEndpointOKFunc(types.KubeletHealthCheckEndpoint(), hco.HealthCheckTimeout)
|
||||
case types.KubeProxyComponent:
|
||||
return healthCheckEndpointOKFunc(types.KubeProxyHealthCheckEndpoint(), hco.HealthCheckTimeout)
|
||||
case types.DockerComponent:
|
||||
return func() (bool, error) {
|
||||
if _, err := execCommand(hco.HealthCheckTimeout, getDockerPath(), "ps"); err != nil {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
case types.CRIComponent:
|
||||
return func() (bool, error) {
|
||||
_, err := execCommand(
|
||||
hco.HealthCheckTimeout,
|
||||
hco.CriCtlPath,
|
||||
"--timeout="+hco.CriTimeout.String(),
|
||||
"--runtime-endpoint="+hco.CriSocketPath,
|
||||
"pods",
|
||||
"--latest",
|
||||
)
|
||||
if err != nil {
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
default:
|
||||
klog.Warningf("Unsupported component: %v", hco.Component)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs.
|
||||
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, command, args...)
|
||||
out, err := cmd.Output()
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
glog.Infof("command %v failed: %v, %v\n", cmd, err, out)
|
||||
klog.Infof("command %v failed: %v, %s\n", cmd, err, string(out))
|
||||
return "", err
|
||||
}
|
||||
|
||||
return strings.TrimSuffix(string(out), "\n"), nil
|
||||
}
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
Copyright 2023 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package healthchecker
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"time"
|
||||
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
)
|
||||
|
||||
// getUptimeFunc returns the time for which the given service has been running.
|
||||
func getUptimeFunc(service string) func() (time.Duration, error) {
|
||||
klog.Fatalf("getUptimeFunc is not supported in %s", runtime.GOOS)
|
||||
return func() (time.Duration, error) { return time.Second, nil }
|
||||
}
|
||||
|
||||
// getRepairFunc returns the repair function based on the component.
|
||||
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
|
||||
klog.Fatalf("getRepairFunc is not supported in %s", runtime.GOOS)
|
||||
return func() {}
|
||||
}
|
||||
|
||||
// checkForPattern returns (true, nil) if logPattern occurs less than logCountThreshold number of times since last
|
||||
// service restart. (false, nil) otherwise.
|
||||
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
|
||||
klog.Fatalf("checkForPattern is not supported in %s", runtime.GOOS)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func getDockerPath() string {
|
||||
klog.Fatalf("getDockerPath is not supported in %s", runtime.GOOS)
|
||||
return ""
|
||||
}
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
Copyright 2020 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package healthchecker
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
)
|
||||
|
||||
// getUptimeFunc returns the time for which the given service has been running.
|
||||
func getUptimeFunc(service string) func() (time.Duration, error) {
|
||||
return func() (time.Duration, error) {
|
||||
// Using InactiveExitTimestamp to capture the exact time when systemd tried starting the service. The service will
|
||||
// transition from inactive -> activating and the timestamp is captured.
|
||||
// Source : https://www.freedesktop.org/wiki/Software/systemd/dbus/
|
||||
// Using ActiveEnterTimestamp resulted in race condition where the service was repeatedly killed by plugin when
|
||||
// RestartSec of systemd and invoke interval of plugin got in sync. The service was repeatedly killed in
|
||||
// activating state and hence ActiveEnterTimestamp was never updated.
|
||||
out, err := execCommand(types.CmdTimeout, "systemctl", "show", service, "--property=InactiveExitTimestamp")
|
||||
|
||||
if err != nil {
|
||||
return time.Duration(0), err
|
||||
}
|
||||
val := strings.Split(out, "=")
|
||||
if len(val) < 2 {
|
||||
return time.Duration(0), errors.New("could not parse the service uptime time correctly")
|
||||
}
|
||||
t, err := time.Parse(types.UptimeTimeLayout, val[1])
|
||||
if err != nil {
|
||||
return time.Duration(0), err
|
||||
}
|
||||
return time.Since(t), nil
|
||||
}
|
||||
}
|
||||
|
||||
// getRepairFunc returns the repair function based on the component.
|
||||
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
|
||||
// Use `systemctl kill` instead of `systemctl restart` for the repair function.
|
||||
// We start to rely on the kernel message difference for the two commands to
|
||||
// indicate if the component restart is due to an administrative plan (restart)
|
||||
// or a system issue that needs repair (kill).
|
||||
// See https://github.com/kubernetes/node-problem-detector/issues/847.
|
||||
switch hco.Component {
|
||||
case types.DockerComponent:
|
||||
// Use "docker ps" for docker health check. Not using crictl for docker to remove
|
||||
// dependency on the kubelet.
|
||||
return func() {
|
||||
execCommand(types.CmdTimeout, "pkill", "-SIGUSR1", "dockerd")
|
||||
execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.Service)
|
||||
}
|
||||
default:
|
||||
// Just kill the service for all other components
|
||||
return func() {
|
||||
execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.Service)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// checkForPattern returns (true, nil) if logPattern occurs less than logCountThreshold number of times since last
|
||||
// service restart. (false, nil) otherwise.
|
||||
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
|
||||
out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c",
|
||||
// Query service logs since the logStartTime
|
||||
`journalctl --unit "`+service+`" --since "`+logStartTime+
|
||||
// Grep the pattern
|
||||
`" | grep -i "`+logPattern+
|
||||
// Get the count of occurrences
|
||||
`" | wc -l`)
|
||||
if err != nil {
|
||||
return true, err
|
||||
}
|
||||
occurrences, err := strconv.Atoi(out)
|
||||
if err != nil {
|
||||
return true, err
|
||||
}
|
||||
if occurrences >= logCountThreshold {
|
||||
klog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func getDockerPath() string {
|
||||
return "docker"
|
||||
}
|
|
@ -20,12 +20,13 @@ import (
|
|||
"testing"
|
||||
"time"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
)
|
||||
|
||||
var repairCalled bool
|
||||
|
||||
func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker {
|
||||
func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() (bool, error), uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker {
|
||||
repairCalled = false
|
||||
return &healthChecker{
|
||||
enableRepair: enableRepair,
|
||||
|
@ -37,12 +38,12 @@ func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptime
|
|||
}
|
||||
}
|
||||
|
||||
func healthyFunc() bool {
|
||||
return true
|
||||
func healthyFunc() (bool, error) {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func unhealthyFunc() bool {
|
||||
return false
|
||||
func unhealthyFunc() (bool, error) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func repairFunc() {
|
||||
|
@ -62,7 +63,7 @@ func TestHealthCheck(t *testing.T) {
|
|||
description string
|
||||
enableRepair bool
|
||||
healthy bool
|
||||
healthCheckFunc func() bool
|
||||
healthCheckFunc func() (bool, error)
|
||||
uptimeFunc func() (time.Duration, error)
|
||||
repairFunc func()
|
||||
repairCalled bool
|
||||
|
@ -106,7 +107,10 @@ func TestHealthCheck(t *testing.T) {
|
|||
} {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
hc := NewTestHealthChecker(tc.repairFunc, tc.healthCheckFunc, tc.uptimeFunc, tc.enableRepair)
|
||||
healthy := hc.CheckHealth()
|
||||
healthy, err := hc.CheckHealth()
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error occurred got %v; expected nil", err)
|
||||
}
|
||||
if healthy != tc.healthy {
|
||||
t.Errorf("incorrect health returned got %t; expected %t", healthy, tc.healthy)
|
||||
}
|
||||
|
@ -116,3 +120,38 @@ func TestHealthCheck(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestComponentsSupported(t *testing.T) {
|
||||
for _, tc := range []struct {
|
||||
description string
|
||||
component string
|
||||
}{
|
||||
{
|
||||
description: "Kube Proxy should be supported",
|
||||
component: types.KubeProxyComponent,
|
||||
},
|
||||
{
|
||||
description: "Kubelet should be supported",
|
||||
component: types.KubeletComponent,
|
||||
},
|
||||
{
|
||||
description: "Docker should be supported",
|
||||
component: types.DockerComponent,
|
||||
},
|
||||
{
|
||||
description: "CRI should be supported",
|
||||
component: types.CRIComponent,
|
||||
},
|
||||
} {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
checkFunc := getHealthCheckFunc(&options.HealthCheckerOptions{
|
||||
Component: tc.component,
|
||||
})
|
||||
if checkFunc == nil {
|
||||
t.Errorf("component %v should be supported", tc.component)
|
||||
}
|
||||
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,112 @@
|
|||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package healthchecker
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/node-problem-detector/cmd/healthchecker/options"
|
||||
"k8s.io/node-problem-detector/pkg/healthchecker/types"
|
||||
"k8s.io/node-problem-detector/pkg/util"
|
||||
)
|
||||
|
||||
// getUptimeFunc returns the time for which the given service has been running.
|
||||
func getUptimeFunc(service string) func() (time.Duration, error) {
|
||||
return func() (time.Duration, error) {
|
||||
// To attempt to calculate uptime more efficiently, we attempt to grab the process id to grab the start time.
|
||||
// If the process id does not exist (meaning the service is not running for some reason), we will result to
|
||||
// using the WinEvent Log Objects to find the Service logs' time when the Service last entered running state.
|
||||
// In addition to filtering not by the logname=system we also filter on event id=7036 to reduce the number of
|
||||
// entries the next command Where-Object will have to look through. id 7036 messages indicating a stopped or running service.
|
||||
// The powershell command formats the TimeCreated of the event log in RFC1123Pattern.
|
||||
// However, because the time library parser does not recognize the ',' in this RFC1123Pattern format,
|
||||
// it is manually removed before parsing it using the UptimeTimeLayout.
|
||||
getTimeCreatedCmd := `$ProcessId = (Get-WMIObject -Class Win32_Service -Filter "Name='` + service + `'" | Select-Object -ExpandProperty ProcessId);` +
|
||||
`if ([string]::IsNullOrEmpty($ProcessId) -or $ProcessId -eq 0) { (Get-WinEvent -FilterHashtable @{logname='system';id=7036} ` +
|
||||
`| Where-Object {$_.Message -match '.*(` + service + `).*(running).*'} | Select-Object -Property TimeCreated -First 1 | ` +
|
||||
`foreach {$_.TimeCreated.ToUniversalTime().ToString('R')} | Out-String).Trim() } else { (Get-Process -Id $ProcessId | Select starttime | ` +
|
||||
`foreach {$_.starttime.ToUniversalTime().ToString('R')} | Out-String).Trim() }`
|
||||
out, err := powershell(getTimeCreatedCmd)
|
||||
if err != nil {
|
||||
return time.Duration(0), err
|
||||
}
|
||||
if out == "" {
|
||||
return time.Duration(0), fmt.Errorf("service time creation not found for %s", service)
|
||||
}
|
||||
out = strings.ReplaceAll(out, ",", "")
|
||||
t, err := time.Parse(types.UptimeTimeLayout, out)
|
||||
if err != nil {
|
||||
return time.Duration(0), err
|
||||
}
|
||||
return time.Since(t), nil
|
||||
}
|
||||
}
|
||||
|
||||
// getRepairFunc returns the repair function based on the component.
|
||||
func getRepairFunc(hco *options.HealthCheckerOptions) func() {
|
||||
// Restart-Service will stop and attempt to start the service
|
||||
return func() {
|
||||
powershell("Restart-Service", hco.Service)
|
||||
}
|
||||
}
|
||||
|
||||
// powershell executes the arguments in powershell process and returns (output, error) from command.
|
||||
func powershell(args ...string) (string, error) {
|
||||
cmd := util.Powershell(args...)
|
||||
return extractCommandOutput(cmd)
|
||||
}
|
||||
|
||||
// Given an executable command, run and return the standard output, or error if command failed.
|
||||
func extractCommandOutput(cmd *exec.Cmd) (string, error) {
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
klog.Infof("command %v failed: %v, %v\n", cmd, err, out)
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSuffix(string(out), "\r\n"), nil
|
||||
}
|
||||
|
||||
// checkForPattern returns (true, nil) if logPattern occurs less than logCountThreshold number of times since last
|
||||
// service restart. (false, nil) otherwise.
|
||||
func checkForPattern(service, logStartTime, logPattern string, logCountThreshold int) (bool, error) {
|
||||
countPatternLogCmd := "@(Get-WinEvent -Logname System | Where-Object {($_.TimeCreated -ge ([datetime]::ParseExact('" + logStartTime +
|
||||
"','" + types.LogParsingTimeFormat + "', $null))) -and ($_.Message -Match '" + logPattern + "')}).count"
|
||||
|
||||
out, err := powershell(countPatternLogCmd)
|
||||
if err != nil {
|
||||
return true, err
|
||||
}
|
||||
occurrences, err := strconv.Atoi(out)
|
||||
if err != nil {
|
||||
return true, err
|
||||
}
|
||||
if occurrences >= logCountThreshold {
|
||||
klog.Infof("%s failed log pattern check, %s occurrences: %v", service, logPattern, occurrences)
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func getDockerPath() string {
|
||||
return "docker.exe"
|
||||
}
|
|
@ -16,22 +16,139 @@ limitations under the License.
|
|||
|
||||
package types
|
||||
|
||||
import "time"
|
||||
|
||||
const (
|
||||
DefaultCoolDownTime = 2 * time.Minute
|
||||
DefaultHealthCheckTimeout = 10 * time.Second
|
||||
CmdTimeout = 10 * time.Second
|
||||
DefaultCriCtl = "/usr/bin/crictl"
|
||||
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
|
||||
KubeletComponent = "kubelet"
|
||||
CRIComponent = "cri"
|
||||
DockerComponent = "docker"
|
||||
ContainerdService = "containerd"
|
||||
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
|
||||
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC"
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type HealthChecker interface {
|
||||
CheckHealth() bool
|
||||
const (
|
||||
DefaultLoopBackTime = 0 * time.Minute
|
||||
DefaultCriTimeout = 2 * time.Second
|
||||
DefaultCoolDownTime = 2 * time.Minute
|
||||
DefaultHealthCheckTimeout = 10 * time.Second
|
||||
CmdTimeout = 10 * time.Second
|
||||
LogParsingTimeLayout = "2006-01-02 15:04:05"
|
||||
|
||||
KubeletComponent = "kubelet"
|
||||
CRIComponent = "cri"
|
||||
DockerComponent = "docker"
|
||||
ContainerdService = "containerd"
|
||||
KubeProxyComponent = "kube-proxy"
|
||||
|
||||
LogPatternFlagSeparator = ":"
|
||||
hostAddressKey = "HOST_ADDRESS"
|
||||
kubeletPortKey = "KUBELET_PORT"
|
||||
kubeProxyPortKey = "KUBEPROXY_PORT"
|
||||
|
||||
defaultHostAddress = "localhost"
|
||||
defaultKubeletPort = "10248"
|
||||
defaultKubeproxyPort = "10256"
|
||||
)
|
||||
|
||||
var (
|
||||
kubeletHealthCheckEndpoint string
|
||||
kubeProxyHealthCheckEndpoint string
|
||||
)
|
||||
|
||||
func init() {
|
||||
setKubeEndpoints()
|
||||
}
|
||||
|
||||
func setKubeEndpoints() {
|
||||
var o string
|
||||
|
||||
hostAddress := defaultHostAddress
|
||||
kubeletPort := defaultKubeletPort
|
||||
kubeProxyPort := defaultKubeproxyPort
|
||||
|
||||
o = os.Getenv(hostAddressKey)
|
||||
if o != "" {
|
||||
hostAddress = o
|
||||
}
|
||||
o = os.Getenv(kubeletPortKey)
|
||||
if o != "" {
|
||||
kubeletPort = o
|
||||
}
|
||||
o = os.Getenv(kubeProxyPortKey)
|
||||
if o != "" {
|
||||
kubeProxyPort = o
|
||||
}
|
||||
|
||||
kubeletHealthCheckEndpoint = fmt.Sprintf("http://%s/healthz", net.JoinHostPort(hostAddress, kubeletPort))
|
||||
kubeProxyHealthCheckEndpoint = fmt.Sprintf("http://%s/healthz", net.JoinHostPort(hostAddress, kubeProxyPort))
|
||||
|
||||
}
|
||||
|
||||
func KubeProxyHealthCheckEndpoint() string {
|
||||
return kubeProxyHealthCheckEndpoint
|
||||
}
|
||||
func KubeletHealthCheckEndpoint() string {
|
||||
return kubeletHealthCheckEndpoint
|
||||
}
|
||||
|
||||
type HealthChecker interface {
|
||||
CheckHealth() (bool, error)
|
||||
}
|
||||
|
||||
// LogPatternFlag defines the flag for log pattern health check.
|
||||
// It contains a map of <log pattern> to <failure threshold for the pattern>
|
||||
type LogPatternFlag struct {
|
||||
logPatternCountMap map[string]int
|
||||
}
|
||||
|
||||
// String implements the String function for flag.Value interface
|
||||
// Returns a space separated sorted by keys string of map values.
|
||||
func (lpf *LogPatternFlag) String() string {
|
||||
result := ""
|
||||
var keys []string
|
||||
for k := range lpf.logPatternCountMap {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, k := range keys {
|
||||
if result != "" {
|
||||
result += " "
|
||||
}
|
||||
result += fmt.Sprintf("%v:%v", k, lpf.logPatternCountMap[k])
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// Set implements the Set function for flag.Value interface
|
||||
func (lpf *LogPatternFlag) Set(value string) error {
|
||||
if lpf.logPatternCountMap == nil {
|
||||
lpf.logPatternCountMap = make(map[string]int)
|
||||
}
|
||||
items := strings.Split(value, ",")
|
||||
for _, item := range items {
|
||||
val := strings.SplitN(item, LogPatternFlagSeparator, 2)
|
||||
if len(val) != 2 {
|
||||
return fmt.Errorf("invalid format of the flag value: %v", val)
|
||||
}
|
||||
countThreshold, err := strconv.Atoi(val[0])
|
||||
if err != nil || countThreshold == 0 {
|
||||
return fmt.Errorf("invalid format for the flag value: %v: %v", val, err)
|
||||
}
|
||||
pattern := val[1]
|
||||
if pattern == "" {
|
||||
return fmt.Errorf("invalid format for the flag value: %v: %v", val, err)
|
||||
}
|
||||
lpf.logPatternCountMap[pattern] = countThreshold
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Type implements the Type function for flag.Value interface
|
||||
func (lpf *LogPatternFlag) Type() string {
|
||||
return "logPatternFlag"
|
||||
}
|
||||
|
||||
// GetLogPatternCountMap returns the stored log count map
|
||||
func (lpf *LogPatternFlag) GetLogPatternCountMap() map[string]int {
|
||||
return lpf.logPatternCountMap
|
||||
}
|
||||
|
|
|
@ -0,0 +1,198 @@
|
|||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package types
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestLogPatternFlag(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
value string
|
||||
expectedStringVal string
|
||||
expectedLogPatternCountMap map[string]int
|
||||
expectSetError bool
|
||||
}{
|
||||
{
|
||||
name: "valid single flag value",
|
||||
value: "10:pattern1",
|
||||
expectedStringVal: "pattern1:10",
|
||||
expectedLogPatternCountMap: map[string]int{"pattern1": 10},
|
||||
expectSetError: false,
|
||||
},
|
||||
{
|
||||
name: "valid multiple flag values",
|
||||
value: "10:pattern1,20:pattern2",
|
||||
expectedStringVal: "pattern1:10 pattern2:20",
|
||||
expectedLogPatternCountMap: map[string]int{"pattern1": 10, "pattern2": 20},
|
||||
expectSetError: false,
|
||||
},
|
||||
{
|
||||
name: "empty log pattern",
|
||||
value: "10:",
|
||||
expectSetError: true,
|
||||
},
|
||||
{
|
||||
name: "0 failure threshold count",
|
||||
value: "0:pattern1",
|
||||
expectSetError: true,
|
||||
},
|
||||
{
|
||||
name: "empty failure threshold count",
|
||||
value: ":pattern1",
|
||||
expectSetError: true,
|
||||
},
|
||||
{
|
||||
name: "empty failure threshold count and pattern",
|
||||
value: ":",
|
||||
expectSetError: true,
|
||||
},
|
||||
{
|
||||
name: "non integer value in failure threshold",
|
||||
value: "notAnInteger:pattern1",
|
||||
expectSetError: true,
|
||||
},
|
||||
{
|
||||
name: "valid log pattern with ':'",
|
||||
value: "10:pattern1a:pattern1b,20:pattern2",
|
||||
expectedStringVal: "pattern1a:pattern1b:10 pattern2:20",
|
||||
expectedLogPatternCountMap: map[string]int{"pattern1a:pattern1b": 10, "pattern2": 20},
|
||||
expectSetError: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range testCases {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
flag := LogPatternFlag{}
|
||||
err := flag.Set(test.value)
|
||||
if test.expectSetError {
|
||||
assert.Error(t, err)
|
||||
} else {
|
||||
assert.NoError(t, err)
|
||||
actualStringVal := flag.String()
|
||||
actualLogPatternCountMap := flag.GetLogPatternCountMap()
|
||||
assert.Equal(t, test.expectedStringVal, actualStringVal)
|
||||
if !reflect.DeepEqual(test.expectedLogPatternCountMap, actualLogPatternCountMap) {
|
||||
t.Fatalf("logPatternCountMap mismatch, expected: %v, actual: %v", test.expectedLogPatternCountMap, actualLogPatternCountMap)
|
||||
}
|
||||
assert.Equal(t, test.expectedLogPatternCountMap, actualLogPatternCountMap)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestKubeEndpointConfiguration(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
envConfig map[string]string
|
||||
expectedKubeletEndpoint string
|
||||
expectedKubeProxyEndpoint string
|
||||
}{
|
||||
{
|
||||
name: "no overrides supplied",
|
||||
envConfig: map[string]string{},
|
||||
expectedKubeletEndpoint: "http://localhost:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://localhost:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS override supplied",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "samplehost.testdomain.com",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://samplehost.testdomain.com:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS override supplied with IPv4",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "10.0.5.4",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://10.0.5.4:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://10.0.5.4:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS override supplied with IPv6",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "80:f4:16::1",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://[80:f4:16::1]:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://[80:f4:16::1]:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "KUBELET_PORT override supplied",
|
||||
envConfig: map[string]string{
|
||||
"KUBELET_PORT": "12345",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://localhost:12345/healthz",
|
||||
expectedKubeProxyEndpoint: "http://localhost:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "KUBEPROXY_PORT override supplied",
|
||||
envConfig: map[string]string{
|
||||
"KUBEPROXY_PORT": "12345",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://localhost:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://localhost:12345/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS and KUBELET_PORT override supplied",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "samplehost.testdomain.com",
|
||||
"KUBELET_PORT": "12345",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://samplehost.testdomain.com:12345/healthz",
|
||||
expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:10256/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS and KUBEPROXY_PORT override supplied",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "samplehost.testdomain.com",
|
||||
"KUBEPROXY_PORT": "12345",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://samplehost.testdomain.com:10248/healthz",
|
||||
expectedKubeProxyEndpoint: "http://samplehost.testdomain.com:12345/healthz",
|
||||
},
|
||||
{
|
||||
name: "HOST_ADDRESS, KUBELET_PORT and KUBEPROXY_PORT override supplied",
|
||||
envConfig: map[string]string{
|
||||
"HOST_ADDRESS": "10.0.10.1",
|
||||
"KUBELET_PORT": "12345",
|
||||
"KUBEPROXY_PORT": "12346",
|
||||
},
|
||||
expectedKubeletEndpoint: "http://10.0.10.1:12345/healthz",
|
||||
expectedKubeProxyEndpoint: "http://10.0.10.1:12346/healthz",
|
||||
},
|
||||
}
|
||||
for _, test := range testCases {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
for key, val := range test.envConfig {
|
||||
t.Setenv(key, val)
|
||||
}
|
||||
setKubeEndpoints()
|
||||
|
||||
kubeProxyHCEndpoint := KubeProxyHealthCheckEndpoint()
|
||||
kubeletHCEndpoint := KubeletHealthCheckEndpoint()
|
||||
|
||||
assert.Equal(t, test.expectedKubeProxyEndpoint, kubeProxyHCEndpoint)
|
||||
assert.Equal(t, test.expectedKubeletEndpoint, kubeletHCEndpoint)
|
||||
})
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
//go:build unix
|
||||
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package types
|
||||
|
||||
// Platform defaults for Unix hosts (see //go:build unix above).
const (
	// DefaultCriCtl is the default path to the crictl binary.
	DefaultCriCtl = "/usr/bin/crictl"
	// DefaultCriSocketPath is the default containerd CRI socket endpoint.
	DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
	// UptimeTimeLayout is a Go reference-time layout (time.Parse) —
	// presumably used to parse system boot/uptime timestamps; confirm
	// against the caller that produces these strings.
	UptimeTimeLayout = "Mon 2006-01-02 15:04:05 MST"
)
|
|
@ -0,0 +1,24 @@
|
|||
/*
|
||||
Copyright 2021 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package types
|
||||
|
||||
// Platform defaults for Windows hosts.
const (
	// DefaultCriCtl is the default path to the crictl binary on Windows nodes.
	DefaultCriCtl = "C:/etc/kubernetes/node/bin/crictl.exe"
	// DefaultCriSocketPath is the default containerd named-pipe CRI endpoint.
	DefaultCriSocketPath = "npipe:////./pipe/containerd-containerd"
	// UptimeTimeLayout is a Go reference-time layout (time.Parse); note the
	// day/month order differs from the Unix counterpart of this constant.
	UptimeTimeLayout = "Mon 02 Jan 2006 15:04:05 MST"
	// LogParsingTimeFormat is NOT a Go layout — it is a .NET/Java-style
	// pattern ("yyyy-MM-dd..."), presumably passed to a Windows log tool
	// or provider; verify against its consumer before changing.
	LogParsingTimeFormat = "yyyy-MM-dd HH:mm:ss"
)
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue