Compare commits

snapshot-i...main: 1487 commits
@@ -21,3 +21,6 @@ insert_final_newline = true

[Makefile]
indent_style = tab

[OWNERS]
indent_size = 2

@@ -1,10 +1,14 @@
aliases:
sig-docs-blog-owners: # Approvers for blog content
- lmktfy
- graz-dev
- mrbobbytables
- natalisucks
- nate-double-u
sig-docs-blog-reviewers: # Reviewers for blog content
- Gauravpadam
- graz-dev
- lmktfy
- mrbobbytables
- natalisucks
- nate-double-u

@@ -12,6 +16,7 @@ aliases:
- dipesh-rawat
- divya-mohan0209
- katcosgrove
- lmktfy
- natalisucks
- nate-double-u
- reylejano

@@ -54,9 +59,9 @@ aliases:
- dipesh-rawat
- divya-mohan0209
- katcosgrove
- lmktfy
- natalisucks
- nate-double-u
- rayandas # RT 1.33 Docs Lead
- reylejano
- salaxander
- tengqm

@@ -64,7 +69,7 @@ aliases:
- dipesh-rawat
- divya-mohan0209
- katcosgrove
- kbhawkey
- lmktfy
- mengjiao-liu
- natalisucks
- nate-double-u

@@ -73,8 +78,6 @@ aliases:
- shannonxtreme
- tengqm
- windsonsea
- Princesso
- drewhagen
sig-docs-es-owners: # Admins for Spanish content
- electrocucaracha
- krol3

@@ -101,15 +104,18 @@ aliases:
- bishal7679
- dipesh-rawat
- divya-mohan0209
- jayeshmahajan
- niranjandarshann
sig-docs-id-owners: # Admins for Indonesian content
- ariscahyadi
- girikuncoro
- habibrosyad
- za
sig-docs-id-reviews: # PR reviews for Indonesian content
- ariscahyadi
- girikuncoro
- habibrosyad
- za
sig-docs-it-owners: # Admins for Italian content
- fabriziopandini
- Fale

@@ -209,9 +215,11 @@ aliases:
- mfilocha
- nvtkaszpir
sig-docs-uk-owners: # Admins for Ukrainian content
- Andygol
- Arhell
- MaxymVlasov
sig-docs-uk-reviews: # PR reviews for Ukrainian content
- Andygol
- Arhell
- idvoretskyi
- MaxymVlasov

@@ -4,13 +4,12 @@
# reviewers to review and approve.
# Teams and members are visible at https://github.com/orgs/kubernetes/teams.

reviewers:
- sig-docs-en-reviews

approvers:
- sig-docs-en-owners

filters:
".*":
reviewers:
- sig-docs-en-reviews
approvers:
- sig-docs-en-owners
"\\.svg":
labels:
- area/web-development

@@ -0,0 +1,25 @@
let splitInstance = null;

function enableSplitter(mediaQuery) {
  if (mediaQuery.matches) {
    if (!splitInstance) {
      splitInstance = Split(["#sidebarnav", "#maindoc"], {
        sizes: [20, 80],
        minSize: 100,
      });
    }
  } else {
    if (splitInstance) {
      splitInstance.destroy();
      splitInstance = null;
    }
  }
}

const screenWidthMediaQuery = window.matchMedia("(min-width: 768px)");

const eleNav = document.getElementById("sidebarnav");
if (eleNav !== null) {
  enableSplitter(screenWidthMediaQuery);
  screenWidthMediaQuery.addListener(enableSplitter);
}

@@ -11,9 +11,6 @@ $quickstart-button-padding: 0 50px;
$vendor-strip-height: 88px;
$vendor-strip-font-size: 16px;

// video
$video-section-height: 200px;

@import "size";
@import "documentation";

@@ -256,9 +253,6 @@ $ocean-nodes-padding-Y: 60px;
$ocean-nodes-main-margin-bottom: 60px;
$ocean-nodes-h3-margin-bottom: 30px;

// video
$video-section-height: 200px;

// Home-specific

.header-hero {

@@ -317,13 +311,10 @@ $video-section-height: 200px;
}

// Video thingy
#video {
height: $video-section-height;
}

#video {
width: 100%;
position: relative;
overflow: hidden;
background-position: center center;
background-size: cover;

@@ -426,6 +417,10 @@ $video-section-height: 200px;
}
}

#video:has(#desktopKCButton) {
height: 580px;
}

#videoPlayer {
@include fullScreen;
background-color: rgba(0, 0, 0, 0.9);

@@ -50,35 +50,6 @@ body {
}
}

/* Gutter for sidebar splitter */
.gutter {
background-color: #eee;
background-repeat: no-repeat;
background-position: 50%;
}

.gutter.gutter-horizontal {
background-image: url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAeCAYAAADkftS9AAAAIklEQVQoU2M4c+bMfxAGAgYYmwGrIIiDjrELjpo5aiZeMwF+yNnOs5KSvgAAAABJRU5ErkJggg==');
cursor: col-resize;
}

#sidebarnav,
#maindoc {
max-width: 100%;
}

#maindoc {
overflow-wrap: break-word;
}

@media (max-width: 768px) {
#sidebarnav {
padding-left: 15px;
padding-right: 15px;
}

}

/* Complex table layout support */

.td-content, body.td-content {

@@ -274,7 +245,7 @@ footer {
}

// Custom footer sizing
@media (min-width: 800px) and (max-width: 1279px) {
@media (min-width: 800px) and (max-width: 1285px) {
footer {
ul.footer-icons {
min-width: 17.5vw;

@@ -282,6 +253,11 @@ footer {
flex-wrap: nowrap;
flex-direction: row;
justify-content: space-evenly;

li.mx-2 {
margin-left: 0.3rem !important;
margin-right: 0.3rem !important;
}
}
.col-sm-2 {
flex: 0 0 22.5%;

@@ -1376,42 +1352,6 @@ body.cid-code-of-conduct main {
}
}

// search & sidebar
.td-sidebar {
@media only screen and (min-width: 768px) {
padding-top: 1.5rem !important;

.td-sidebar__inner {
top: 8.5rem;

@media only screen and (min-width: 1075px) {
top: 6.5rem;
}
}
}
}

.td-sidebar-nav {
& > .td-sidebar-nav__section {
padding-top: .5rem;
padding-left: 1.5rem;
}
}

.td-sidebar__inner {
form.td-sidebar__search {

.td-sidebar__toggle {
&:hover {
color: #000000;
}

color: $primary;
margin: 1rem;
}
}
}

.no-underline {
text-decoration: none !important;
}

@@ -1420,16 +1360,6 @@ body.cid-code-of-conduct main {
display: none !important;
}

.td-sidebar-link__page {
&#m-docs-search {
display: none;
}

&#m-docs-test {
display: none;
}
}

//Tutorials
main.content {
position: inherit;

@@ -1457,6 +1387,13 @@ main.content {
margin-bottom: 20px;
}

/* CAREERS */

// Set 14px font size for GitJobs attribution text
.gitjobs-legend {
font-size: 14px;
}

/* CASE-STUDIES */

// Many of the case studies have small variations in markup and styles;

@@ -1923,6 +1860,58 @@ body.td-search {
color: #ffffff !important;
}

body.td-home section.case-studies {
h2, h3 {
text-align: center;
}
.case-study-list {
display: flex;
flex-direction: row;
max-width: 80vw;
margin-left: auto;
margin-right: auto;
align-items: stretch;
gap: clamp(1rem, 4em, 10vw);
> .case-study-item {
display: flex;
flex-direction: column;
justify-content: space-between;
text-align: center;
width: clamp(6rem, 20%, 50vw);
picture, picture img {
height: 4.8rem;
text-align: center;
}
> a {
display: block;
text-align: right;
}
}
padding-bottom: 2em;
}
padding-top: 4rem;
}

@media screen and (max-width: 768px) {
.case-study-list {
justify-content: center;
flex-wrap: wrap;
> .case-study-item {
min-width: 34vw;
}
}
}

@media screen and (max-width: 650px) {
.case-study-list {
> .case-study-item {
min-width: 51vw;
}
}
}

// handle main page features on narrow viewports
@media screen and (max-width: 768px) {
.features-container div.feature-box {

@@ -1953,4 +1942,4 @@ section.k8s-birthday-override:has(div.k8s-birthday-override.revert-to-previous i
@extend .table;
}
}
}
}

@@ -1,6 +1,5 @@
$main-max-width: 1200px;
$vendor-strip-height: 44px;
$video-section-height: 580px;

@media screen and (min-width: 1024px) {

@@ -50,13 +49,12 @@ $video-section-height: 580px;
}

#video {
height: $video-section-height;
position: relative;
background-position: center center;
background-position: top center;
background-size: cover;

&>.light-text {
margin-right: 10%;
margin: 0 10% 15px 0;
}
}

@@ -0,0 +1,75 @@
.td-sidebar-nav {
  .td-sidebar-link.tree-root {
    display: none;
  }

  #navbarDropdownMenuLink {
    display: none;
  }
}

/* Gutter for sidebar splitter */
.gutter {
  background-color: #eee;
  background-repeat: no-repeat;
  background-position: 50%;

  &.gutter-horizontal {
    background-image: url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAeCAYAAADkftS9AAAAIklEQVQoU2M4c+bMfxAGAgYYmwGrIIiDjrELjpo5aiZeMwF+yNnOs5KSvgAAAABJRU5ErkJggg==');
    cursor: col-resize;
  }
}

#sidebarnav,
#maindoc {
  max-width: 100%;
}

#maindoc {
  overflow-wrap: break-word;
}

@include media-breakpoint-down(sm) {
  #sidebarnav {
    padding-left: 15px;
    padding-right: 15px;
  }
}

// search & sidebar
.td-sidebar {
  @include media-breakpoint-up(sm){
    padding-top: 1.5rem !important;

    .td-sidebar__inner {
      top: 8.5rem;

      @media only screen and (min-width: 1075px) {
        top: 6.5rem;
      }
    }
  }
}

.td-sidebar-nav {
  & > .td-sidebar-nav__section {
    padding-top: .5rem;
    padding-left: 1.5rem;
  }
}

.td-sidebar__inner form.td-sidebar__search {
  .td-sidebar__toggle {
    &:hover {
      color: #000000;
    }

    color: $primary;
  }
}

.td-sidebar-link__page {
  &#m-docs-test {
    display: none;
  }
}

@@ -9,6 +9,7 @@ Add styles or import other files. */
// Base styles
@import "k8s_community";
@import "k8s_nav";
@import "k8s_sidebar-tree";

//Media queries
@import "base";

@@ -7,9 +7,6 @@ $headline-wrapper-margin-bottom: 40px;
$quickstart-button-padding: 0 50px;
$vendor-strip-font-size: 16px;

//video
$video-section-height: 400px;

////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -113,15 +110,13 @@ $video-section-height: 400px;
}

#video {
height: $video-section-height;
display: block;
height: 550px;

& > .light-text {
display: block;
float: right;
text-align: left;
margin-right: 5%;
margin: 0 5% 15px 0;
}
}

@@ -43,17 +43,11 @@ Google সপ্তাহে বিলিয়ন কন্টেইনার
<h2>150+ মাইক্রোসার্ভিস কুবারনেটিসে স্থানান্তরিত করার চ্যালেঞ্জ</h2>
<p>সারাহ ওয়েলস দ্বারা, অপারেশনস এবং নির্ভরযোগ্যতার জন্য প্রযুক্তিগত পরিচালক, ফিনান্সিয়াল টাইমস</p>
<button id="desktopShowVideoButton" onclick="kub.showVideo()">ভিডিও দেখুন</button>
<br>
<br>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america/" button id="desktopKCButton">12-15 নভেম্বর KubeCon + CloudNativeCon North America তে যোগ দিন</a>
<br>
<br>
<br>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-india/" button id="desktopKCButton">11-12 ডিসেম্বর KubeCon + CloudNativeCon India তে যোগ দিন</a>
<br>
<br>
<br>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe/" button id="desktopKCButton">1-4 এপ্রিল, 2025-এ KubeCon + CloudNativeCon Europe তে যোগ দিন</a>

<h3>আসন্ন KubeCon + CloudNativeCon ইভেন্টগুলিতে যোগ দিন</h3>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-india/" class="desktopKCButton"><strong>India</strong> (Hyderabad, Aug 6-7)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america/" class="desktopKCButton"><strong>North America</strong> (Atlanta, Nov 10-13)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe-2026/" class="desktopKCButton"><strong>Europe</strong> (Amsterdam, Mar 23-26, 2026)</a>
</div>
<div id="videoPlayer">
<iframe data-url="https://www.youtube.com/embed/H06qrNmGqyE?autoplay=1" frameborder="0" allowfullscreen></iframe>

@@ -64,3 +58,5 @@ Google সপ্তাহে বিলিয়ন কন্টেইনার
{{< blocks/kubernetes-features >}}

{{< blocks/case-studies >}}

{{< kubeweekly id="kubeweekly" >}}

@@ -27,7 +27,7 @@ case_study_details:
<p>"Every single product, every decision we make at Ancestry, focuses on delighting our customers with intimate, sometimes life-changing discoveries about themselves and their families," says MacKay. "As the company continues to grow, the increased productivity gains from using Kubernetes has helped Ancestry make customer discoveries faster. With the move to Dockerization for example, instead of taking between 20 to 50 minutes to deploy a new piece of code, we can now deploy in under a minute for much of our code. We've truly experienced significant time savings in addition to the various features and benefits from cloud native and Kubernetes-type technologies."</p>

{{< case-studies/quote author="PAUL MACKAY, SOFTWARE ENGINEER AND ARCHITECT AT ANCESTRY" >}}
"At a certain point, you have to step back if you're going to push a new technology and get key thought leaders with engineers within the organization to become your champions for new technology adoption. At training sessions, the development teams were always the ones that were saying, 'Kubernetes saved our time tremendously; it's an enabler. It really is incredible.'"
"At a certain point, you have to step back if you're going to push a new technology and get key thought leaders with engineers within the organization to become your champions for new technology adoption. At training sessions, the development teams were always the ones that were saying, 'Kubernetes saved our time tremendously; it's an enabler. It really is incredible.'"
{{< /case-studies/quote >}}

{{< case-studies/lead >}}

@@ -48,7 +48,7 @@ It started with a Shaky Leaf.

<p>That need led them in 2015 to explore containerization. Ancestry engineers had already been using technology like <a href="https://www.java.com/en/">Java</a> and <a href="https://www.python.org">Python</a> on Linux, so part of the decision was about making the infrastructure more Linux-friendly. They quickly decided that they wanted to go with Docker for containerization, "but it always comes down to the orchestration part of it to make it really work," says MacKay.</p>

<p>His team looked at orchestration platforms offered by <a href="https://docs.docker.com/compose/">Docker Compose</a>, <a href="http://mesos.apache.org">Mesos</a> and <a href="https://www.openstack.org/software/">OpenStack</a>, and even started to prototype some homegrown solutions. And then they started hearing rumblings of the imminent release of Kubernetes v1.0. "At the forefront, we were looking at the secret store, so we didn't have to manage that all ourselves, the config maps, the methodology of seamless deployment strategy," he says. "We found that how Kubernetes had done their resources, their types, their labels and just their interface was so much further advanced than the other things we had seen. It was a feature fit."</p>
<p>His team looked at orchestration platforms offered by <a href="https://docs.docker.com/compose/">Docker Compose</a>, <a href="https://mesos.apache.org">Mesos</a> and <a href="https://www.openstack.org/software/">OpenStack</a>, and even started to prototype some homegrown solutions. And then they started hearing rumblings of the imminent release of Kubernetes v1.0. "At the forefront, we were looking at the secret store, so we didn't have to manage that all ourselves, the config maps, the methodology of seamless deployment strategy," he says. "We found that how Kubernetes had done their resources, their types, their labels and just their interface was so much further advanced than the other things we had seen. It was a feature fit."</p>

{{< case-studies/lead >}}
Plus, MacKay says, "I just believed in the confidence that comes with the history that Google has with containerization. So we started out right on the leading edge of it. And we haven't looked back since."

@@ -42,9 +42,9 @@ With its end-to-end commerce platform for cloud-based products and services, <a

<p>When Director of Software Development Pierre-Alexandre Lacerte started working there in 2014, the company had a monolith application deployed on a "tomcat infrastructure, and the whole release process was complex for what it should be," he says. "There were a lot of manual steps involved, with one engineer building a feature then creating a pull request, and a QA or another engineer validating the feature. Then it gets merged and someone else will take care of the deployment. So we had bottlenecks in the pipeline to ship a feature to production."</p>

<p>At the same time, the engineering team of 40 was growing, and the company wanted to add an increasing number of features to its products. As a member of the platform team, Lacerte began hearing from multiple teams that wanted to deploy applications using different frameworks and languages, from <a href="https://nodejs.org/">Node.js</a> to <a href="http://spring.io/projects/spring-boot">Spring Boot Java</a>. He soon realized that in order to both support growth and increase velocity, the company needed a better infrastructure, and a system in which teams are autonomous, can do their own deploys, and be responsible for their services in production.</p>
<p>At the same time, the engineering team of 40 was growing, and the company wanted to add an increasing number of features to its products. As a member of the platform team, Lacerte began hearing from multiple teams that wanted to deploy applications using different frameworks and languages, from <a href="https://nodejs.org/">Node.js</a> to <a href="https://spring.io/projects/spring-boot">Spring Boot Java</a>. He soon realized that in order to both support growth and increase velocity, the company needed a better infrastructure, and a system in which teams are autonomous, can do their own deploys, and be responsible for their services in production.</p>

{{< case-studies/quote
{{< case-studies/quote
image="/images/case-studies/appdirect/banner3.jpg"
author="Alexandre Gervais, Staff Software Developer, AppDirect"
>}}

@@ -61,7 +61,7 @@ With its end-to-end commerce platform for cloud-based products and services, <a

<p>Lacerte's strategy ultimately worked because of the very real impact the Kubernetes platform has had to deployment time. Due to less dependency on custom-made, brittle shell scripts with SCP commands, time to deploy a new version has shrunk from 4 hours to a few minutes. Additionally, the company invested a lot of effort to make things self-service for developers. "Onboarding a new service doesn't require <a href="https://www.atlassian.com/software/jira">Jira</a> tickets or meeting with three different teams," says Lacerte. Today, the company sees 1,600 deployments per week, compared to 1-30 before.</p>

{{< case-studies/quote
{{< case-studies/quote
image="/images/case-studies/appdirect/banner4.jpg"
author="Pierre-Alexandre Lacerte, Director of Software Development, AppDirect"
>}}

@@ -20,7 +20,7 @@ case_study_details:

<h2>Solution</h2>

<p>Opting not to shift to cloud virtualization or use a private cloud on their own servers, the BlaBlaCar team became early adopters of containerization, using the CoreOs runtime <a href="https://coreos.com/rkt">rkt</a>, initially deployed using <a href="https://coreos.com/fleet/docs/latest/launching-containers-fleet.html">fleet</a> cluster manager. Last year, the company switched to <a href="http://kubernetes.io/">Kubernetes</a> orchestration, and now also uses <a href="https://prometheus.io/">Prometheus</a> for monitoring.</p>
<p>Opting not to shift to cloud virtualization or use a private cloud on their own servers, the BlaBlaCar team became early adopters of containerization, using the CoreOs runtime <a href="https://coreos.com/rkt">rkt</a>, initially deployed using <a href="https://coreos.com/fleet/docs/latest/launching-containers-fleet.html">fleet</a> cluster manager. Last year, the company switched to <a href="https://kubernetes.io/">Kubernetes</a> orchestration, and now also uses <a href="https://prometheus.io/">Prometheus</a> for monitoring.</p>

<h2>Impact</h2>

@@ -20,7 +20,7 @@ case_study_details:

<h2>Solution</h2>

<p>Turning to microservices and containerization, GolfNow began moving its applications and databases from third-party services to its own clusters running on <a href="https://www.docker.com/">Docker</a> and <a href="http://kubernetes.io/">Kubernetes.</a></p>
<p>Turning to microservices and containerization, GolfNow began moving its applications and databases from third-party services to its own clusters running on <a href="https://www.docker.com/">Docker</a> and <a href="https://kubernetes.io/">Kubernetes.</a></p>

<h2>Impact</h2>

@@ -50,7 +50,7 @@ It's not every day that you can say you've slashed an operating expense by half.

<p>GolfNow's dev team ran an "internal, low-key" proof of concept and were won over. "We really liked how easy it was to be able to pass containers around to each other and have them up and running in no time, exactly the way it was running on my machine," says Sheriff. "Because that is always the biggest gripe that Ops has with developers, right? 'It worked on my machine!' But then we started getting to the point of, 'How do we make sure that these things stay up and running?'"</p>

<p>That led the team on a quest to find the right orchestration system for the company's needs. Sheriff says the first few options they tried were either too heavy or "didn't feel quite right." In late summer 2015, they discovered the just-released <a href="http://kubernetes.io/">Kubernetes</a>, which Sheriff immediately liked for its ease of use. "We did another proof of concept," he says, "and Kubernetes won because of the fact that the community backing was there, built on top of what Google had already done."</p>
<p>That led the team on a quest to find the right orchestration system for the company's needs. Sheriff says the first few options they tried were either too heavy or "didn't feel quite right." In late summer 2015, they discovered the just-released <a href="https://kubernetes.io/">Kubernetes</a>, which Sheriff immediately liked for its ease of use. "We did another proof of concept," he says, "and Kubernetes won because of the fact that the community backing was there, built on top of what Google had already done."</p>

<p>But before they could go with Kubernetes, <a href="http://www.nbc.com/">NBC</a>, GolfNow's parent company, also asked them to comparison shop with another company. Sheriff and his team liked the competing company's platform user interface, but didn't like that its platform would not allow containers to run natively on Docker. With no clear decision in sight, Sheriff's VP at GolfNow, Steve McElwee, set up a three-month trial during which a GolfNow team (consisting of Sheriff and Josh, who's now Lead Architect, Open Platforms) would build out a Kubernetes environment, and a large NBC team would build out one with the other company's platform.</p>

@@ -355,7 +355,7 @@ kubelet স্বয়ংক্রিয়ভাবে প্রতিটি
আপনার কাছে [সাইডকার কন্টেইনার](/bn/docs/concepts/workloads/pods/sidecar-containers/) থাকতে পারে
যেগুলি প্রধান অ্যাপ্লিকেশন পডকে সহায়ক পরিষেবা প্রদান করে (উদাহরণস্বরূপ: একটি পরিষেবা মেশ)।

{{< feature-state for_k8s_version="v1.29" state="beta" >}}
{{< feature-state feature_gate_name="SidecarContainers" >}}

ডিফল্টরূপে সক্রিয় করা হয়েছে, `SidecarContainers` [ফিচার গেট](/bn/docs/reference/command-line-tools-reference/feature-gates/)
init কন্টেইনারগুলির জন্য আপনাকে `restartPolicy: Always` নির্দিষ্ট করতে দেয়।
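
The updated passage above describes Kubernetes native sidecar containers: an init container declared with `restartPolicy: Always`. A minimal sketch of that pattern (the Pod, container, and image names here are placeholders, not taken from the change):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: app-with-sidecar              # placeholder name
spec:
  initContainers:
    - name: log-shipper               # sidecar: starts first, keeps running
      image: example.com/log-shipper:1.0   # placeholder image
      restartPolicy: Always           # the field the passage refers to
  containers:
    - name: app
      image: example.com/app:1.0      # placeholder image
```

The sidecar starts before the regular containers and is kept running (and restarted if it exits) for as long as the Pod runs.
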
@@ -15,7 +15,7 @@ Bash-এর জন্য kubectl কমপ্লিশন স্ক্রিপ
কিন্তু, kubectl কমপ্লিসন স্ক্রিপ্ট নির্ভর করে [**bash-completion**](https://github.com/scop/bash-completion) যা আপনাকে আগে ইনস্টল করতে হবে।

{{< warning>}}
bash-completion এর দুটি সংস্করণ আছে, v1 এবং v2। V1 Bash 3.2 এর জন্য (যা macOS-এ ডিফল্ট), এবং v2 হল Bash 4.1+ এর জন্য। kubectl পূর্ণতা স্ক্রিপ্ট ** কাজ করে না** সঠিকভাবে bash-completion v1 এবং Bash 3.2 এর সাথে। এর জন্য **ব্যাশ-সম্পূর্ণ v2** এবং **ব্যাশ 4.1+** প্রয়োজন। সুতরাং, macOS-এ kubectl সমাপ্তি সঠিকভাবে ব্যবহার করতে সক্ষম হতে, আপনাকে Bash 4.1+ ([*instructions*](https://itnext.io/upgrading-bash-on-macos-7138bd1066ba)) ইনস্টল এবং ব্যবহার করতে হবে। নিম্নলিখিত নির্দেশাবলী অনুমান করে যে আপনি Bash 4.1+ ব্যবহার করেন (অর্থাৎ, 4.1 বা তার পরবর্তী যেকোনো Bash সংস্করণ)।
bash-completion এর দুটি সংস্করণ আছে, v1 এবং v2। V1 Bash 3.2 এর জন্য (যা macOS-এ ডিফল্ট), এবং v2 হল Bash 4.1+ এর জন্য। kubectl পূর্ণতা স্ক্রিপ্ট ** কাজ করে না** সঠিকভাবে bash-completion v1 এবং Bash 3.2 এর সাথে। এর জন্য **ব্যাশ-সম্পূর্ণ v2** এবং **ব্যাশ 4.1+** প্রয়োজন। সুতরাং, macOS-এ kubectl সমাপ্তি সঠিকভাবে ব্যবহার করতে সক্ষম হতে, আপনাকে Bash 4.1+ ([*instructions*](https://apple.stackexchange.com/a/292760)) ইনস্টল এবং ব্যবহার করতে হবে। নিম্নলিখিত নির্দেশাবলী অনুমান করে যে আপনি Bash 4.1+ ব্যবহার করেন (অর্থাৎ, 4.1 বা তার পরবর্তী যেকোনো Bash সংস্করণ)।
{{< /warning >}}

### Bash আপগ্রেড করুন

@@ -51,7 +51,7 @@ metadata:
namespace: kube-system
data:
my-scheduler-config.yaml: |
apiVersion: kubescheduler.config.k8s.io/v1beta2
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
profiles:
- schedulerName: my-scheduler

@@ -25,7 +25,7 @@ spec:
app: mysql
spec:
containers:
- image: mysql:5.6
- image: mysql:9
name: mysql
env:
# Use secret in real usage

@@ -6,23 +6,6 @@ type: kubernetes.io/tls
data:
# values are base64 encoded, which obscures them but does NOT provide
# any useful level of confidentiality
tls.crt: |
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUNVakNDQWJzQ0FnMytNQTBHQ1NxR1NJYjNE
UUVCQlFVQU1JR2JNUXN3Q1FZRFZRUUdFd0pLVURFT01Bd0cKQTFVRUNCTUZWRzlyZVc4eEVEQU9C
Z05WQkFjVEIwTm9kVzh0YTNVeEVUQVBCZ05WQkFvVENFWnlZVzVyTkVSRQpNUmd3RmdZRFZRUUxF
dzlYWldKRFpYSjBJRk4xY0hCdmNuUXhHREFXQmdOVkJBTVREMFp5WVc1ck5FUkVJRmRsCllpQkRR
VEVqTUNFR0NTcUdTSWIzRFFFSkFSWVVjM1Z3Y0c5eWRFQm1jbUZ1YXpSa1pDNWpiMjB3SGhjTk1U
TXcKTVRFeE1EUTFNVE01V2hjTk1UZ3dNVEV3TURRMU1UTTVXakJMTVFzd0NRWURWUVFHREFKS1VE
RVBNQTBHQTFVRQpDQXdHWEZSdmEzbHZNUkV3RHdZRFZRUUtEQWhHY21GdWF6UkVSREVZTUJZR0Ex
VUVBd3dQZDNkM0xtVjRZVzF3CmJHVXVZMjl0TUlHYU1BMEdDU3FHU0liM0RRRUJBUVVBQTRHSUFE
Q0JoQUo5WThFaUhmeHhNL25PbjJTbkkxWHgKRHdPdEJEVDFKRjBReTliMVlKanV2YjdjaTEwZjVN
Vm1UQllqMUZTVWZNOU1vejJDVVFZdW4yRFljV29IcFA4ZQpqSG1BUFVrNVd5cDJRN1ArMjh1bklI
QkphVGZlQ09PekZSUFY2MEdTWWUzNmFScG04L3dVVm16eGFLOGtCOWVaCmhPN3F1TjdtSWQxL2pW
cTNKODhDQXdFQUFUQU5CZ2txaGtpRzl3MEJBUVVGQUFPQmdRQU1meTQzeE15OHh3QTUKVjF2T2NS
OEtyNWNaSXdtbFhCUU8xeFEzazlxSGtyNFlUY1JxTVQ5WjVKTm1rWHYxK2VSaGcwTi9WMW5NUTRZ
RgpnWXcxbnlESnBnOTduZUV4VzQyeXVlMFlHSDYyV1hYUUhyOVNVREgrRlowVnQvRGZsdklVTWRj
UUFEZjM4aU9zCjlQbG1kb3YrcE0vNCs5a1h5aDhSUEkzZXZ6OS9NQT09Ci0tLS0tRU5EIENFUlRJ
RklDQVRFLS0tLS0K
# In this example, the key data is not a real PEM-encoded private key
tls.key: |
RXhhbXBsZSBkYXRhIGZvciB0aGUgVExTIGNydCBmaWVsZA==
# Note: Replace the following values with your own base64-encoded certificate and key.
tls.crt: "REPLACE_WITH_BASE64_CERT"
tls.key: "REPLACE_WITH_BASE64_KEY"

@@ -85,7 +85,7 @@ type: docs

GitHub অ্যাক্সেস নিয়ন্ত্রণ: [@kubernetes/release-managers](https://github.com/orgs/kubernetes/teams/release-managers)

GitHub উল্লেখ: [@kubernetes/release-engineering](https://github.com/orgs/kubernetes/teams/release-engineering)
GitHub উল্লেখ: @kubernetes/release-engineering

- Adolfo García Veytia ([@puerco](https://github.com/puerco))
- Cici Huang ([@cici37](https://github.com/cici37))

@@ -41,11 +41,11 @@ Kubernetes ist Open Source und bietet Dir die Freiheit, die Infrastruktur vor Or
<button id="desktopShowVideoButton" onclick="kub.showVideo()">Video ansehen</button>

<h3>Nehmen Sie an der kommenden KubeCon + CloudNativeCon teil</h3>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe/" class="desktopKCButton"><strong>Europe</strong> (London, Apr 1-4)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-china/" class="desktopKCButton"><strong>China</strong> (Hongkong, Jun 10-11)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-japan/" class="desktopKCButton"><strong>Japan</strong> (Tokio, Jun 16-17)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-india/" class="desktopKCButton"><strong>India</strong> (Hyderabad, Aug 6-7)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america-2025/" class="desktopKCButton"><strong>North America</strong> (Atlanta, Nov 10-13)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america/" class="desktopKCButton"><strong>North America</strong> (Atlanta, Nov 10-13)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe-2026/" class="desktopKCButton"><strong>Europe</strong> (Amsterdam, Mrz 23-26, 2026)</a>
</div>
<div id="videoPlayer">
<iframe data-url="https://www.youtube.com/embed/H06qrNmGqyE?autoplay=1" frameborder="0" allowfullscreen></iframe>

@ -345,7 +345,7 @@ Beide Ansätze sind gleichwertig. Nach dem erneuten Laden der Shell sollte kubec
|
|||
{{% tab name="Bash auf macOS" %}}
|
||||
|
||||
{{< warning>}}
|
||||
macOS beinhaltet standardmäßig Bash 3.2. Das kubectl-Vervollständigunsskript erfordert Bash 4.1+ und funktioniert nicht mit Bash 3.2. Um dies zu umgehen, können Sie eine neuere Version von Bash unter macOS installieren (folgen Sie den Anweisungen [hier](https://itnext.io/upgrading-bash-on-macos-7138bd1066ba)). Die folgenden Anweisungen funktionieren nur, wenn Sie Bash 4.1 oder höher verwenden.
|
||||
macOS beinhaltet standardmäßig Bash 3.2. Das kubectl-Vervollständigunsskript erfordert Bash 4.1+ und funktioniert nicht mit Bash 3.2. Um dies zu umgehen, können Sie eine neuere Version von Bash unter macOS installieren (folgen Sie den Anweisungen [hier](https://apple.stackexchange.com/a/292760)). Die folgenden Anweisungen funktionieren nur, wenn Sie Bash 4.1 oder höher verwenden.
|
||||
{{< /warning >}}
|
||||
|
||||
### Einführung
|
||||
|
|
|
@ -45,11 +45,9 @@ To download Kubernetes, visit the [download](/releases/download/) section.
|
|||
<button id="desktopShowVideoButton" onclick="kub.showVideo()">Watch Video</button>
|
||||
|
||||
<h3>Attend upcoming KubeCon + CloudNativeCon events</h3>
|
||||
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe/" class="desktopKCButton"><strong>Europe</strong> (London, Apr 1-4)</a>
|
||||
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-china/" class="desktopKCButton"><strong>China</strong> (Hong Kong, Jun 10-11)</a>
|
||||
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-japan/" class="desktopKCButton"><strong>Japan</strong> (Tokyo, Jun 16-17)</a>
|
||||
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-india/" class="desktopKCButton"><strong>India</strong> (Hyderabad, Aug 6-7)</a>
|
||||
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america-2025/" class="desktopKCButton"><strong>North America</strong> (Atlanta, Nov 10-13)</a>
|
||||
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america/" class="desktopKCButton"><strong>North America</strong> (Atlanta, Nov 10-13)</a>
|
||||
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe-2026/" class="desktopKCButton"><strong>Europe</strong> (Amsterdam, Mar 23-26, 2026)</a>
|
||||
</div>
|
||||
<div id="videoPlayer">
|
||||
<iframe data-url="https://www.youtube.com/embed/H06qrNmGqyE?autoplay=1" frameborder="0" allowfullscreen></iframe>
|
||||
|
|
|
@ -17,7 +17,7 @@ The GA milestone indicates that Kubernetes users may depend on the feature and i
|
|||
|
||||
Although prior to CSI Kubernetes provided a powerful volume plugin system, it was challenging to add support for new volume plugins to Kubernetes: volume plugins were “in-tree” meaning their code was part of the core Kubernetes code and shipped with the core Kubernetes binaries—vendors wanting to add support for their storage system to Kubernetes (or even fix a bug in an existing volume plugin) were forced to align with the Kubernetes release process. In addition, third-party storage code caused reliability and security issues in core Kubernetes binaries and the code was often difficult (and in some cases impossible) for Kubernetes maintainers to test and maintain.
|
||||
|
||||
CSI was developed as a standard for exposing arbitrary block and file storage storage systems to containerized workloads on Container Orchestration Systems (COs) like Kubernetes. With the adoption of the Container Storage Interface, the Kubernetes volume layer becomes truly extensible. Using CSI, third-party storage providers can write and deploy plugins exposing new storage systems in Kubernetes without ever having to touch the core Kubernetes code. This gives Kubernetes users more options for storage and makes the system more secure and reliable.
|
||||
CSI was developed as a standard for exposing arbitrary block and file storage systems to containerized workloads on Container Orchestration Systems (COs) like Kubernetes. With the adoption of the Container Storage Interface, the Kubernetes volume layer becomes truly extensible. Using CSI, third-party storage providers can write and deploy plugins exposing new storage systems in Kubernetes without ever having to touch the core Kubernetes code. This gives Kubernetes users more options for storage and makes the system more secure and reliable.
|
||||
|
||||
## What’s new?
|
||||
|
||||
|
|
|
@ -90,3 +90,12 @@ In the test, we created 400 Secrets, each containing 1 MB of data, and used info
|
|||
The results were alarming: only 16 informers were needed to cause the test server to run out of memory and crash, demonstrating how quickly memory consumption can grow under such conditions.
|
||||
|
||||
Special shout out to [@deads2k](https://github.com/deads2k) for his help in shaping this feature.
|
||||
|
||||
## Kubernetes 1.33 update
|
||||
|
||||
Since this feature was started, [Marek Siarkowicz](https://github.com/serathius) integrated a new technology into the
|
||||
Kubernetes API server: _streaming collection encoding_.
|
||||
Kubernetes v1.33 introduced two related feature gates, `StreamingCollectionEncodingToJSON` and `StreamingCollectionEncodingToProtobuf`.
|
||||
These features encode via a stream and avoid allocating all the memory at once.
|
||||
This functionality is bit-for-bit compatible with existing **list** encodings, produces even greater server-side memory savings, and doesn't require any changes to client code.
|
||||
In 1.33, the `WatchList` feature gate is disabled by default.
|
||||
|
|
|
@ -7,7 +7,7 @@ author: >
|
|||
Tabitha Sable (Kubernetes Security Response Committee)
|
||||
---
|
||||
|
||||
Today, the ingress-nginx maintainers have [released patches for a batch of critical vulnerabilities](https://github.com/kubernetes/ingress-nginx/releases) that could make it easy for attackers to take over your Kubernetes cluster. If you are among the over 40% of Kubernetes administrators using [ingress-nginx](https://github.com/kubernetes/ingress-nginx/), you should take action immediately to protect your users and data.
|
||||
Today, the ingress-nginx maintainers have released patches for a batch of critical vulnerabilities that could make it easy for attackers to take over your Kubernetes cluster: [ingress-nginx v1.12.1](https://github.com/kubernetes/ingress-nginx/releases/tag/controller-v1.12.1) and [ingress-nginx v1.11.5](https://github.com/kubernetes/ingress-nginx/releases/tag/controller-v1.11.5). If you are among the over 40% of Kubernetes administrators using [ingress-nginx](https://github.com/kubernetes/ingress-nginx/), you should take action immediately to protect your users and data.
|
||||
|
||||
## Background
|
||||
|
||||
|
@ -23,7 +23,7 @@ Four of today’s ingress-nginx vulnerabilities are improvements to how ingress-
|
|||
|
||||
The most serious of today’s vulnerabilities, [CVE-2025-1974](https://github.com/kubernetes/kubernetes/issues/131009), rated [9.8 CVSS](https://www.first.org/cvss/calculator/3-1#CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H), allows anything on the Pod network to exploit configuration injection vulnerabilities via the Validating Admission Controller feature of ingress-nginx. This makes such vulnerabilities far more dangerous: ordinarily one would need to be able to create an Ingress object in the cluster, which is a fairly privileged action. When combined with today’s other vulnerabilities, **CVE-2025-1974 means that anything on the Pod network has a good chance of taking over your Kubernetes cluster, with no credentials or administrative access required**. In many common scenarios, the Pod network is accessible to all workloads in your cloud VPC, or even anyone connected to your corporate network\! This is a very serious situation.
|
||||
|
||||
Today, we have [released ingress-nginx v1.12.1 and v1.11.5](https://github.com/kubernetes/ingress-nginx/releases), which have fixes for all five of these vulnerabilities.
|
||||
Today, we have released [ingress-nginx v1.12.1](https://github.com/kubernetes/ingress-nginx/releases/tag/controller-v1.12.1) and [ingress-nginx v1.11.5](https://github.com/kubernetes/ingress-nginx/releases/tag/controller-v1.11.5), which have fixes for all five of these vulnerabilities.
|
||||
|
||||
## Your next steps
|
||||
|
||||
|
@ -52,3 +52,5 @@ Thanks go out to Nir Ohfeld, Sagi Tzadik, Ronen Shustin, and Hillai Ben-Sasson f
|
|||
For further information about the maintenance and future of ingress-nginx, please see this [GitHub issue](https://github.com/kubernetes/ingress-nginx/issues/13002) and/or attend [James and Marco’s KubeCon/CloudNativeCon EU 2025 presentation](https://kccnceu2025.sched.com/event/1tcyc/).
|
||||
|
||||
For further information about the specific vulnerabilities discussed in this article, please see the appropriate GitHub issue: [CVE-2025-24513](https://github.com/kubernetes/kubernetes/issues/131005), [CVE-2025-24514](https://github.com/kubernetes/kubernetes/issues/131006), [CVE-2025-1097](https://github.com/kubernetes/kubernetes/issues/131007), [CVE-2025-1098](https://github.com/kubernetes/kubernetes/issues/131008), or [CVE-2025-1974](https://github.com/kubernetes/kubernetes/issues/131009)
|
||||
|
||||
*This blog post was revised in May 2025 to update the hyperlinks.*
|
|
@ -25,7 +25,7 @@ release; make sure to read about those if you already run an older version of Ku
|
|||
{{< figure src="k8s-1.33.svg" alt="Kubernetes v1.33 logo: Octarine" class="release-logo" >}}
|
||||
|
||||
The theme for Kubernetes v1.33 is **Octarine: The Color of Magic**<sup>1</sup>, inspired by Terry
|
||||
Pratchett’s _Discworld_ series. This release highlights the open-source magic<sup>2</sup> that
|
||||
Pratchett’s _Discworld_ series. This release highlights the open source magic<sup>2</sup> that
|
||||
Kubernetes enables across the ecosystem.
|
||||
|
||||
If you’re familiar with the world of Discworld, you might recognize a small swamp dragon perched
|
||||
|
@ -38,7 +38,7 @@ release is a reminder that, as Pratchett wrote, _“It’s still magic even if y
|
|||
Even if you know the ins and outs of the Kubernetes code base, stepping back at the end of the
|
||||
release cycle, you’ll realize that Kubernetes remains magical.
|
||||
|
||||
Kubernetes v1.33 is a testament to the enduring power of open-source innovation, where hundreds of
|
||||
Kubernetes v1.33 is a testament to the enduring power of open source innovation, where hundreds of
|
||||
contributors<sup>4</sup> from around the world work together to create something truly
|
||||
extraordinary. Behind every new feature, the Kubernetes community works to maintain and improve the
|
||||
project, ensuring it remains secure, reliable, and released on time. Each release builds upon the
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Continuing the transition from Endpoints to EndpointSlices"
|
||||
title: "Kubernetes v1.33: Continuing the transition from Endpoints to EndpointSlices"
|
||||
slug: endpoints-deprecation
|
||||
date: 2025-XX-XX
|
||||
draft: true
|
||||
date: 2025-04-24T10:30:00-08:00
|
||||
author: >
|
||||
Dan Winship (Red Hat)
|
||||
---
|
||||
|
@ -22,7 +21,7 @@ As of Kubernetes 1.33, the Endpoints API is now officially deprecated,
|
|||
and the API server will return warnings to users who read or write
|
||||
Endpoints resources rather than using EndpointSlices.
|
||||
|
||||
Eventually, the plan (as documented in [KEP-4794]) is to change the
|
||||
Eventually, the plan (as documented in [KEP-4974]) is to change the
|
||||
[Kubernetes Conformance] criteria to no longer require that clusters
|
||||
run the _Endpoints controller_ (which generates Endpoints objects
|
||||
based on Services and Pods), to avoid doing work that is unneeded in
|
|
@ -1,8 +1,7 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: User Namespaces enabled by default!"
|
||||
date: 2025-04-23
|
||||
draft: true
|
||||
date: 2025-04-25T10:30:00-08:00
|
||||
slug: userns-enabled-by-default
|
||||
author: >
|
||||
Rodrigo Campos Catelin (Microsoft),
|
|
@ -1,14 +1,9 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: HorizontalPodAutoscaler Configurable Tolerance"
|
||||
slug: kubernetes-1-33-hpa-configurable-tolerance
|
||||
# after the v1.33 release, set a future publication date and remove the draft marker
|
||||
# the release comms team can confirm which date has been assigned
|
||||
#
|
||||
# PRs to remove the draft marker should be opened BEFORE release day
|
||||
draft: true
|
||||
slug: kubernetes-v1-33-hpa-configurable-tolerance
|
||||
math: true # for formulae
|
||||
date: XXXX-XX-XX
|
||||
date: 2025-04-28T10:30:00-08:00
|
||||
author: "Jean-Marc François (Google)"
|
||||
---
|
||||
|
||||
|
@ -23,7 +18,7 @@ automatically resize by adding or removing replicas based on resource
|
|||
utilization.
|
||||
|
||||
Let's say you have a web application running in a Kubernetes cluster with 50
|
||||
replicas. You configure the Horizontal Pod Autoscaler (HPA) to scale based on
|
||||
replicas. You configure the HorizontalPodAutoscaler (HPA) to scale based on
|
||||
CPU utilization, with a target of 75% utilization. Now, imagine that the current
|
||||
CPU utilization across all replicas is 90%, which is higher than the desired
|
||||
75%. The HPA will calculate the required number of replicas using the formula:
|
|
@ -0,0 +1,106 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: Image Volumes graduate to beta!"
|
||||
date: 2025-04-29T10:30:00-08:00
|
||||
slug: kubernetes-v1-33-image-volume-beta
|
||||
author: Sascha Grunert (Red Hat)
|
||||
---
|
||||
|
||||
[Image Volumes](/blog/2024/08/16/kubernetes-1-31-image-volume-source) were
|
||||
introduced as an Alpha feature with the Kubernetes v1.31 release as part of
|
||||
[KEP-4639](https://github.com/kubernetes/enhancements/issues/4639). In Kubernetes v1.33, this feature graduates to **beta**.
|
||||
|
||||
Please note that the feature is still _disabled_ by default, because not all
|
||||
[container runtimes](/docs/setup/production-environment/container-runtimes) have
|
||||
full support for it. [CRI-O](https://cri-o.io) has supported the initial feature since version v1.31 and
|
||||
will add support for Image Volumes as beta in v1.33.
|
||||
[containerd merged](https://github.com/containerd/containerd/pull/10579) support
|
||||
for the alpha feature, which will be part of the v2.1.0 release, and is working on
|
||||
beta support as part of [PR #11578](https://github.com/containerd/containerd/pull/11578).
|
||||
|
||||
### What's new
|
||||
|
||||
The major change for the beta graduation of Image Volumes is the support for
|
||||
[`subPath`](/docs/concepts/storage/volumes/#using-subpath) and
|
||||
[`subPathExpr`](/docs/concepts/storage/volumes/#using-subpath-expanded-environment) mounts
|
||||
for containers via `spec.containers[*].volumeMounts.[subPath,subPathExpr]`. This
|
||||
allows end-users to mount a certain subdirectory of an image volume, which is
|
||||
still mounted read-only (`noexec`). This means that non-existing
|
||||
subdirectories cannot be mounted by default. As for other `subPath` and
|
||||
`subPathExpr` values, Kubernetes will ensure that there are no absolute path or
|
||||
relative path components as part of the specified sub path. Container runtimes are
|
||||
also required to double check those requirements for safety reasons. If a
|
||||
specified subdirectory does not exist within a volume, then runtimes should fail
|
||||
on container creation and provide user feedback by using existing kubelet
|
||||
events.
|
||||
|
||||
Besides that, there are also three new kubelet metrics available for image volumes:
|
||||
|
||||
- `kubelet_image_volume_requested_total`: Outlines the number of requested image volumes.
|
||||
- `kubelet_image_volume_mounted_succeed_total`: Counts the number of successful image volume mounts.
|
||||
- `kubelet_image_volume_mounted_errors_total`: Accounts the number of failed image volume mounts.
|
||||
|
||||
To use an existing subdirectory for a specific image volume, just use it as
|
||||
[`subPath`](/docs/concepts/storage/volumes/#using-subpath) (or
|
||||
[`subPathExpr`](/docs/concepts/storage/volumes/#using-subpath-expanded-environment))
|
||||
value of the container's `volumeMounts`:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: image-volume
|
||||
spec:
|
||||
containers:
|
||||
- name: shell
|
||||
command: ["sleep", "infinity"]
|
||||
image: debian
|
||||
volumeMounts:
|
||||
- name: volume
|
||||
mountPath: /volume
|
||||
subPath: dir
|
||||
volumes:
|
||||
- name: volume
|
||||
image:
|
||||
reference: quay.io/crio/artifact:v2
|
||||
pullPolicy: IfNotPresent
|
||||
```
|
||||
|
||||
Then, create the pod on your cluster:
|
||||
|
||||
```shell
|
||||
kubectl apply -f image-volumes-subpath.yaml
|
||||
```
|
||||
|
||||
Now you can attach to the container:
|
||||
|
||||
```shell
|
||||
kubectl attach -it image-volume bash
|
||||
```
|
||||
|
||||
And check the content of the file from the `dir` sub path in the volume:
|
||||
|
||||
```shell
|
||||
cat /volume/file
|
||||
```
|
||||
|
||||
The output will be similar to:
|
||||
|
||||
```none
|
||||
1
|
||||
```
|
||||
|
||||
Thank you for reading through to the end of this blog post! SIG Node is proud and
|
||||
happy to deliver this feature graduation as part of Kubernetes v1.33.
|
||||
|
||||
As the writer of this blog post, I would like to extend my special thanks to
|
||||
**all** involved individuals out there!
|
||||
|
||||
If you would like to provide feedback or suggestions feel free to reach out
|
||||
to SIG Node using the [Kubernetes Slack (#sig-node)](https://kubernetes.slack.com/messages/sig-node)
|
||||
channel or the [SIG Node mailing list](https://groups.google.com/g/kubernetes-sig-node).
|
||||
|
||||
## Further reading
|
||||
|
||||
- [Use an Image Volume With a Pod](/docs/tasks/configure-pod-container/image-volumes)
|
||||
- [`image` volume overview](/docs/concepts/storage/volumes/#image)
|
|
@ -0,0 +1,68 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: Storage Capacity Scoring of Nodes for Dynamic Provisioning (alpha)"
|
||||
date: 2025-04-30T10:30:00-08:00
|
||||
slug: kubernetes-v1-33-storage-capacity-scoring-feature
|
||||
author: >
|
||||
Yuma Ogami (Cybozu)
|
||||
---
|
||||
|
||||
Kubernetes v1.33 introduces a new alpha feature called `StorageCapacityScoring`. This feature adds a scoring method for pod scheduling
|
||||
with [the topology-aware volume provisioning](/blog/2018/10/11/topology-aware-volume-provisioning-in-kubernetes/).
|
||||
This feature makes it easier to schedule pods on nodes with either the most or the least available storage capacity.
|
||||
|
||||
## About this feature
|
||||
|
||||
This feature extends the kube-scheduler's VolumeBinding plugin to perform scoring using node storage capacity information
|
||||
obtained from [Storage Capacity](/docs/concepts/storage/storage-capacity/). Currently, you can only filter out nodes with insufficient storage capacity.
|
||||
So, without this feature, you would have to use a scheduler extender to achieve storage-capacity-based pod scheduling.
|
||||
|
||||
This feature is useful for provisioning node-local PVs, which have size limits based on the node's storage capacity. By using this feature,
|
||||
you can assign the PVs to the nodes with the most available storage space so that you can expand the PVs later as much as possible.
|
||||
|
||||
In another use case, you might want to reduce the number of nodes as much as possible for low operation costs in cloud environments by choosing
|
||||
the node with the least available storage capacity. This feature helps maximize resource utilization by filling up nodes more sequentially, starting with the most
|
||||
utilized nodes that still have enough storage capacity for the requested volume size.
|
||||
|
||||
## How to use
|
||||
|
||||
### Enabling the feature
|
||||
|
||||
In the alpha phase, `StorageCapacityScoring` is disabled by default. To use this feature, add `StorageCapacityScoring=true`
|
||||
to the kube-scheduler command line option `--feature-gates`.
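If your control plane is managed with kubeadm, one possible way to pass that flag is through the kubeadm `ClusterConfiguration` (a sketch assuming the v1beta3 kubeadm config API; adapt it to however you actually deploy kube-scheduler):

```yaml
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
scheduler:
  extraArgs:
    # becomes --feature-gates=StorageCapacityScoring=true on kube-scheduler
    feature-gates: "StorageCapacityScoring=true"
```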
|
||||
|
||||
### Configuration changes
|
||||
|
||||
You can configure node priorities based on storage utilization using the `shape` parameter in the VolumeBinding plugin configuration.
|
||||
This allows you to prioritize nodes with higher available storage capacity (default) or, conversely, nodes with lower available storage capacity.
|
||||
For example, to prioritize lower available storage capacity, configure `KubeSchedulerConfiguration` as follows:
|
||||
|
||||
```yaml
|
||||
apiVersion: kubescheduler.config.k8s.io/v1
|
||||
kind: KubeSchedulerConfiguration
|
||||
profiles:
|
||||
...
|
||||
pluginConfig:
|
||||
- name: VolumeBinding
|
||||
args:
|
||||
...
|
||||
shape:
|
||||
- utilization: 0
|
||||
score: 0
|
||||
- utilization: 100
|
||||
score: 10
|
||||
```
|
||||
|
||||
For more details, please refer to the [documentation](/docs/reference/config-api/kube-scheduler-config.v1/#kubescheduler-config-k8s-io-v1-VolumeBindingArgs).
|
||||
|
||||
## Further reading
|
||||
|
||||
- [KEP-4049: Storage Capacity Scoring of Nodes for Dynamic Provisioning](https://github.com/kubernetes/enhancements/blob/master/keps/sig-storage/4049-storage-capacity-scoring-of-nodes-for-dynamic-provisioning/README.md)
|
||||
|
||||
## Additional note: Relationship with VolumeCapacityPriority
|
||||
|
||||
The alpha feature gate `VolumeCapacityPriority`, which performs node scoring based on available storage capacity during static provisioning,
|
||||
will be deprecated and replaced by `StorageCapacityScoring`.
|
||||
|
||||
Please note that while `VolumeCapacityPriority` prioritizes nodes with lower available storage capacity by default,
|
||||
`StorageCapacityScoring` prioritizes nodes with higher available storage capacity by default.
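For comparison, the default behavior (prefer nodes with higher available storage capacity) corresponds to the inverse shape. The sketch below shows the equivalent explicit configuration, which you normally do not need to set:

```yaml
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
profiles:
- pluginConfig:
  - name: VolumeBinding
    args:
      shape:
      - utilization: 0
        score: 10
      - utilization: 100
        score: 0
```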
|
|
@ -0,0 +1,57 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: New features in DRA"
|
||||
slug: kubernetes-v1-33-dra-updates
|
||||
date: 2025-05-01T10:30:00-08:00
|
||||
author: >
|
||||
[Morten Torkildsen](https://github.com/mortent) (Google)
|
||||
[Patrick Ohly](https://github.com/pohly) (Intel)
|
||||
---
|
||||
|
||||
Kubernetes [Dynamic Resource Allocation](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/) (DRA) was originally introduced as an alpha feature in the v1.26 release, and then went through a significant redesign for Kubernetes v1.31. The main DRA feature went to beta in v1.32, and the project hopes it will be generally available in Kubernetes v1.34.
|
||||
|
||||
The basic feature set of DRA provides a far more powerful and flexible API for requesting devices than Device Plugin. And while DRA remains a beta feature for v1.33, the DRA team has been hard at work implementing a number of new features and UX improvements. One feature has been promoted to beta, while a number of new features have been added in alpha. The team has also made progress towards getting DRA ready for GA.
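As a rough illustration of that API surface, a minimal ResourceClaim using the v1beta1 DRA API might look like the sketch below; the DeviceClass name is a hypothetical one that a DRA driver would install:

```yaml
apiVersion: resource.k8s.io/v1beta1
kind: ResourceClaim
metadata:
  name: single-gpu
spec:
  devices:
    requests:
    - name: gpu
      # hypothetical DeviceClass provided by a DRA driver
      deviceClassName: gpu.example.com
```

A Pod would then reference the claim by name via `spec.resourceClaims` and the per-container `resources.claims` field.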
|
||||
|
||||
### Features promoted to beta
|
||||
|
||||
[Driver-owned Resource Claim Status](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#resourceclaim-device-status) was promoted to beta. This allows the driver to report driver-specific device status data for each allocated device in a resource claim, which is particularly useful for supporting network devices.
|
||||
|
||||
### New alpha features
|
||||
|
||||
[Partitionable Devices](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#partitionable-devices) lets a driver advertise several overlapping logical devices (“partitions”), and the driver can reconfigure the physical device dynamically based on the actual devices allocated. This makes it possible to partition devices on-demand to meet the needs of the workloads and therefore increase the utilization.
|
||||
|
||||
[Device Taints and Tolerations](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#device-taints-and-tolerations) allow devices to be tainted and for workloads to tolerate those taints. This makes it possible for drivers or cluster administrators to mark devices as unavailable. Depending on the effect of the taint, this can prevent devices from being allocated or cause eviction of pods that are using the device.
|
||||
|
||||
[Prioritized List](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#prioritized-list) lets users specify a list of acceptable devices for their workloads, rather than just a single type of device. So while the workload might run best on a single high-performance GPU, it might also be able to run on 2 mid-level GPUs. The scheduler will attempt to satisfy the alternatives in the list in order, so the workload will be allocated the best set of devices available in the cluster.
|
||||
|
||||
[Admin Access](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#admin-access) has been updated so that only users with access to a namespace with the `resource.k8s.io/admin-access: "true"` label are authorized to create ResourceClaim or ResourceClaimTemplates objects with the `adminAccess` field within the namespace. This grants administrators access to in-use devices and may enable additional permissions when making the device available in a container. This ensures that non-admin users cannot misuse the feature.
|
||||
|
||||
### Preparing for general availability
|
||||
|
||||
A new v1beta2 API has been added to simplify the user experience and to prepare for additional features being added in the future. The RBAC rules for DRA have been improved and support has been added for seamless upgrades of DRA drivers.
|
||||
|
||||
### What’s next?
|
||||
|
||||
The plan for v1.34 is even more ambitious than for v1.33. Most importantly, we (the Kubernetes device management working group) hope to bring DRA to general availability, which will make it available by default on all v1.34 Kubernetes clusters. This also means that many, perhaps all, of the DRA features that are still beta in v1.34 will become enabled by default, making it much easier to use them.
|
||||
|
||||
The alpha features that were added in v1.33 will be brought to beta in v1.34.
|
||||
|
||||
### Getting involved
|
||||
|
||||
A good starting point is joining the WG Device Management [Slack channel](https://kubernetes.slack.com/archives/C0409NGC1TK) and [meetings](https://docs.google.com/document/d/1qxI87VqGtgN7EAJlqVfxx86HGKEAc2A3SKru8nJHNkQ/edit?tab=t.0#heading=h.tgg8gganowxq), which happen at US/EU and EU/APAC friendly time slots.
|
||||
|
||||
Not all enhancement ideas are tracked as issues yet, so come talk to us if you want to help or have some ideas yourself! We have work to do at all levels, from difficult core changes to usability enhancements in kubectl, which could be picked up by newcomers.
|
||||
|
||||
### Acknowledgments
|
||||
|
||||
A huge thanks to everyone who has contributed:
|
||||
|
||||
* Cici Huang ([cici37](https://github.com/cici37))
|
||||
* Ed Bartosh ([bart0sh](https://github.com/bart0sh))
|
||||
* John Belamaric ([johnbelamaric](https://github.com/johnbelamaric))
|
||||
* Jon Huhn ([nojnhuh](https://github.com/nojnhuh))
|
||||
* Kevin Klues ([klueska](https://github.com/klueska))
|
||||
* Morten Torkildsen ([mortent](https://github.com/mortent))
|
||||
* Patrick Ohly ([pohly](https://github.com/pohly))
|
||||
* Rita Zhang ([ritazh](https://github.com/ritazh))
|
||||
* Shingo Omura ([everpeace](https://github.com/everpeace))
|
|
@ -0,0 +1,74 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: Mutable CSI Node Allocatable Count"
|
||||
date: 2025-05-02T10:30:00-08:00
|
||||
slug: kubernetes-1-33-mutable-csi-node-allocatable-count
|
||||
author: Eddie Torres (Amazon Web Services)
|
||||
---
|
||||
|
||||
Scheduling stateful applications reliably depends heavily on accurate information about resource availability on nodes.
|
||||
Kubernetes v1.33 introduces an alpha feature called *mutable CSI node allocatable count*, allowing Container Storage Interface (CSI) drivers to dynamically update the reported maximum number of volumes that a node can handle.
|
||||
This capability significantly enhances the accuracy of pod scheduling decisions and reduces scheduling failures caused by outdated volume capacity information.
|
||||
|
||||
## Background
|
||||
|
||||
Traditionally, Kubernetes CSI drivers report a static maximum volume attachment limit when initializing. However, actual attachment capacities can change during a node's lifecycle for various reasons, such as:
|
||||
|
||||
- Manual or external operations attaching/detaching volumes outside of Kubernetes control.
|
||||
- Dynamically attached network interfaces or specialized hardware (GPUs, NICs, etc.) consuming available slots.
|
||||
- Multi-driver scenarios, where one CSI driver’s operations affect available capacity reported by another.
|
||||
|
||||
Static reporting can cause Kubernetes to schedule pods onto nodes that appear to have capacity but don't, leading to pods stuck in a `ContainerCreating` state.
|
||||
|
||||
## Dynamically adapting CSI volume limits
|
||||
|
||||
With the new feature gate `MutableCSINodeAllocatableCount`, Kubernetes enables CSI drivers to dynamically adjust and report node attachment capacities at runtime. This ensures that the scheduler has the most accurate, up-to-date view of node capacity.
|
||||
|
||||
### How it works
|
||||
|
||||
When this feature is enabled, Kubernetes supports two mechanisms for updating the reported node volume limits:
|
||||
|
||||
- **Periodic Updates:** CSI drivers specify an interval to periodically refresh the node's allocatable capacity.
|
||||
- **Reactive Updates:** An immediate update triggered when a volume attachment fails due to exhausted resources (`ResourceExhausted` error).
|
||||
|
||||
### Enabling the feature
|
||||
|
||||
To use this alpha feature, you must enable the `MutableCSINodeAllocatableCount` feature gate in these components:
|
||||
|
||||
- `kube-apiserver`
|
||||
- `kubelet`
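For the kubelet, one way to set the gate is through the kubelet configuration file, as in the minimal sketch below (the kube-apiserver still needs `--feature-gates=MutableCSINodeAllocatableCount=true` passed separately):

```yaml
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
featureGates:
  MutableCSINodeAllocatableCount: true
```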
|
||||
|
||||
### Example CSI driver configuration
|
||||
|
||||
Below is an example of configuring a CSI driver to enable periodic updates every 60 seconds:
|
||||
|
||||
```yaml
|
||||
apiVersion: storage.k8s.io/v1
|
||||
kind: CSIDriver
|
||||
metadata:
|
||||
name: example.csi.k8s.io
|
||||
spec:
|
||||
nodeAllocatableUpdatePeriodSeconds: 60
|
||||
```
|
||||
|
||||
This configuration directs the kubelet to periodically call the CSI driver's `NodeGetInfo` method every 60 seconds, updating the node's allocatable volume count. Kubernetes enforces a minimum update interval of 10 seconds to balance accuracy and resource usage.
|
||||
|
||||
### Immediate updates on attachment failures
|
||||
|
||||
In addition to periodic updates, Kubernetes now reacts to attachment failures. Specifically, if a volume attachment fails with a `ResourceExhausted` error (gRPC code `8`), an immediate update is triggered to correct the allocatable count promptly.
|
||||
|
||||
This proactive correction prevents repeated scheduling errors and helps maintain cluster health.
|
||||
|
||||
## Getting started
|
||||
|
||||
To experiment with mutable CSI node allocatable count in your Kubernetes v1.33 cluster:
|
||||
|
||||
1. Enable the feature gate `MutableCSINodeAllocatableCount` on the `kube-apiserver` and `kubelet` components.
|
||||
2. Update your CSI driver configuration by setting `nodeAllocatableUpdatePeriodSeconds`.
|
||||
3. Monitor and observe improvements in scheduling accuracy and pod placement reliability.
|
||||
|
||||
## Next steps
|
||||
|
||||
This feature is currently in alpha and the Kubernetes community welcomes your feedback. Test it, share your experiences, and help guide its evolution toward beta and GA stability.
|
||||
|
||||
Join discussions in the [Kubernetes Storage Special Interest Group (SIG-Storage)](https://github.com/kubernetes/community/tree/master/sig-storage) to shape the future of Kubernetes storage capabilities.
|
|
@ -0,0 +1,127 @@
|
|||
---
|
||||
layout: blog
|
||||
title: 'Kubernetes v1.33: Prevent PersistentVolume Leaks When Deleting out of Order graduates to GA'
|
||||
date: 2025-05-05T10:30:00-08:00
|
||||
slug: kubernetes-v1-33-prevent-persistentvolume-leaks-when-deleting-out-of-order-graduate-to-ga
|
||||
author: >
|
||||
Deepak Kinni (Broadcom)
|
||||
---
|
||||
|
||||
I am thrilled to announce that the feature to prevent
|
||||
[PersistentVolume](/docs/concepts/storage/persistent-volumes/) (or PVs for short)
|
||||
leaks when deleting out of order has graduated to General Availability (GA) in
|
||||
Kubernetes v1.33! This improvement, initially introduced as a beta
|
||||
feature in Kubernetes v1.31, ensures that your storage resources are properly
|
||||
reclaimed, preventing unwanted leaks.
|
||||
|
||||
## How did reclaim work in previous Kubernetes releases?
|
||||
|
||||
[PersistentVolumeClaim](/docs/concepts/storage/persistent-volumes/#Introduction) (or PVC for short) is
|
||||
a user's request for storage. A PV and PVC are considered [Bound](/docs/concepts/storage/persistent-volumes/#Binding)
|
||||
if a newly created PV or a matching PV is found. The PVs themselves are
|
||||
backed by volumes allocated by the storage backend.
|
||||
|
||||
Normally, if the volume is to be deleted, then the expectation is to delete the
|
||||
PVC for a bound PV-PVC pair. However, there are no restrictions on deleting a PV
|
||||
before deleting a PVC.
|
||||
|
||||
For a `Bound` PV-PVC pair, the ordering of PV-PVC deletion determines whether
|
||||
the PV reclaim policy is honored. The reclaim policy is honored if the PVC is
|
||||
deleted first; however, if the PV is deleted prior to deleting the PVC, then the
|
||||
reclaim policy is not exercised. As a result of this behavior, the associated
|
||||
storage asset in the external infrastructure is not removed.
|
||||
|
||||
## PV reclaim policy with Kubernetes v1.33
|
||||
|
||||
With the graduation to GA in Kubernetes v1.33, this issue is now resolved. Kubernetes
|
||||
now reliably honors the configured `Delete` reclaim policy, even when PVs are deleted
|
||||
before their bound PVCs. This is achieved through the use of finalizers,
|
||||
ensuring that the storage backend releases the allocated storage resource as intended.
|
||||
|
||||
### How does it work?
|
||||
|
||||
For CSI volumes, the new behavior is achieved by adding a [finalizer](/docs/concepts/overview/working-with-objects/finalizers/) `external-provisioner.volume.kubernetes.io/finalizer`
|
||||
on new and existing PVs. The finalizer is only removed after the storage from the backend is deleted. Addition or removal of the finalizer is handled by the CSI `external-provisioner` sidecar.
|
||||
|
||||
Here is an example of a PV with the finalizer; notice the new finalizer in the finalizers list:
|
||||
|
||||
```shell
|
||||
kubectl get pv pvc-a7b7e3ba-f837-45ba-b243-dec7d8aaed53 -o yaml
|
||||
```
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
annotations:
|
||||
pv.kubernetes.io/provisioned-by: csi.example.driver.com
|
||||
creationTimestamp: "2021-11-17T19:28:56Z"
|
||||
finalizers:
|
||||
- kubernetes.io/pv-protection
|
||||
- external-provisioner.volume.kubernetes.io/finalizer
|
||||
name: pvc-a7b7e3ba-f837-45ba-b243-dec7d8aaed53
|
||||
resourceVersion: "194711"
|
||||
uid: 087f14f2-4157-4e95-8a70-8294b039d30e
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
capacity:
|
||||
storage: 1Gi
|
||||
claimRef:
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
name: example-vanilla-block-pvc
|
||||
namespace: default
|
||||
resourceVersion: "194677"
|
||||
uid: a7b7e3ba-f837-45ba-b243-dec7d8aaed53
|
||||
csi:
|
||||
driver: csi.example.driver.com
|
||||
fsType: ext4
|
||||
volumeAttributes:
|
||||
storage.kubernetes.io/csiProvisionerIdentity: 1637110610497-8081-csi.example.driver.com
|
||||
type: CNS Block Volume
|
||||
volumeHandle: 2dacf297-803f-4ccc-afc7-3d3c3f02051e
|
||||
persistentVolumeReclaimPolicy: Delete
|
||||
storageClassName: example-vanilla-block-sc
|
||||
volumeMode: Filesystem
|
||||
status:
|
||||
phase: Bound
|
||||
```
|
||||
|
||||
The [finalizer](/docs/concepts/overview/working-with-objects/finalizers/) prevents this
|
||||
PersistentVolume from being removed from the
|
||||
cluster. As stated previously, the finalizer is only removed from the PV object
|
||||
after it is successfully deleted from the storage backend. To learn more about
|
||||
finalizers, please refer to [Using Finalizers to Control Deletion](/blog/2021/05/14/using-finalizers-to-control-deletion/).
|
||||
|
||||
Similarly, the finalizer `kubernetes.io/pv-controller` is added to dynamically provisioned in-tree plugin volumes.
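For example, the metadata of such a PV would carry that finalizer alongside the usual PV protection finalizer (sketch):

```yaml
metadata:
  finalizers:
  - kubernetes.io/pv-protection
  - kubernetes.io/pv-controller
```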
|
||||
|
||||
### Important note
|
||||
|
||||
The fix does not apply to statically provisioned in-tree plugin volumes.
|
||||
|
||||
## How to enable the new behavior?
|
||||
|
||||
To take advantage of the new behavior, you must have upgraded your cluster to the v1.33 release of Kubernetes
|
||||
and run the CSI [`external-provisioner`](https://github.com/kubernetes-csi/external-provisioner) version `5.0.1` or later.
|
||||
The feature was released as beta in the v1.31 release of Kubernetes, where it was enabled by default.
|
||||
|
||||
## References
|
||||
|
||||
* [KEP-2644](https://github.com/kubernetes/enhancements/tree/master/keps/sig-storage/2644-honor-pv-reclaim-policy)
|
||||
* [Volume leak issue](https://github.com/kubernetes-csi/external-provisioner/issues/546)
|
||||
* [Beta Release Blog](/blog/2024/08/16/kubernetes-1-31-prevent-persistentvolume-leaks-when-deleting-out-of-order/)
|
||||
|
||||
## How do I get involved?
|
||||
|
||||
The Kubernetes Slack [SIG Storage communication channels](https://github.com/kubernetes/community/blob/master/sig-storage/README.md#contact) are a great way to reach out to the SIG Storage and migration working group teams.
|
||||
|
||||
Special thanks to the following people for the insightful reviews, thorough consideration and valuable contributions:
|
||||
|
||||
* Fan Baofa (carlory)
|
||||
* Jan Šafránek (jsafrane)
|
||||
* Xing Yang (xing-yang)
|
||||
* Matthew Wong (wongma7)
|
||||
|
||||
Join the [Kubernetes Storage Special Interest Group (SIG)](https://github.com/kubernetes/community/tree/master/sig-storage) if you're interested in getting involved with the design and development of CSI or any part of the Kubernetes Storage system. We’re rapidly growing and always welcome new contributors.
|
|
@ -0,0 +1,201 @@
|
|||
---
|
||||
layout: blog
|
||||
title: 'Kubernetes v1.33: Fine-grained SupplementalGroups Control Graduates to Beta'
|
||||
date: 2025-05-06T10:30:00-08:00
|
||||
slug: kubernetes-v1-33-fine-grained-supplementalgroups-control-beta
|
||||
author: >
|
||||
Shingo Omura (LY Corporation)
|
||||
|
||||
---
|
||||
|
||||
The new field, `supplementalGroupsPolicy`, was introduced as an opt-in alpha feature for Kubernetes v1.31 and has graduated to beta in v1.33; the corresponding feature gate (`SupplementalGroupsPolicy`) is now enabled by default. This feature enables more precise control over supplemental groups in containers, which can strengthen your security posture, particularly when accessing volumes. Moreover, it also improves the transparency of UID/GID details in containers, offering better security oversight.
|
||||
|
||||
Please be aware that this beta release contains a behavioral breaking change. See the [behavioral changes introduced in beta](#the-behavioral-changes-introduced-in-beta) and [upgrade consideration](#upgrade-consideration) sections for details.
|
||||
|
||||
## Motivation: Implicit group memberships defined in `/etc/group` in the container image
|
||||
|
||||
Although the majority of Kubernetes cluster admins/users may not be aware of it, Kubernetes, by default, _merges_ group information from the Pod with information defined in `/etc/group` in the container image.
|
||||
|
||||
Let's look at an example. The Pod manifest below specifies `runAsUser=1000`, `runAsGroup=3000` and `supplementalGroups=4000` in the Pod's security context.
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: implicit-groups
|
||||
spec:
|
||||
securityContext:
|
||||
runAsUser: 1000
|
||||
runAsGroup: 3000
|
||||
supplementalGroups: [4000]
|
||||
containers:
|
||||
- name: ctr
|
||||
image: registry.k8s.io/e2e-test-images/agnhost:2.45
|
||||
command: [ "sh", "-c", "sleep 1h" ]
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
```
|
||||
|
||||
What is the result of the `id` command in the `ctr` container? The output should be similar to this:
|
||||
|
||||
```none
|
||||
uid=1000 gid=3000 groups=3000,4000,50000
|
||||
```
|
||||
|
||||
Where does group ID `50000` in supplementary groups (`groups` field) come from, even though `50000` is not defined in the Pod's manifest at all? The answer is the `/etc/group` file in the container image.
|
||||
|
||||
Checking the contents of `/etc/group` in the container image shows the following:
|
||||
|
||||
```none
|
||||
user-defined-in-image:x:1000:
|
||||
group-defined-in-image:x:50000:user-defined-in-image
|
||||
```
|
||||
|
||||
This shows that the container's primary user `1000` belongs to the group `50000` in the last entry.
|
||||
|
||||
Thus, the group membership defined in `/etc/group` in the container image for the container's primary user is _implicitly_ merged to the information from the Pod. Please note that this was a design decision the current CRI implementations inherited from Docker, and the community never really reconsidered it until now.
|
||||
|
||||
### What's wrong with it?
|
||||
|
||||
The _implicitly_ merged group information from `/etc/group` in the container image poses a security risk. These implicit GIDs can't be detected or validated by policy engines because there's no record of them in the Pod manifest. This can lead to unexpected access control issues, particularly when accessing volumes (see [kubernetes/kubernetes#112879](https://issue.k8s.io/112879) for details) because file permission is controlled by UID/GIDs in Linux.
|
||||
|
||||
## Fine-grained supplemental groups control in a Pod: `supplementalGroupsPolicy`
|
||||
|
||||
To tackle the above problem, Pod's `.spec.securityContext` now includes `supplementalGroupsPolicy` field.
|
||||
|
||||
This field lets you control how Kubernetes calculates the supplementary groups for container processes within a Pod. The available policies are:
|
||||
|
||||
* _Merge_: The group membership defined in `/etc/group` for the container's primary user will be merged. If the field is not specified, this policy is applied (that is, the existing behavior is kept for backward compatibility).
|
||||
|
||||
* _Strict_: Only the group IDs specified in `fsGroup`, `supplementalGroups`, or `runAsGroup` are attached as supplementary groups to the container processes. Group memberships defined in `/etc/group` for the container's primary user are ignored.
|
||||
|
||||
Let's see how the `Strict` policy works. The Pod manifest below specifies `supplementalGroupsPolicy: Strict`:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: strict-supplementalgroups-policy
|
||||
spec:
|
||||
securityContext:
|
||||
runAsUser: 1000
|
||||
runAsGroup: 3000
|
||||
supplementalGroups: [4000]
|
||||
supplementalGroupsPolicy: Strict
|
||||
containers:
|
||||
- name: ctr
|
||||
image: registry.k8s.io/e2e-test-images/agnhost:2.45
|
||||
command: [ "sh", "-c", "sleep 1h" ]
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
```
|
||||
|
||||
The result of the `id` command in the `ctr` container should be similar to this:
|
||||
|
||||
```none
|
||||
uid=1000 gid=3000 groups=3000,4000
|
||||
```
|
||||
|
||||
You can see that the `Strict` policy excludes group `50000` from `groups`!
|
||||
|
||||
Thus, ensuring `supplementalGroupsPolicy: Strict` (enforced by some policy mechanism) helps prevent implicit supplementary groups from being attached to a Pod.
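One possible shape for such a policy mechanism is a ValidatingAdmissionPolicy; the sketch below (policy name hypothetical, the required ValidatingAdmissionPolicyBinding omitted) rejects Pods that do not explicitly request the `Strict` policy:

```yaml
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
  name: require-strict-supplementalgroupspolicy   # hypothetical name
spec:
  failurePolicy: Fail
  matchConstraints:
    resourceRules:
    - apiGroups: [""]
      apiVersions: ["v1"]
      operations: ["CREATE", "UPDATE"]
      resources: ["pods"]
  validations:
  - expression: >-
      has(object.spec.securityContext) &&
      has(object.spec.securityContext.supplementalGroupsPolicy) &&
      object.spec.securityContext.supplementalGroupsPolicy == 'Strict'
    message: "supplementalGroupsPolicy must be set to Strict"
```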
|
||||
|
||||
{{<note>}}
|
||||
A container with sufficient privileges can change its process identity. The `supplementalGroupsPolicy` field only affects the initial process identity. See the following section for details.
|
||||
{{</note>}}
|
||||
|
||||
## Attached process identity in Pod status
|
||||
|
||||
This feature also exposes the process identity attached to the first container process of the container
|
||||
via the `.status.containerStatuses[].user.linux` field. This makes it easy to check whether implicit group IDs are attached.
|
||||
|
||||
```yaml
|
||||
...
|
||||
status:
|
||||
containerStatuses:
|
||||
- name: ctr
|
||||
user:
|
||||
linux:
|
||||
gid: 3000
|
||||
supplementalGroups:
|
||||
- 3000
|
||||
- 4000
|
||||
uid: 1000
|
||||
...
|
||||
```
|
||||
|
||||
{{<note>}}
|
||||
Please note that the value of the `status.containerStatuses[].user.linux` field is _the initially attached_
|
||||
process identity of the first container process in the container. If the container has sufficient privilege
|
||||
to call system calls related to process identity (e.g. [`setuid(2)`](https://man7.org/linux/man-pages/man2/setuid.2.html), [`setgid(2)`](https://man7.org/linux/man-pages/man2/setgid.2.html) or [`setgroups(2)`](https://man7.org/linux/man-pages/man2/setgroups.2.html), etc.), the container process can change its identity. Thus, the _actual_ process identity will be dynamic.
|
||||
{{</note>}}
|
||||
|
||||
## `Strict` Policy requires newer CRI versions
|
||||
|
||||
The CRI runtime (e.g. containerd, CRI-O) plays a core role in calculating the supplementary group IDs to be attached to the containers. Thus, `SupplementalGroupsPolicy=Strict` requires a CRI runtime that supports this feature (`SupplementalGroupsPolicy: Merge` works even with a CRI runtime that does not support this feature, because that policy is fully backward compatible).
|
||||
|
||||
Here are some CRI runtimes that support this feature, and the versions you need
|
||||
to be running:
|
||||
|
||||
- containerd: v2.0 or later
|
||||
- CRI-O: v1.31 or later
|
||||
|
||||
You can check whether the feature is supported on a given node by looking at the Node's `.status.features.supplementalGroupsPolicy` field.
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Node
|
||||
...
|
||||
status:
|
||||
features:
|
||||
supplementalGroupsPolicy: true
|
||||
```
|
||||
|
||||
## The behavioral changes introduced in beta
|
||||
|
||||
In the alpha release, when a Pod with `supplementalGroupsPolicy: Strict` was scheduled to a node that did not support the feature (i.e., `.status.features.supplementalGroupsPolicy=false`), the Pod's supplemental groups policy silently fell back to `Merge`.
|
||||
|
||||
In v1.33, now that the feature is beta, the policy is enforced more strictly: the kubelet rejects pods whose nodes cannot ensure the specified policy. If your pod is rejected, you will see warning events with `reason=SupplementalGroupsPolicyNotSupported` like the one below:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Event
|
||||
...
|
||||
type: Warning
|
||||
reason: SupplementalGroupsPolicyNotSupported
|
||||
message: "SupplementalGroupsPolicy=Strict is not supported in this node"
|
||||
involvedObject:
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
...
|
||||
```
|
||||
|
||||
## Upgrade consideration
|
||||
|
||||
If you're already using this feature, especially the `supplementalGroupsPolicy: Strict` policy, we assume that your cluster's CRI runtimes already support this feature. In that case, you don't need to worry about the pod rejections described above.
|
||||
|
||||
However, if your cluster:
|
||||
|
||||
- uses the `supplementalGroupsPolicy: Strict` policy, but
|
||||
- its CRI runtimes do NOT yet support the feature (i.e., `.status.features.supplementalGroupsPolicy=false`),
|
||||
|
||||
you need to prepare for the behavioral change (pod rejection) when upgrading your cluster.
|
||||
|
||||
We recommend several ways to avoid unexpected pod rejections:
|
||||
|
||||
- Upgrading your cluster's CRI runtimes together with Kubernetes, or before the Kubernetes upgrade
|
||||
- Labeling your nodes to indicate whether their CRI runtime supports this feature, and adding a node selector to pods that use the `Strict` policy so that they only land on supporting nodes (in this case you will need to monitor the number of `Pending` pods instead of pod rejections); a hedged sketch of this approach follows this list.
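Here is that labeling approach sketched out; the label key and value are hypothetical and something you would apply yourself after verifying the CRI runtime version on each node:

```yaml
# Applied out of band, for example:
#   kubectl label node worker-1 example.com/supports-supplementalgroupspolicy=true
apiVersion: v1
kind: Pod
metadata:
  name: strict-policy-pod
spec:
  nodeSelector:
    example.com/supports-supplementalgroupspolicy: "true"
  securityContext:
    runAsUser: 1000
    runAsGroup: 3000
    supplementalGroupsPolicy: Strict
  containers:
  - name: ctr
    image: registry.k8s.io/e2e-test-images/agnhost:2.45
    command: ["sh", "-c", "sleep 1h"]
```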
|
||||
|
||||
## Getting involved
|
||||
|
||||
This feature is driven by the [SIG Node](https://github.com/kubernetes/community/tree/master/sig-node) community.
|
||||
Please join us to connect with the community and share your ideas and feedback around the above feature and
|
||||
beyond. We look forward to hearing from you!
|
||||
|
||||
## How can I learn more?
|
||||
|
||||
<!-- https://github.com/kubernetes/website/pull/46920 -->
|
||||
- [Configure a Security Context for a Pod or Container](/docs/tasks/configure-pod-container/security-context/)
|
||||
for the further details of `supplementalGroupsPolicy`
|
||||
- [KEP-3619: Fine-grained SupplementalGroups control](https://github.com/kubernetes/enhancements/issues/3619)
|
||||
|
|
@ -0,0 +1,162 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: From Secrets to Service Accounts: Kubernetes Image Pulls Evolved"
|
||||
date: 2025-05-07T10:30:00-08:00
|
||||
slug: kubernetes-v1-33-wi-for-image-pulls
|
||||
author: >
|
||||
[Anish Ramasekar](https://github.com/aramase) (Microsoft)
|
||||
---
|
||||
|
||||
Kubernetes has steadily evolved to reduce reliance on long-lived credentials
|
||||
stored in the API.
|
||||
A prime example of this shift is the transition of Kubernetes Service Account (KSA) tokens
|
||||
from long-lived, static tokens to ephemeral, automatically rotated tokens
|
||||
with OpenID Connect (OIDC)-compliant semantics.
|
||||
This advancement enables workloads to securely authenticate with external services
|
||||
without needing persistent secrets.
|
||||
|
||||
However, one major gap remains: **image pull authentication**.
|
||||
Today, Kubernetes clusters rely on image pull secrets stored in the API,
|
||||
which are long-lived and difficult to rotate,
|
||||
or on node-level kubelet credential providers,
|
||||
which allow any pod running on a node to access the same credentials.
|
||||
This presents security and operational challenges.
|
||||
|
||||
To address this, Kubernetes is introducing **Service Account Token Integration
|
||||
for Kubelet Credential Providers**, now available in **alpha**.
|
||||
This enhancement allows credential providers to use pod-specific service account tokens
|
||||
to obtain registry credentials, which kubelet can then use for image pulls —
|
||||
eliminating the need for long-lived image pull secrets.
|
||||
|
||||
## The problem with image pull secrets
|
||||
|
||||
Currently, Kubernetes administrators have two primary options
|
||||
for handling private container image pulls:
|
||||
|
||||
1. **Image pull secrets stored in the Kubernetes API**
|
||||
- These secrets are often long-lived because they are hard to rotate.
|
||||
- They must be explicitly attached to a service account or pod.
|
||||
- Compromise of a pull secret can lead to unauthorized image access.
|
||||
|
||||
2. **Kubelet credential providers**
|
||||
- These providers fetch credentials dynamically at the node level.
|
||||
- Any pod running on the node can access the same credentials.
|
||||
- There’s no per-workload isolation, increasing security risks.
|
||||
|
||||
Neither approach aligns with the principles of **least privilege**
|
||||
or **ephemeral authentication**, leaving Kubernetes with a security gap.
|
||||
|
||||
## The solution: Service Account token integration for Kubelet credential providers
|
||||
|
||||
This new enhancement enables kubelet credential providers
|
||||
to use **workload identity** when fetching image registry credentials.
|
||||
Instead of relying on long-lived secrets, credential providers can use
|
||||
service account tokens to request short-lived credentials
|
||||
tied to a specific pod’s identity.
|
||||
|
||||
This approach provides:
|
||||
|
||||
- **Workload-specific authentication**:
|
||||
Image pull credentials are scoped to a particular workload.
|
||||
- **Ephemeral credentials**:
|
||||
Tokens are automatically rotated, eliminating the risks of long-lived secrets.
|
||||
- **Seamless integration**:
|
||||
Works with existing Kubernetes authentication mechanisms,
|
||||
aligning with cloud-native security best practices.
|
||||
|
||||
## How it works
|
||||
|
||||
### 1. Service Account tokens for credential providers
|
||||
|
||||
Kubelet generates **short-lived, automatically rotated** tokens for service accounts
|
||||
if the credential provider it communicates with has opted into receiving
|
||||
a service account token for image pulls.
|
||||
These tokens conform to OIDC ID token semantics
|
||||
and are provided to the credential provider
|
||||
as part of the `CredentialProviderRequest`.
|
||||
The credential provider can then use this token
|
||||
to authenticate with an external service.
|
||||
|
||||
### 2. Image registry authentication flow
|
||||
|
||||
- When a pod starts, the kubelet requests credentials from a **credential provider**.
|
||||
- If the credential provider has opted in,
|
||||
the kubelet generates a **service account token** for the pod.
|
||||
- The **service account token is included in the `CredentialProviderRequest`**,
|
||||
allowing the credential provider to authenticate
|
||||
and exchange it for **temporary image pull credentials**
|
||||
from a registry (e.g. AWS ECR, GCP Artifact Registry, Azure ACR).
|
||||
- The kubelet then uses these credentials
|
||||
to pull images on behalf of the pod.
|
||||
|
||||
## Benefits of this approach
|
||||
|
||||
- **Security**:
|
||||
Eliminates long-lived image pull secrets, reducing attack surfaces.
|
||||
- **Granular Access Control**:
|
||||
Credentials are tied to individual workloads rather than entire nodes or clusters.
|
||||
- **Operational Simplicity**:
|
||||
No need for administrators to manage and rotate image pull secrets manually.
|
||||
- **Improved Compliance**:
|
||||
Helps organizations meet security policies
|
||||
that prohibit persistent credentials in the cluster.
|
||||
|
||||
## What's next?
|
||||
|
||||
For Kubernetes **v1.34**, we expect to ship this feature in **beta**
|
||||
while continuing to gather feedback from users.
|
||||
|
||||
In the coming releases, we will focus on:
|
||||
|
||||
- Implementing **caching mechanisms**
|
||||
to improve performance for token generation.
|
||||
- Giving more **flexibility to credential providers**
|
||||
to decide how the registry credentials returned to the kubelet are cached.
|
||||
- Making the feature work with
|
||||
[Ensure Secret Pulled Images](https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2535-ensure-secret-pulled-images)
|
||||
to ensure pods that use an image
|
||||
are authorized to access that image
|
||||
when service account tokens are used for authentication.
|
||||
|
||||
You can learn more about this feature
|
||||
on the [service account token for image pulls](/docs/tasks/administer-cluster/kubelet-credential-provider/#service-account-token-for-image-pulls)
|
||||
page in the Kubernetes documentation.
|
||||
|
||||
You can also follow along on the
|
||||
[KEP-4412](https://kep.k8s.io/4412)
|
||||
to track progress across the coming Kubernetes releases.
|
||||
|
||||
## Try it out
|
||||
|
||||
To try out this feature:
|
||||
|
||||
1. **Ensure you are running Kubernetes v1.33 or later**.
|
||||
2. **Enable the `ServiceAccountTokenForKubeletCredentialProviders` feature gate**
|
||||
on the kubelet.
|
||||
3. **Ensure credential provider support**:
|
||||
Modify or update your credential provider
|
||||
to use service account tokens for authentication.
|
||||
4. **Update the credential provider configuration**
|
||||
to opt into receiving service account tokens
|
||||
for the credential provider by configuring the `tokenAttributes` field (a hedged sketch follows this list).
|
||||
5. **Deploy a pod**
|
||||
that uses the credential provider to pull images from a private registry.
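As a rough sketch of step 4, a kubelet `CredentialProviderConfig` that opts a provider into receiving service account tokens could look roughly like the following; the provider name, image pattern, and audience are hypothetical placeholders, and the `tokenAttributes` field names follow KEP-4412, so double-check them against the v1.33 documentation:

```yaml
apiVersion: kubelet.config.k8s.io/v1
kind: CredentialProviderConfig
providers:
- name: example-registry-credential-provider   # hypothetical provider binary on the node
  apiVersion: credentialprovider.kubelet.k8s.io/v1
  matchImages:
  - "*.registry.example.com"
  defaultCacheDuration: "10m"
  tokenAttributes:
    serviceAccountTokenAudience: registry.example.com   # audience the provider exchanges the token for
    requireServiceAccount: true
```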
|
||||
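As a reference point for steps 3 and 4, the snippet below is a minimal sketch of a kubelet `CredentialProviderConfig` that opts a provider into receiving service account tokens. The provider name, image pattern, and audience are placeholders, and the `tokenAttributes` sub-fields shown (`serviceAccountTokenAudience`, `requireServiceAccount`) follow KEP-4412; consult the kubelet credential provider documentation for the authoritative schema:

```yaml
apiVersion: kubelet.config.k8s.io/v1
kind: CredentialProviderConfig
providers:
  - name: example-registry-credential-provider    # hypothetical provider binary name
    apiVersion: credentialprovider.kubelet.k8s.io/v1
    matchImages:
      - "*.registry.example.com"
    defaultCacheDuration: "0s"
    tokenAttributes:
      serviceAccountTokenAudience: "registry.example.com"  # audience the provider exchanges the token for
      requireServiceAccount: true                           # only invoke the provider when a token can be minted
```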
|
||||
We would love to hear your feedback on this feature.
|
||||
Please reach out to us on the
|
||||
[#sig-auth-authenticators-dev](https://kubernetes.slack.com/archives/C04UMAUC4UA)
|
||||
channel on Kubernetes Slack
|
||||
(for an invitation, visit [https://slack.k8s.io/](https://slack.k8s.io/)).
|
||||
|
||||
## How to get involved
|
||||
|
||||
If you are interested in getting involved
|
||||
in the development of this feature,
|
||||
sharing feedback, or participating in any other ongoing **SIG Auth** projects,
|
||||
please reach out on the
|
||||
[#sig-auth](https://kubernetes.slack.com/archives/C0EN96KUY)
|
||||
channel on Kubernetes Slack.
|
||||
|
||||
You are also welcome to join the bi-weekly
|
||||
[SIG Auth meetings](https://github.com/kubernetes/community/blob/master/sig-auth/README.md#meetings),
|
||||
held every other Wednesday.
|
|
@ -0,0 +1,79 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes 1.33: Volume Populators Graduate to GA"
|
||||
date: 2025-05-08T10:30:00-08:00
|
||||
slug: kubernetes-v1-33-volume-populators-ga
|
||||
author: >
|
||||
Danna Wang (Google)
|
||||
Sunny Song (Google)
|
||||
---
|
||||
|
||||
Kubernetes _volume populators_ are now generally available (GA)! The `AnyVolumeDataSource` feature
|
||||
gate is treated as always enabled for Kubernetes v1.33, which means that users can specify any appropriate
|
||||
[custom resource](/docs/concepts/extend-kubernetes/api-extension/custom-resources/#custom-resources)
|
||||
as the data source of a PersistentVolumeClaim (PVC).
|
||||
|
||||
An example of how to use dataSourceRef in PVC:
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: pvc1
|
||||
spec:
|
||||
...
|
||||
dataSourceRef:
|
||||
apiGroup: provider.example.com
|
||||
kind: Provider
|
||||
name: provider1
|
||||
```
|
||||
|
||||
## What is new
|
||||
|
||||
There are four major enhancements from beta.
|
||||
|
||||
### Populator Pod is optional
|
||||
|
||||
During the beta phase, contributors to Kubernetes identified potential resource leaks with PersistentVolumeClaim (PVC) deletion while volume population was in progress; these leaks happened due to limitations in finalizer handling.
|
||||
Ahead of the graduation to general availability, the Kubernetes project added support to delete temporary resources (PVC prime, etc.) if the original PVC is deleted.
|
||||
|
||||
To accommodate this, we've introduced three new plugin-based functions:
|
||||
* `PopulateFn()`: Executes the provider-specific data population logic.
|
||||
* `PopulateCompleteFn()`: Checks if the data population operation has finished successfully.
|
||||
* `PopulateCleanupFn()`: Cleans up temporary resources created by the provider-specific functions after data population is completed.
|
||||
|
||||
A provider example is added in [lib-volume-populator/example](https://github.com/kubernetes-csi/lib-volume-populator/tree/master/example).
|
||||
|
||||
### Mutator functions to modify the Kubernetes resources
|
||||
|
||||
For GA, the CSI volume populator controller code gained a `MutatorConfig`, allowing the specification of mutator functions to modify Kubernetes resources.
|
||||
For example, if the PVC prime is not an exact copy of the PVC and you need provider-specific information for the driver, you can include this information in the optional `MutatorConfig`.
|
||||
This allows you to customize the Kubernetes objects in the volume populator.
|
||||
|
||||
### Flexible metric handling for providers
|
||||
|
||||
Our beta phase highlighted a new requirement: the need to aggregate metrics not just from lib-volume-populator, but also from other components within the provider's codebase.
|
||||
|
||||
To address this, SIG Storage introduced a [provider metric manager](https://github.com/kubernetes-csi/lib-volume-populator/blob/8a922a5302fdba13a6c27328ee50e5396940214b/populator-machinery/controller.go#L122).
|
||||
This enhancement delegates the implementation of metrics logic to the provider itself, rather than relying solely on lib-volume-populator.
|
||||
This shift provides greater flexibility and control over metrics collection and aggregation, enabling a more comprehensive view of provider performance.
|
||||
|
||||
### Clean up for temporary resources
|
||||
|
||||
During the beta phase, we identified potential resource leaks with PersistentVolumeClaim (PVC) deletion while volume population was in progress, due to limitations in finalizer handling. In this GA release, we have improved the populator to support the deletion of temporary resources (PVC prime, etc.) if the original PVC is deleted.
|
||||
|
||||
## How to use it
|
||||
|
||||
To try it out, please follow the [steps](/blog/2022/05/16/volume-populators-beta/#trying-it-out) in the previous beta blog.
|
||||
|
||||
## Future directions and potential feature requests
|
||||
|
||||
As a next step, there are several potential feature requests for the volume populator:
|
||||
|
||||
* Multi sync: the current implementation is a one-time unidirectional sync from source to destination. This can be extended to support multiple syncs, enabling periodic syncs or allowing users to sync on demand
|
||||
* Bidirectional sync: an extension of multi sync above, but making it bidirectional between source and destination
|
||||
* Populate data with priorities: with a list of different dataSourceRef, populate based on priorities
|
||||
* Populate data from multiple sources of the same provider: populate multiple different sources to one destination
|
||||
* Populate data from multiple sources of the different providers: populate multiple different sources to one destination, pipelining different resources’ population
|
||||
|
||||
To ensure we're building something truly valuable, Kubernetes SIG Storage would love to hear about any specific use cases you have in mind for this feature.
|
||||
For any inquiries or specific questions related to volume populator, please reach out to the [SIG Storage community](https://github.com/kubernetes/community/tree/master/sig-storage).
|
|
@ -0,0 +1,53 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: Streaming List responses"
|
||||
date: 2025-05-09T10:30:00-08:00
|
||||
slug: kubernetes-v1-33-streaming-list-responses
|
||||
author: >
|
||||
Marek Siarkowicz (Google),
|
||||
Wei Fu (Microsoft)
|
||||
---
|
||||
|
||||
Managing Kubernetes cluster stability becomes increasingly critical as your infrastructure grows. One of the most challenging aspects of operating large-scale clusters has been handling List requests that fetch substantial datasets - a common operation that could unexpectedly impact your cluster's stability.
|
||||
|
||||
Today, the Kubernetes community is excited to announce a significant architectural improvement: streaming encoding for List responses.
|
||||
|
||||
|
||||
## The problem: unnecessary memory consumption with large resources
|
||||
|
||||
Current API response encoders just serialize an entire response into a single contiguous memory block and perform one [ResponseWriter.Write](https://pkg.go.dev/net/http#ResponseWriter.Write) call to transmit data to the client. Despite HTTP/2's capability to split responses into smaller frames for transmission, the underlying HTTP server continues to hold the complete response data as a single buffer. Even as individual frames are transmitted to the client, the memory associated with these frames cannot be freed incrementally.
|
||||
|
||||
When the cluster size grows, the single response body can be substantial - hundreds of megabytes in size. At large scale, the current approach becomes particularly inefficient, as it prevents incremental memory release during transmission. Imagine that network congestion occurs: that large response body’s memory block stays active for tens of seconds or even minutes. This limitation leads to unnecessarily high and prolonged memory consumption in the kube-apiserver process. If multiple large List requests occur simultaneously, the cumulative memory consumption can escalate rapidly, potentially leading to an Out-of-Memory (OOM) situation that compromises cluster stability.
|
||||
|
||||
The encoding/json package uses sync.Pool to reuse memory buffers during serialization. While efficient for consistent workloads, this mechanism creates challenges with sporadic large List responses. When processing these large responses, memory pools expand significantly. But due to sync.Pool's design, these oversized buffers remain reserved after use. Subsequent small List requests continue utilizing these large memory allocations, preventing garbage collection and maintaining persistently high memory consumption in the kube-apiserver even after the initial large responses complete.
|
||||
|
||||
Additionally, [Protocol Buffers](https://github.com/protocolbuffers/protocolbuffers.github.io/blob/c14731f55296f8c6367faa4f2e55a3d3594544c6/content/programming-guides/techniques.md?plain=1#L39) are not designed to handle large datasets, but they are great for handling **individual** messages within a large data set. This highlights the need for streaming-based approaches that can process and transmit large collections incrementally rather than as monolithic blocks.
|
||||
|
||||
> _As a general rule of thumb, if you are dealing in messages larger than a megabyte each, it may be time to consider an alternate strategy._
|
||||
>
|
||||
> _From https://protobuf.dev/programming-guides/techniques/_
|
||||
|
||||
|
||||
## Streaming encoder for List responses
|
||||
|
||||
The streaming encoding mechanism is specifically designed for List responses, leveraging their common well-defined collection structures. The core idea focuses exclusively on the **Items** field within collection structures, which represents the bulk of memory consumption in large responses. Rather than encoding the entire **Items** array as one contiguous memory block, the new streaming encoder processes and transmits each item individually, allowing memory to be freed progressively as each frame or chunk is transmitted. As a result, encoding items one by one significantly reduces the memory footprint required by the API server.
|
||||
|
||||
With Kubernetes objects typically limited to 1.5 MiB (from ETCD), streaming encoding keeps memory consumption predictable and manageable regardless of how many objects are in a List response. The result is significantly improved API server stability, reduced memory spikes, and better overall cluster performance - especially in environments where multiple large List operations might occur simultaneously.
|
||||
|
||||
To ensure perfect backward compatibility, the streaming encoder validates Go struct tags rigorously before activation, guaranteeing byte-for-byte consistency with the original encoder. Standard encoding mechanisms process all fields except **Items**, maintaining identical output formatting throughout. This approach seamlessly supports all Kubernetes List types—from built-in **\*List** objects to Custom Resource **UnstructuredList** objects - requiring zero client-side modifications or awareness that the underlying encoding method has changed.
|
||||
|
||||
## Performance gains you'll notice
|
||||
|
||||
* **Reduced Memory Consumption:** Significantly lowers the memory footprint of the API server when handling large **list** requests,
|
||||
especially when dealing with **large resources**.
|
||||
* **Improved Scalability:** Enables the API server to handle more concurrent requests and larger datasets without running out of memory.
|
||||
* **Increased Stability:** Reduces the risk of OOM kills and service disruptions.
|
||||
* **Efficient Resource Utilization:** Optimizes memory usage and improves overall resource efficiency.
|
||||
|
||||
## Benchmark results
|
||||
|
||||
To validate the results, Kubernetes has introduced a new **list** benchmark which concurrently executes 10 **list** requests, each returning 1 GB of data.
|
||||
|
||||
The benchmark showed a 20x improvement, reducing memory usage from 70-80 GB to 3 GB.
|
||||
|
||||
{{< figure src="results.png" alt="Screenshot of a K8s performance dashboard showing memory usage for benchmark list going down from 60GB to 3GB" caption="List benchmark memory usage" >}}
|
After Width: | Height: | Size: 1.1 MiB |
After Width: | Height: | Size: 32 KiB |
|
@ -0,0 +1,136 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: Image Pull Policy the way you always thought it worked!"
|
||||
date: 2025-05-12T10:30:00-08:00
|
||||
slug: kubernetes-v1-33-ensure-secret-pulled-images-alpha
|
||||
author: >
|
||||
[Ben Petersen](https://github.com/benjaminapetersen) (Microsoft),
|
||||
[Stanislav Láznička](https://github.com/stlaz) (Microsoft)
|
||||
---
|
||||
|
||||
## Image Pull Policy the way you always thought it worked!
|
||||
|
||||
Some things in Kubernetes are surprising, and the way `imagePullPolicy` behaves might
|
||||
be one of them. Given Kubernetes is all about running pods, it may be peculiar
|
||||
to learn that there has been a caveat to restricting pod access to authenticated images for
|
||||
over 10 years in the form of [issue 18787](https://github.com/kubernetes/kubernetes/issues/18787)!
|
||||
It is an exciting release when you can resolve a ten-year-old issue.
|
||||
|
||||
{{< note >}}
|
||||
Throughout this blog post, the term "pod credentials" will be used often. In this context,
|
||||
the term generally encapsulates the authentication material that is available to a pod
|
||||
to authenticate a container image pull.
|
||||
{{< /note >}}
|
||||
|
||||
## IfNotPresent, even if I'm not supposed to have it
|
||||
|
||||
The gist of the problem is that the `imagePullPolicy: IfNotPresent` strategy has done
|
||||
precisely what it says, and nothing more. Let's set up a scenario. To begin, *Pod A* in *Namespace X* is scheduled to *Node 1* and requires *image Foo* from a private repository.
|
||||
For its image pull authentication material, the pod references *Secret 1* in its `imagePullSecrets`. *Secret 1* contains the necessary credentials to pull from the private repository. The Kubelet will utilize the credentials from *Secret 1* as supplied by *Pod A*
|
||||
and it will pull *container image Foo* from the registry. This is the intended (and secure)
|
||||
behavior.
|
||||
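For illustration, a minimal manifest for *Pod A* might look like the following; the names, namespace, and registry are hypothetical placeholders:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: pod-a
  namespace: namespace-x
spec:
  containers:
    - name: app
      image: registry.example.com/foo:1.0   # the private "image Foo"
      imagePullPolicy: IfNotPresent
  imagePullSecrets:
    - name: secret-1                        # credentials for the private repository
```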
|
||||
But now things get curious. If *Pod B* in *Namespace Y* happens to also be scheduled to *Node 1*, unexpected (and potentially insecure) things happen. *Pod B* may reference the same private image, specifying the `IfNotPresent` image pull policy. *Pod B* does not reference *Secret 1*
|
||||
(or in our case, any secret) in its `imagePullSecrets`. When the Kubelet tries to run the pod, it honors the `IfNotPresent` policy. The Kubelet sees that the *image Foo* is already present locally, and will provide *image Foo* to *Pod B*. *Pod B* gets to run the image even though it did not provide credentials authorizing it to pull the image in the first place.
|
||||
|
||||
{{< figure
|
||||
src="ensure_secret_image_pulls.svg"
|
||||
caption="Using a private image pulled by a different pod"
|
||||
alt="Illustration of the process of two pods trying to access a private image, the first one with a pull secret, the second one without it"
|
||||
>}}
|
||||
|
||||
While `IfNotPresent` should not pull *image Foo* if it is already present
|
||||
on the node, it is an incorrect security posture to allow all pods scheduled
|
||||
to a node to have access to a previously pulled private image. These pods were never
|
||||
authorized to pull the image in the first place.
|
||||
|
||||
## IfNotPresent, but only if I am supposed to have it
|
||||
|
||||
In Kubernetes v1.33, we - SIG Auth and SIG Node - have finally started to address this (really old) problem and to get the verification right! The basic expected behavior is unchanged. If
|
||||
an image is not present, the Kubelet will attempt to pull the image. The credentials each pod supplies will be utilized for this task. This matches behavior prior to 1.33.
|
||||
|
||||
If the image is present, then the behavior of the Kubelet changes. The Kubelet will now
|
||||
verify the pod's credentials before allowing the pod to use the image.
|
||||
|
||||
Performance and service stability have been a consideration while revising the feature.
|
||||
Pods utilizing the same credential will not be required to re-authenticate. This is
|
||||
also true when pods source credentials from the same Kubernetes Secret object, even
|
||||
when the credentials are rotated.
|
||||
|
||||
## Never pull, but use if authorized
|
||||
|
||||
The `imagePullPolicy: Never` option does not fetch images. However, if the
|
||||
container image is already present on the node, any pod attempting to use the private
|
||||
image will be required to provide credentials, and those credentials require verification.
|
||||
|
||||
Pods utilizing the same credential will not be required to re-authenticate.
|
||||
Pods that do not supply credentials previously used to successfully pull an
|
||||
image will not be allowed to use the private image.
|
||||
|
||||
## Always pull, if authorized
|
||||
|
||||
The `imagePullPolicy: Always` has always worked as intended. Each time an image
|
||||
is requested, the request goes to the registry and the registry will perform an authentication
|
||||
check.
|
||||
|
||||
In the past, forcing the `Always` image pull policy via pod admission was the only way to ensure
|
||||
that your private container images didn't get reused by other pods on nodes which already pulled the images.
|
||||
|
||||
Fortunately, this was somewhat performant. Only the image manifest was pulled, not the image. However, there was still a cost and a risk. During a new rollout, scale up, or pod restart, the image registry that provided the image MUST be available for the auth check, putting the image registry in the critical path for stability of services running inside of the cluster.
|
||||
|
||||
## How it all works
|
||||
|
||||
The feature is based on persistent, file-based caches that are present on each of
|
||||
the nodes. The following is a simplified description of how the feature works.
|
||||
For the complete version, please see [KEP-2535](https://kep.k8s.io/2535).
|
||||
|
||||
The process of requesting an image for the first time goes like this:
|
||||
1. A pod requesting an image from a private registry is scheduled to a node.
|
||||
1. The image is not present on the node.
|
||||
1. The Kubelet makes a record of the intention to pull the image.
|
||||
1. The Kubelet extracts credentials from the Kubernetes Secret referenced by the pod
|
||||
as an image pull secret, and uses them to pull the image from the private registry.
|
||||
1. After the image has been successfully pulled, the Kubelet makes a record of
|
||||
the successful pull. This record includes details about credentials used
|
||||
(in the form of a hash) as well as the Secret from which they originated.
|
||||
1. The Kubelet removes the original record of intent.
|
||||
1. The Kubelet retains the record of successful pull for later use.
|
||||
|
||||
When future pods scheduled to the same node request the previously pulled private image:
|
||||
1. The Kubelet checks the credentials that the new pod provides for the pull.
|
||||
1. If the hash of these credentials, or the source Secret of the credentials match
|
||||
the hash or source Secret which were recorded for a previous successful pull,
|
||||
the pod is allowed to use the previously pulled image.
|
||||
1. If the credentials or their source Secret are not found in the records of
|
||||
successful pulls for that image, the Kubelet will attempt to use
|
||||
these new credentials to request a pull from the remote registry, triggering
|
||||
the authorization flow.
|
||||
|
||||
## Try it out
|
||||
|
||||
In Kubernetes v1.33 we shipped the alpha version of this feature. To give it a spin,
|
||||
enable the `KubeletEnsureSecretPulledImages` feature gate for your 1.33 Kubelets.
|
||||
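If you manage the kubelet through its configuration file, a minimal sketch of enabling the gate looks like this (merge it into your existing kubelet configuration rather than replacing it):

```yaml
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
featureGates:
  KubeletEnsureSecretPulledImages: true   # alpha in v1.33; off by default
```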
|
||||
You can learn more about the feature and additional optional configuration on the
|
||||
[concept page for Images](/docs/concepts/containers/images/#ensureimagepullcredentialverification)
|
||||
in the official Kubernetes documentation.
|
||||
|
||||
## What's next?
|
||||
|
||||
In future releases we are going to:
|
||||
1. Make this feature work together with [Projected service account tokens for Kubelet image credential providers](https://kep.k8s.io/4412) which adds a new, workload-specific source of image pull credentials.
|
||||
1. Write a benchmarking suite to measure the performance of this feature and assess the impact of
|
||||
any future changes.
|
||||
1. Implement an in-memory caching layer so that we don't need to read files for each image
|
||||
pull request.
|
||||
1. Add support for credential expirations, thus forcing previously validated credentials to
|
||||
be re-authenticated.
|
||||
|
||||
## How to get involved
|
||||
|
||||
[Reading KEP-2535](https://kep.k8s.io/2535) is a great way to understand these changes in depth.
|
||||
|
||||
If you are interested in further involvement, reach out to us on the [#sig-auth-authenticators-dev](https://kubernetes.slack.com/archives/C04UMAUC4UA) channel
|
||||
on Kubernetes Slack (for an invitation, visit [https://slack.k8s.io/](https://slack.k8s.io/)).
|
||||
You are also welcome to join the bi-weekly [SIG Auth meetings](https://github.com/kubernetes/community/blob/master/sig-auth/README.md#meetings),
|
||||
held every other Wednesday.
|
|
@ -0,0 +1,107 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: Job's Backoff Limit Per Index Goes GA"
|
||||
date: 2025-05-13T10:30:00-08:00
|
||||
slug: kubernetes-v1-33-jobs-backoff-limit-per-index-goes-ga
|
||||
author: >
|
||||
[Michał Woźniak](https://github.com/mimowo) (Google)
|
||||
---
|
||||
|
||||
In Kubernetes v1.33, the _Backoff Limit Per Index_ feature reaches general
|
||||
availability (GA). This blog describes the Backoff Limit Per Index feature and
|
||||
its benefits.
|
||||
|
||||
## About backoff limit per index
|
||||
|
||||
When you run workloads on Kubernetes, you must consider scenarios where Pod
|
||||
failures can affect the completion of your workloads. Ideally, your workload
|
||||
should tolerate transient failures and continue running.
|
||||
|
||||
To achieve failure tolerance in a Kubernetes Job, you can set the
|
||||
`spec.backoffLimit` field. This field specifies the total number of tolerated
|
||||
failures.
|
||||
|
||||
However, for workloads where every index is considered independent, like
|
||||
[embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel)
|
||||
workloads - the `spec.backoffLimit` field is often not flexible enough.
|
||||
For example, you may choose to run multiple suites of integration tests by
|
||||
representing each suite as an index within an [Indexed Job](/docs/tasks/job/indexed-parallel-processing-static/).
|
||||
In that setup, a fast-failing index (test suite) is likely to consume your
|
||||
entire budget for tolerating Pod failures, and you might not be able to run the
|
||||
other indexes.
|
||||
|
||||
In order to address this limitation, Kubernetes introduced _backoff limit per index_,
|
||||
which allows you to control the number of retries per index.
|
||||
|
||||
## How backoff limit per index works
|
||||
|
||||
To use Backoff Limit Per Index for Indexed Jobs, specify the number of tolerated
|
||||
Pod failures per index with the `spec.backoffLimitPerIndex` field. When you set
|
||||
this field, the Job executes all indexes by default.
|
||||
|
||||
Additionally, to fine-tune the error handling:
|
||||
* Specify the cap on the total number of failed indexes by setting the
|
||||
`spec.maxFailedIndexes` field. When the limit is exceeded, the entire Job is
|
||||
terminated.
|
||||
* Define a short-circuit to detect a failed index by using the `FailIndex` action in the
|
||||
[Pod Failure Policy](/docs/concepts/workloads/controllers/job/#pod-failure-policy)
|
||||
mechanism.
|
||||
|
||||
When the number of tolerated failures is exceeded, the Job marks that index as
|
||||
failed and lists it in the Job's `status.failedIndexes` field.
|
||||
|
||||
### Example
|
||||
|
||||
The following Job spec snippet is an example of how to combine backoff limit per
|
||||
index with the _Pod Failure Policy_ feature:
|
||||
|
||||
```yaml
|
||||
completions: 10
|
||||
parallelism: 10
|
||||
completionMode: Indexed
|
||||
backoffLimitPerIndex: 1
|
||||
maxFailedIndexes: 5
|
||||
podFailurePolicy:
|
||||
rules:
|
||||
- action: Ignore
|
||||
onPodConditions:
|
||||
- type: DisruptionTarget
|
||||
- action: FailIndex
|
||||
onExitCodes:
|
||||
operator: In
|
||||
values: [ 42 ]
|
||||
```
|
||||
|
||||
In this example, the Job handles Pod failures as follows:
|
||||
|
||||
- Ignores any failed Pods that have the built-in
|
||||
[disruption condition](/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions),
|
||||
called `DisruptionTarget`. These Pods don't count towards Job backoff limits.
|
||||
- Fails the index corresponding to the failed Pod if any of the failed Pod's
|
||||
containers finished with the exit code 42 - based on the matching "FailIndex"
|
||||
rule.
|
||||
- Retries the first failure of any index, unless the index failed due to the
|
||||
matching `FailIndex` rule.
|
||||
- Fails the entire Job if the number of failed indexes exceeded 5 (set by the
|
||||
`spec.maxFailedIndexes` field).
|
||||
|
||||
## Learn more
|
||||
|
||||
- Read the blog post on the closely related feature of Pod Failure Policy [Kubernetes 1.31: Pod Failure Policy for Jobs Goes GA](/blog/2024/08/19/kubernetes-1-31-pod-failure-policy-for-jobs-goes-ga/)
|
||||
- For a hands-on guide to using Pod failure policy, including the use of FailIndex, see
|
||||
[Handling retriable and non-retriable pod failures with Pod failure policy](/docs/tasks/job/pod-failure-policy/)
|
||||
- Read the documentation for
|
||||
[Backoff limit per index](/docs/concepts/workloads/controllers/job/#backoff-limit-per-index) and
|
||||
[Pod failure policy](/docs/concepts/workloads/controllers/job/#pod-failure-policy)
|
||||
- Read the KEP for the [Backoff Limits Per Index For Indexed Jobs](https://github.com/kubernetes/enhancements/tree/master/keps/sig-apps/3850-backoff-limits-per-index-for-indexed-jobs)
|
||||
|
||||
## Get involved
|
||||
|
||||
This work was sponsored by the Kubernetes
|
||||
[batch working group](https://github.com/kubernetes/community/tree/master/wg-batch)
|
||||
in close collaboration with the
|
||||
[SIG Apps](https://github.com/kubernetes/community/tree/master/sig-apps) community.
|
||||
|
||||
If you are interested in working on new features in the space we recommend
|
||||
subscribing to our [Slack](https://kubernetes.slack.com/messages/wg-batch)
|
||||
channel and attending the regular community meetings.
|
|
@ -0,0 +1,77 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes v1.33: Updates to Container Lifecycle"
|
||||
date: 2025-05-14T10:30:00-08:00
|
||||
slug: kubernetes-v1-33-updates-to-container-lifecycle
|
||||
author: >
|
||||
Sreeram Venkitesh (DigitalOcean)
|
||||
---
|
||||
|
||||
Kubernetes v1.33 introduces a few updates to the lifecycle of containers. The Sleep action for container lifecycle hooks now supports a zero sleep duration (feature enabled by default).
|
||||
There is also alpha support for customizing the stop signal sent to containers when they are being terminated.
|
||||
|
||||
This blog post goes into the details of these new aspects of the container lifecycle, and how you can use them.
|
||||
|
||||
## Zero value for Sleep action
|
||||
|
||||
Kubernetes v1.29 introduced the `Sleep` action for container PreStop and PostStart Lifecycle hooks. The Sleep action lets your containers pause for a specified duration after the container is started or before it is terminated. This was needed to provide a straightforward way to manage graceful shutdowns. Before the Sleep action, folks used to run the `sleep` command using the exec action in their container lifecycle hooks. If you wanted to do this, you'd need to have the binary for the `sleep` command in your container image. This is difficult if you're using third-party images.
|
||||
|
||||
When the Sleep action was initially added, it didn't support a sleep duration of zero seconds. The `time.Sleep` call that the Sleep action uses under the hood does support a duration of zero seconds: using a negative or a zero value for the sleep returns immediately, resulting in a no-op. We wanted the same behaviour for the Sleep action. Support for a zero duration was later added in v1.32, behind the `PodLifecycleSleepActionAllowZero` feature gate.
|
||||
|
||||
The `PodLifecycleSleepActionAllowZero` feature gate has graduated to beta in v1.33, and is now enabled by default.
|
||||
The original Sleep action for `preStop` and `postStart` hooks has been enabled by default starting from Kubernetes v1.30.
|
||||
With a cluster running Kubernetes v1.33, you are able to set a
|
||||
zero duration for sleep lifecycle hooks. For a cluster with default configuration, you don't need
|
||||
to enable any feature gate to make that possible.
|
||||
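For example, the following Pod declares a `preStop` hook that sleeps for zero seconds, which is accepted on a default-configured v1.33 cluster and is effectively a no-op (the Pod name and image are placeholders):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: zero-sleep-demo
spec:
  containers:
    - name: app
      image: nginx:latest
      lifecycle:
        preStop:
          sleep:
            seconds: 0   # returns immediately; effectively a no-op hook
```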
|
||||
## Container stop signals
|
||||
|
||||
Container runtimes such as containerd and CRI-O honor a `StopSignal` instruction in the container image definition. This can be used to specify a custom stop signal
|
||||
that the runtime will use to terminate containers based on that image.
|
||||
Stop signal configuration was not originally part of the Pod API in Kubernetes.
|
||||
Until Kubernetes v1.33, the only way to override the stop signal for containers was by rebuilding your container image with the new custom stop signal
|
||||
(for example, specifying `STOPSIGNAL` in a `Containerfile` or `Dockerfile`).
|
||||
|
||||
The `ContainerStopSignals` feature gate which is newly added in Kubernetes v1.33 adds stop signals to the Kubernetes API. This allows users to specify a custom stop signal in the container spec. Stop signals are added to the API as a new lifecycle along with the existing PreStop and PostStart lifecycle handlers. In order to use this feature, we expect the Pod to have the operating system specified with `spec.os.name`. This is enforced so that we can cross-validate the stop signal against the operating system and make sure that the containers in the Pod are created with a valid stop signal for the operating system the Pod is being scheduled to. For Pods scheduled on Windows nodes, only `SIGTERM` and `SIGKILL` are allowed as valid stop signals. Find the full list of signals supported in Linux nodes [here](https://github.com/kubernetes/kubernetes/blob/master/staging/src/k8s.io/api/core/v1/types.go#L2985-L3053).
|
||||
|
||||
### Default behaviour
|
||||
|
||||
If a container has a custom stop signal defined in its lifecycle, the container runtime would use the signal defined in the lifecycle to kill the container, given that the container runtime also supports custom stop signals. If there is no custom stop signal defined in the container lifecycle, the runtime would fall back to the stop signal defined in the container image. If there is no stop signal defined in the container image, the default stop signal of the runtime would be used. The default signal is `SIGTERM` for both containerd and CRI-O.
|
||||
|
||||
### Version skew
|
||||
|
||||
For the feature to work as intended, both Kubernetes and the container runtime should support container stop signals. The changes to the Kubernetes API and kubelet are available in alpha stage from v1.33, and can be enabled with the `ContainerStopSignals` feature gate. The container runtime implementations for containerd and CRI-O are still a work in progress and will be rolled out soon.
|
||||
|
||||
### Using container stop signals
|
||||
|
||||
To enable this feature, you need to turn on the `ContainerStopSignals` feature gate in both the kube-apiserver and the kubelet. Once you have nodes where the feature gate is turned on, you can create Pods with a StopSignal lifecycle and a valid OS name like so:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: nginx
|
||||
spec:
|
||||
os:
|
||||
name: linux
|
||||
containers:
|
||||
- name: nginx
|
||||
image: nginx:latest
|
||||
lifecycle:
|
||||
stopSignal: SIGUSR1
|
||||
```
|
||||
|
||||
Do note that the `SIGUSR1` signal in this example can only be used if the container's Pod is scheduled to a Linux node. Hence we need to specify `spec.os.name` as `linux` to be able to use the signal. You will only be able to configure `SIGTERM` and `SIGKILL` signals if the Pod is being scheduled to a Windows node. You also cannot specify `containers[*].lifecycle.stopSignal` if the `spec.os.name` field is left unset.
|
||||
|
||||
## How do I get involved?
|
||||
|
||||
This feature is driven by the [SIG Node](https://github.com/Kubernetes/community/blob/master/sig-node/README.md). If you are interested in helping develop this feature, sharing feedback, or participating in any other ongoing SIG Node projects, please reach out to us!
|
||||
|
||||
You can reach SIG Node by several means:
|
||||
- Slack: [#sig-node](https://kubernetes.slack.com/messages/sig-node)
|
||||
- [Mailing list](https://groups.google.com/forum/#!forum/kubernetes-sig-node)
|
||||
- [Open Community Issues/PRs](https://github.com/kubernetes/community/labels/sig%2Fnode)
|
||||
|
||||
You can also contact me directly:
|
||||
- GitHub: @sreeram-venkitesh
|
||||
- Slack: @sreeram.venkitesh
|
After Width: | Height: | Size: 166 KiB |
After Width: | Height: | Size: 39 KiB |
After Width: | Height: | Size: 39 KiB |
After Width: | Height: | Size: 36 KiB |
After Width: | Height: | Size: 36 KiB |
|
@ -0,0 +1,395 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Announcing etcd v3.6.0"
|
||||
date: 2025-05-15T16:00:00-08:00
|
||||
slug: announcing-etcd-3.6
|
||||
author: >
|
||||
Benjamin Wang (VMware by Broadcom)
|
||||
canonicalUrl: "https://etcd.io/blog/2025/announcing-etcd-3.6/"
|
||||
---
|
||||
|
||||
_This announcement originally [appeared](https://etcd.io/blog/2025/announcing-etcd-3.6/) on the etcd blog._
|
||||
|
||||
Today, we are releasing [etcd v3.6.0][], the first minor release since etcd v3.5.0 on June 15, 2021. This release
|
||||
introduces several new features, makes significant progress on long-standing efforts like downgrade support and
|
||||
migration to v3store, and addresses numerous critical & major issues. It also includes major optimizations in
|
||||
memory usage, improving efficiency and performance.
|
||||
|
||||
In addition to the features of v3.6.0, etcd has joined Kubernetes as a SIG (sig-etcd), enabling us to improve
|
||||
project sustainability. We've introduced systematic robustness testing to ensure correctness and reliability.
|
||||
Through the etcd-operator Working Group, we plan to improve usability as well.
|
||||
|
||||
What follows are the most significant changes introduced in etcd v3.6.0, along with the discussion of the
|
||||
roadmap for future development. For a detailed list of changes, please refer to the [CHANGELOG-3.6][].
|
||||
|
||||
A heartfelt thank you to all the contributors who made this release possible!
|
||||
|
||||
## Security
|
||||
|
||||
etcd takes security seriously. To enhance software security in v3.6.0, we have improved our workflow checks by
|
||||
integrating `govulncheck` to scan the source code and `trivy` to scan container images. These improvements
|
||||
have also been backported to supported stable releases.
|
||||
|
||||
etcd continues to follow the [Security Release Process][] to ensure vulnerabilities are properly managed and addressed.
|
||||
|
||||
## Features
|
||||
|
||||
### Migration to v3store
|
||||
|
||||
The v2store has been deprecated since etcd v3.4 but could still be enabled via `--enable-v2`. It remained the source of
|
||||
truth for membership data. In etcd v3.6.0, v2store can no longer be enabled as the `--enable-v2` flag has been removed,
|
||||
and v3store has become the sole source of truth for membership data.
|
||||
|
||||
While v2store still exists in v3.6.0, etcd will fail to start if it contains any data other than membership information.
|
||||
To assist with migration, etcd v3.5.18+ provides the `etcdutl check v2store` command, which verifies that v2store
|
||||
contains only membership data (see [PR 19113][]).
|
||||
|
||||
Compared to v2store, v3store offers better performance and transactional support. It is also the actively maintained
|
||||
storage engine moving forward.
|
||||
|
||||
The removal of v2store is still ongoing and is tracked in [issues/12913][].
|
||||
|
||||
### Downgrade
|
||||
|
||||
etcd v3.6.0 is the first version to fully support downgrade. The effort for this downgrade task spans
|
||||
both versions 3.5 and 3.6, and all related work is tracked in [issues/11716][].
|
||||
|
||||
At a high level, the process involves migrating the data schema to the target version (e.g., v3.5),
|
||||
followed by a rolling downgrade.
|
||||
|
||||
Ensure the cluster is healthy and take a snapshot backup. Then check whether a downgrade to the target version is valid:
|
||||
|
||||
```bash
|
||||
$ etcdctl downgrade validate 3.5
|
||||
Downgrade validate success, cluster version 3.6
|
||||
```
|
||||
|
||||
If the downgrade is valid, enable downgrade mode:
|
||||
|
||||
```bash
|
||||
$ etcdctl downgrade enable 3.5
|
||||
Downgrade enable success, cluster version 3.6
|
||||
```
|
||||
|
||||
etcd will then migrate the data schema in the background. Once complete, proceed with the rolling downgrade.
|
||||
|
||||
For details, refer to the [Downgrade-3.6] guide.
|
||||
|
||||
### Feature gates
|
||||
|
||||
In etcd v3.6.0, we introduced Kubernetes-style feature gates for managing new features. Previously, we
|
||||
indicated unstable features through the `--experimental` prefix in feature flag names. The prefix was removed
|
||||
once the feature was stable, causing a breaking change. Now, features will start in Alpha, progress
|
||||
to Beta, then GA, or get deprecated. This ensures a much smoother upgrade and downgrade experience for users.
|
||||
|
||||
See [feature-gates][] for details.
|
||||
|
||||
### livez / readyz checks {#livezreadyz-checks}
|
||||
|
||||
etcd now supports `/livez` and `/readyz` endpoints, aligning with Kubernetes' Liveness and Readiness probes.
|
||||
`/livez` indicates whether the etcd instance is alive, while `/readyz` indicates when it is ready to serve requests.
|
||||
This feature has also been backported to release-3.5 (starting from v3.5.11) and release-3.4 (starting from v3.4.29).
|
||||
See [livez/readyz][] for details.
|
||||
|
||||
The existing `/health` endpoint remains functional. `/livez` is similar to `/health?serializable=true`, while
|
||||
`/readyz` is similar to `/health` or `/health?serializable=false`. Clearly, the `/livez` and `/readyz`
|
||||
endpoints provide clearer semantics and are easier to understand.
|
||||
|
||||
### v3discovery
|
||||
|
||||
In etcd v3.6.0, the new discovery protocol [v3discovery][] was introduced, based on clientv3.
|
||||
It facilitates the discovery of all cluster members during the bootstrap phase.
|
||||
|
||||
The previous [v2discovery][] protocol, based on clientv2, has been deprecated. Additionally,
|
||||
the public discovery service at <https://discovery.etcd.io/>, which relied on v2discovery, is no longer maintained.
|
||||
|
||||
## Performance
|
||||
|
||||
### Memory
|
||||
|
||||
In this release, we reduced average memory consumption by at least 50% (see Figure 1). This improvement is primarily due to two changes:
|
||||
|
||||
- The default value of `--snapshot-count` has been reduced from 100,000 in v3.5 to 10,000 in v3.6. As a result, etcd v3.6 now retains only about 10% of the history records compared to v3.5.
|
||||
- Raft history is compacted more frequently, as introduced in [PR/18825][].
|
||||
|
||||
{{< figure src="figure-1.png" alt="Diagram of memory usage" >}}
|
||||
|
||||
_**Figure 1:** Memory usage comparison between etcd v3.5.20 and v3.6.0-rc.2 under different read/write ratios.
|
||||
Each subplot shows the memory usage over time with a specific read/write ratio. The red line represents etcd
|
||||
v3.5.20, while the teal line represents v3.6.0-rc.2. Across all tested ratios, v3.6.0-rc.2 exhibits lower and
|
||||
more stable memory usage._
|
||||
|
||||
### Throughput
|
||||
|
||||
Compared to v3.5, etcd v3.6 delivers an average performance improvement of approximately 10%
|
||||
in both read and write throughput (see Figure 2, 3, 4 and 5). This improvement is not attributed to
|
||||
any single major change, but rather the cumulative effect of multiple minor enhancements. One such
|
||||
example is the optimization of the free page queries introduced in [PR/419][].
|
||||
|
||||
{{< figure src="figure-2.png" alt="etcd read transaction performance with a high write ratio" >}}
|
||||
|
||||
_**Figure 2:** Read throughput comparison between etcd v3.5.20 and v3.6.0-rc.2 under a high write ratio. The
|
||||
read/write ratio is 0.0078, meaning 1 read per 128 writes. The right bar shows the percentage improvement
|
||||
in read throughput of v3.6.0-rc.2 over v3.5.20, ranging from 3.21% to 25.59%._
|
||||
|
||||
{{< figure src="figure-3.png" alt="etcd read transaction performance with a high read ratio" >}}
|
||||
|
||||
_**Figure 3:** Read throughput comparison between etcd v3.5.20 and v3.6.0-rc.2 under a high read ratio.
|
||||
The read/write ratio is 8, meaning 8 reads per write. The right bar shows the percentage improvement in
|
||||
read throughput of v3.6.0-rc.2 over v3.5.20, ranging from 4.38% to 27.20%._
|
||||
|
||||
{{< figure src="figure-4.png" alt="etcd write transaction performance with a high write ratio" >}}
|
||||
|
||||
_**Figure 4:** Write throughput comparison between etcd v3.5.20 and v3.6.0-rc.2 under a high write ratio. The
|
||||
read/write ratio is 0.0078, meaning 1 read per 128 writes. The right bar shows the percentage improvement
|
||||
in write throughput of v3.6.0-rc.2 over v3.5.20, ranging from 2.95% to 24.24%._
|
||||
|
||||
{{< figure src="figure-5.png" alt="etcd write transaction performance with a high read ratio" >}}
|
||||
|
||||
_**Figure 5:** Write throughput comparison between etcd v3.5.20 and v3.6.0-rc.2 under a high read ratio.
|
||||
The read/write ratio is 8, meaning 8 reads per write. The right bar shows the percentage improvement in
|
||||
write throughput of v3.6.0-rc.2 over v3.5.20, ranging from 3.86% to 28.37%._
|
||||
|
||||
## Breaking changes
|
||||
|
||||
This section highlights a few notable breaking changes. For a complete list, please refer to
|
||||
the [Upgrade etcd from v3.5 to v3.6][] and the [CHANGELOG-3.6][].
|
||||
|
||||
### Old binaries are incompatible with new schema versions
|
||||
|
||||
Old etcd binaries are not compatible with newer data schema versions. For example, etcd 3.5 cannot start with
|
||||
data created by etcd 3.6, and etcd 3.4 cannot start with data created by either 3.5 or 3.6.
|
||||
|
||||
When downgrading etcd, it's important to follow the documented downgrade procedure. Simply replacing
|
||||
the binary or image will result in incompatibility issues.
|
||||
|
||||
### Peer endpoints no longer serve client requests
|
||||
|
||||
Client endpoints (`--advertise-client-urls`) are intended to serve client requests only, while peer
|
||||
endpoints (`--initial-advertise-peer-urls`) are intended solely for peer communication. However, due to an implementation
|
||||
oversight, the peer endpoints were also able to handle client requests in etcd 3.4 and 3.5. This behavior was misleading and
|
||||
encouraged incorrect usage patterns. In etcd 3.6, this misleading behavior was corrected via [PR/13565][]; peer endpoints
|
||||
no longer serve client requests.
|
||||
|
||||
### Clear boundary between etcdctl and etcdutl
|
||||
|
||||
Both `etcdctl` and `etcdutl` are command line tools. `etcdutl` is an offline utility designed to operate directly on
|
||||
etcd data files, while `etcdctl` is an online tool that interacts with etcd over a network. Previously, there were some
|
||||
overlapping functionalities between the two, but these overlaps were removed in 3.6.0.
|
||||
|
||||
- Removed `etcdctl defrag --data-dir`
|
||||
|
||||
The `etcdctl defrag` command only supports online defragmentation and no longer supports offline defragmentation.
|
||||
To perform offline defragmentation, use the `etcdutl defrag --data-dir` command instead.
|
||||
|
||||
- Removed `etcdctl snapshot status`
|
||||
|
||||
`etcdctl` no longer supports retrieving the status of a snapshot. Use the `etcdutl snapshot status` command instead.
|
||||
|
||||
- Removed `etcdctl snapshot restore`
|
||||
|
||||
`etcdctl` no longer supports restoring from a snapshot. Use the `etcdutl snapshot restore` command instead.
|
||||
|
||||
## Critical bug fixes
|
||||
|
||||
Correctness has always been a top priority for the etcd project. In the process of developing 3.6.0, we found and
|
||||
fixed a few notable bugs that could lead to data inconsistency in specific cases. These fixes have been backported
|
||||
to previous releases, but we believe they deserve special mention here.
|
||||
|
||||
- Data Inconsistency when Crashing Under Load
|
||||
|
||||
Previously, when etcd was applying data, it would update the consistent-index first, followed by committing the
|
||||
data. However, these operations were not atomic. If etcd crashed in between, it could lead to data inconsistency
|
||||
(see [issue/13766][]). The issue was introduced in v3.5.0, and fixed in v3.5.3 with [PR/13854][].
|
||||
|
||||
- Durability API guarantee broken in single node cluster
|
||||
|
||||
When a client writes data and receives a success response, the data is expected to be persisted. However, the data might
|
||||
be lost if etcd crashes immediately after sending the success response to the client. This was a legacy issue (see [issue/14370][])
|
||||
affecting all previous releases. It was addressed in v3.4.21 and v3.5.5 with [PR/14400][], and fixed on the raft side in
|
||||
main branch (now release-3.6) with [PR/14413][].
|
||||
|
||||
- Revision Inconsistency when Crashing During Defragmentation
|
||||
|
||||
If etcd crashed during the defragmentation operation, upon restart, it might reapply
|
||||
some entries which had already been applied, leading to the revision inconsistency issue
|
||||
(see the discussions in [PR/14685][]). The issue was introduced in v3.5.0, and fixed in v3.5.6 with [PR/14730][].
|
||||
|
||||
## Upgrade issue
|
||||
|
||||
This section highlights a common issue [issues/19557][] in the etcd v3.5 to v3.6 upgrade that may cause the upgrade
|
||||
process to fail. For a complete upgrade guide, refer to [Upgrade etcd from v3.5 to v3.6][].
|
||||
|
||||
The issue was introduced in etcd v3.5.1, and resolved in v3.5.20.
|
||||
|
||||
**Key takeaway**: users are required to first upgrade to etcd v3.5.20 (or a higher patch version) before upgrading
|
||||
to etcd v3.6.0; otherwise, the upgrade may fail.
|
||||
|
||||
For more background and technical context, see [upgrade_from_3.5_to_3.6_issue][].
|
||||
|
||||
## Testing
|
||||
|
||||
We introduced the [Robustness testing][] to verify correctness, which has always been our top priority.
|
||||
It plays traffic of various types and volumes against an etcd cluster, concurrently injects a random
|
||||
failpoint, records all operations (including both requests and responses), and finally performs a
|
||||
linearizability check. It also verifies that the [Watch APIs][] guarantees have not been violated.
|
||||
The robustness test increases our confidence in ensuring the quality of each etcd release.
|
||||
|
||||
We have migrated most of the etcd workflow tests to Kubernetes' Prow testing infrastructure to
|
||||
take advantage of its benefits, such as nice dashboards for viewing test results and the ability
|
||||
for contributors to rerun failed tests themselves.
|
||||
|
||||
## Platforms
|
||||
|
||||
While retaining all existing supported platforms, we have promoted Linux/ARM64 to Tier 1 support.
|
||||
For more details, please refer to [issues/15951][]. For the complete list of supported platforms,
|
||||
see [supported-platform][].
|
||||
|
||||
## Dependencies
|
||||
|
||||
### Dependency bumping guide
|
||||
|
||||
We have published an official guide on how to bump dependencies for etcd’s main branch and stable releases.
|
||||
It also covers how to update the Go version. For more details, please refer to [dependency_management][].
|
||||
With this guide available, any contributor can now help with dependency upgrades.
|
||||
|
||||
### Core Dependency Updates
|
||||
|
||||
[bbolt][] and [raft][] are two core dependencies of etcd.
|
||||
|
||||
Both etcd v3.4 and v3.5 depend on bbolt v1.3, while etcd v3.6 depends on bbolt v1.4.
|
||||
|
||||
For the release-3.4 and release-3.5 branches, raft is included in the etcd repository itself, so etcd v3.4 and v3.5
|
||||
do not depend on an external raft module. Starting from etcd v3.6, raft was moved to a separate repository ([raft][]),
|
||||
and the first standalone raft release is v3.6.0. As a result, etcd v3.6.0 depends on raft v3.6.0.
|
||||
|
||||
Please see the table below for a summary:
|
||||
|
||||
| etcd versions | bbolt versions | raft versions |
|
||||
|---------------|----------------|---------------|
|
||||
| 3.4.x | v1.3.x | N/A |
|
||||
| 3.5.x | v1.3.x | N/A |
|
||||
| 3.6.x | v1.4.x | v3.6.x |
|
||||
|
||||
### grpc-gateway@v2
|
||||
|
||||
We upgraded [grpc-gateway][] from v1 to v2 via [PR/16595][] in etcd v3.6.0. This is a major step toward
|
||||
migrating to [protobuf-go][], the second major version of the Go protocol buffer API implementation.
|
||||
|
||||
grpc-gateway@v2 is designed to work with [protobuf-go][]. However, etcd v3.6 still depends on the deprecated
|
||||
[gogo/protobuf][], which is actually protocol buffer v1 implementation. To resolve this incompatibility,
|
||||
we applied a [patch][] to the generated *.pb.gw.go files to convert v1 messages to v2 messages.
|
||||
|
||||
### grpc-ecosystem/go-grpc-middleware/providers/prometheus
|
||||
|
||||
We switched from the deprecated (and archived) [grpc-ecosystem/go-grpc-prometheus][] to
|
||||
[grpc-ecosystem/go-grpc-middleware/providers/prometheus][] via [PR/19195][]. This change ensures continued
|
||||
support and access to the latest features and improvements in the gRPC Prometheus integration.
|
||||
|
||||
## Community
|
||||
|
||||
There are exciting developments in the etcd community that reflect our ongoing commitment
|
||||
to strengthening collaboration, improving maintainability, and evolving the project’s governance.
|
||||
|
||||
### etcd Becomes a Kubernetes SIG
|
||||
|
||||
etcd has officially become a Kubernetes Special Interest Group: SIG-etcd. This change reflects
|
||||
etcd’s critical role as the primary datastore for Kubernetes and establishes a more structured
|
||||
and transparent home for long-term stewardship and cross-project collaboration. The new SIG
|
||||
designation will help streamline decision-making, align roadmaps with Kubernetes needs,
|
||||
and attract broader community involvement.
|
||||
|
||||
### New contributors, maintainers, and reviewers
|
||||
|
||||
We’ve seen increasing engagement from contributors, which has resulted in the addition of three new maintainers:
|
||||
|
||||
- [fuweid][]
|
||||
- [jmhbnz][]
|
||||
- [wenjiaswe][]
|
||||
|
||||
Their continued contributions have been instrumental in driving the project forward.
|
||||
|
||||
We also welcome two new reviewers to the project:
|
||||
|
||||
- [ivanvc][]
|
||||
- [siyuanfoundation][]
|
||||
|
||||
We appreciate their dedication to code quality and their willingness to take on broader review responsibilities
|
||||
within the community.
|
||||
|
||||
### New release team
|
||||
|
||||
We've formed a new release team led by [ivanvc][] and [jmhbnz][], streamlining the release process by automating
|
||||
many previously manual steps. Inspired by Kubernetes SIG Release, we've adopted several best practices, including
|
||||
clearly defined release team roles and the introduction of release shadows to support knowledge sharing and team
|
||||
sustainability. These changes have made our releases smoother and more reliable, allowing us to approach each
|
||||
release with greater confidence and consistency.
|
||||
|
||||
### Introducing the etcd Operator Working Group
|
||||
|
||||
To further advance etcd’s operational excellence, we have formed a new working group: [WG-etcd-operator][].
|
||||
The working group is dedicated to enabling the automatic and efficient operation of etcd clusters that run in
|
||||
the Kubernetes environment using an etcd-operator.
|
||||
|
||||
## Future Development
|
||||
|
||||
The legacy v2store has been deprecated since etcd v3.4, and the flag `--enable-v2` was removed entirely in v3.6.
|
||||
This means that starting from v3.6, there is no longer a way to enable or use the v2store. However, etcd still
|
||||
bootstraps internally from the legacy v2 snapshots. To address this inconsistency, we plan to change etcd to
|
||||
bootstrap from the v3store and replay the WAL entries based on the `consistent-index`. The work is being tracked
|
||||
in [issues/12913].
|
||||
|
||||
One of the most persistent challenges remains large range queries from the kube-apiserver, which can
|
||||
lead to process crashes due to their unpredictable nature. The range stream feature, originally outlined in
|
||||
the [v3.5 release blog/Future roadmaps][], remains an idea worth revisiting to address the challenges of large
|
||||
range queries.
|
||||
|
||||
For more details and upcoming plans, please refer to the [etcd roadmap][].
|
||||
|
||||
[etcd v3.6.0]: https://github.com/etcd-io/etcd/releases/tag/v3.6.0
|
||||
[CHANGELOG-3.6]: https://github.com/etcd-io/etcd/blob/main/CHANGELOG/CHANGELOG-3.6.md
|
||||
[Security Release Process]: https://github.com/etcd-io/etcd/blob/main/security/security-release-process.md
|
||||
[PR 19113]: https://github.com/etcd-io/etcd/pull/19113
|
||||
[issues/12913]: https://github.com/etcd-io/etcd/issues/12913
|
||||
[issues/11716]: https://github.com/etcd-io/etcd/issues/11716
|
||||
[Downgrade-3.6]: https://etcd.io/docs/v3.6/downgrades/downgrade_3_6/
|
||||
[feature-gates]: https://etcd.io/docs/v3.6/feature-gates/
|
||||
[livez/readyz]: https://etcd.io/docs/v3.6/op-guide/monitoring/
|
||||
[v3discovery]: https://etcd.io/docs/v3.6/dev-internal/discovery_protocol/
|
||||
[v2discovery]: https://etcd.io/docs/v3.5/dev-internal/discovery_protocol/
|
||||
[Upgrade etcd from v3.5 to v3.6]: https://etcd.io/docs/v3.6/upgrades/upgrade_3_6/
|
||||
[PR/13565]: https://github.com/etcd-io/etcd/pull/13565
|
||||
[issue/13766]: https://github.com/etcd-io/etcd/issues/13766
|
||||
[PR/13854]: https://github.com/etcd-io/etcd/pull/13854
|
||||
[issue/14370]: https://github.com/etcd-io/etcd/issues/14370
|
||||
[PR/14400]: https://github.com/etcd-io/etcd/pull/14400
|
||||
[PR/14413]: https://github.com/etcd-io/etcd/pull/14413
|
||||
[PR/14685]: https://github.com/etcd-io/etcd/pull/14685
|
||||
[PR/14730]: https://github.com/etcd-io/etcd/pull/14730
|
||||
[PR/18825]: https://github.com/etcd-io/etcd/pull/18825
|
||||
[PR/419]: https://github.com/etcd-io/bbolt/pull/419
|
||||
[Robustness testing]: https://github.com/etcd-io/etcd/tree/main/tests/robustness
|
||||
[Watch APIs]: https://etcd.io/docs/v3.5/learning/api_guarantees/#watch-apis
|
||||
[issues/15951]: https://github.com/etcd-io/etcd/issues/15951
|
||||
[supported-platform]: https://etcd.io/docs/v3.6/op-guide/supported-platform/
|
||||
[dependency_management]: https://github.com/etcd-io/etcd/blob/main/Documentation/contributor-guide/dependency_management.md
|
||||
[bbolt]: https://github.com/etcd-io/bbolt
|
||||
[raft]: https://github.com/etcd-io/raft
|
||||
[grpc-gateway]: https://github.com/grpc-ecosystem/grpc-gateway
|
||||
[PR/16595]: https://github.com/etcd-io/etcd/pull/16595
|
||||
[protobuf-go]: https://github.com/protocolbuffers/protobuf-go
|
||||
[gogo/protobuf]: https://github.com/gogo/protobuf
|
||||
[patch]: https://github.com/etcd-io/etcd/blob/158b9e0d468d310c3edf4cf13f2458c51b0406fa/scripts/genproto.sh#L151-L184
|
||||
[grpc-ecosystem/go-grpc-prometheus]: https://github.com/grpc-ecosystem/go-grpc-prometheus
|
||||
[grpc-ecosystem/go-grpc-middleware/providers/prometheus]: https://github.com/grpc-ecosystem/go-grpc-middleware/tree/main/providers/prometheus
|
||||
[PR/19195]: https://github.com/etcd-io/etcd/pull/19195
|
||||
[issues/19557]: https://github.com/etcd-io/etcd/issues/19557
|
||||
[upgrade_from_3.5_to_3.6_issue]: https://etcd.io/blog/2025/upgrade_from_3.5_to_3.6_issue/
|
||||
[WG-etcd-operator]: https://github.com/kubernetes/community/tree/master/wg-etcd-operator
|
||||
[v3.5 release blog/Future roadmaps]: https://etcd.io/blog/2021/announcing-etcd-3.5/#future-roadmaps
|
||||
[etcd roadmap]: https://github.com/etcd-io/etcd/blob/main/Documentation/contributor-guide/roadmap.md
|
||||
[fuweid]: https://github.com/fuweid
|
||||
[jmhbnz]: https://github.com/jmhbnz
|
||||
[wenjiaswe]: https://github.com/wenjiaswe
|
||||
[ivanvc]: https://github.com/ivanvc
|
||||
[siyuanfoundation]: https://github.com/siyuanfoundation
|
|
@ -0,0 +1,83 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Kubernetes 1.33: Job's SuccessPolicy Goes GA"
|
||||
date: 2025-05-15T10:30:00-08:00
|
||||
slug: kubernetes-1-33-jobs-success-policy-goes-ga
|
||||
authors: >
|
||||
[Yuki Iwai](https://github.com/tenzen-y) (CyberAgent, Inc)
|
||||
---
|
||||
|
||||
On behalf of the Kubernetes project, I'm pleased to announce that Job _success policy_ has graduated to General Availability (GA) as part of the v1.33 release.
|
||||
|
||||
## About Job's Success Policy
|
||||
|
||||
In batch workloads, you might want to use leader-follower patterns like [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface),
|
||||
in which the leader controls the execution, including the followers' lifecycle.
|
||||
|
||||
In this case, you might want to mark the Job as succeeded
|
||||
even if some of the indexes failed. Unfortunately, a leader-follower Kubernetes Job that didn't use a success policy would, in most cases, require **all** Pods to finish successfully
|
||||
for that Job to reach an overall succeeded state.
|
||||
|
||||
For Kubernetes Jobs, the API allows you to specify the early exit criteria using the `.spec.successPolicy`
|
||||
field (you can only use the `.spec.successPolicy` field for an [indexed Job](/docs/concepts/workloads/controllers/job/#completion-mode)).
|
||||
This field describes a set of rules, either using a list of succeeded indexes for a Job, or defining a minimal required number of succeeded indexes.
|
||||
|
||||
This newly stable field is especially valuable for scientific simulation, AI/ML and High-Performance Computing (HPC) batch workloads.
|
||||
Users in these areas often run numerous experiments and may only need a specific number to complete successfully, rather than requiring all of them to succeed.
|
||||
In this case, the leader index failure is the only relevant Job exit criterion, and the outcomes for individual follower Pods are handled
|
||||
only indirectly via the status of the leader index.
|
||||
Moreover, followers do not know when they can terminate themselves.
|
||||
|
||||
After a Job meets any rule of its __success policy__, the Job is marked as succeeded, and all Pods are terminated, including the running ones.
|
||||
|
||||
## How it works
|
||||
|
||||
The following excerpt from a Job manifest, using `.successPolicy.rules[0].succeededCount`, shows an example of
|
||||
using a custom success policy:
|
||||
|
||||
```yaml
|
||||
parallelism: 10
|
||||
completions: 10
|
||||
completionMode: Indexed
|
||||
successPolicy:
|
||||
rules:
|
||||
- succeededCount: 1
|
||||
```
|
||||
|
||||
Here, the Job is marked as succeeded as soon as any one index succeeds, regardless of which index it is.
|
||||
Additionally, you can constrain which indexes count toward `succeededCount` by also specifying `.successPolicy.rules[0].succeededIndexes`,
|
||||
as shown below:
|
||||
|
||||
```yaml
|
||||
parallelism: 10
|
||||
completions: 10
|
||||
completionMode: Indexed
|
||||
successPolicy:
|
||||
rules:
|
||||
- succeededIndexes: 0 # index of the leader Pod
|
||||
succeededCount: 1
|
||||
```
|
||||
|
||||
This example shows that the Job will be marked as succeeded once a Pod with a specific index (Pod index 0) has succeeded.
|
||||
|
||||
Once the Job either reaches one of the `successPolicy` rules, or achieves its `Complete` criteria based on `.spec.completions`,
|
||||
the Job controller within kube-controller-manager adds the `SuccessCriteriaMet` condition to the Job status.
|
||||
After that, the job-controller initiates cleanup and termination of Pods for Jobs with the `SuccessCriteriaMet` condition.
|
||||
Eventually, the Job obtains the `Complete` condition once the job-controller has finished cleanup and termination.
|
||||
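If you want to see this progression yourself, one way (assuming a hypothetical Job named `myjob` created from a manifest like the ones above) is to inspect the conditions in the Job status:

```console
# Print each condition type and its status; SuccessCriteriaMet should appear before Complete
kubectl get job myjob -o jsonpath='{range .status.conditions[*]}{.type}={.status}{"\n"}{end}'
```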
|
||||
## Learn more
|
||||
|
||||
- Read the documentation for
|
||||
[success policy](/docs/concepts/workloads/controllers/job/#success-policy).
|
||||
- Read the KEP for the [Job success/completion policy](https://github.com/kubernetes/enhancements/tree/master/keps/sig-apps/3998-job-success-completion-policy).
|
||||
|
||||
## Get involved
|
||||
|
||||
This work was led by the Kubernetes
|
||||
[batch working group](https://github.com/kubernetes/community/tree/master/wg-batch)
|
||||
in close collaboration with the
|
||||
[SIG Apps](https://github.com/kubernetes/community/tree/master/sig-apps) community.
|
||||
|
||||
If you are interested in working on new features in this space, I recommend
|
||||
subscribing to our [Slack](https://kubernetes.slack.com/messages/wg-batch)
|
||||
channel and attending the regular community meetings.
|
|
@ -1,13 +1,12 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "In-Place Pod Resize Graduating to Beta"
|
||||
slug: in-place-pod-resize-beta
|
||||
draft: true
|
||||
date: XXXX-XX-XX
|
||||
title: "Kubernetes v1.33: In-Place Pod Resize Graduated to Beta"
|
||||
slug: kubernetes-v1-33-in-place-pod-resize-beta
|
||||
date: 2025-05-16T10:30:00-08:00
|
||||
author: "Tim Allclair (Google)"
|
||||
---
|
||||
|
||||
On behalf of the Kubernetes project, I am excited to announce that the **in-place Pod resize** feature (also known as In-Place Pod Vertical Scaling), first introduced as alpha in Kubernetes v1.27, is graduating to **Beta** and will be enabled by default in the Kubernetes v1.33 release! This marks a significant milestone in making resource management for Kubernetes workloads more flexible and less disruptive.
|
||||
On behalf of the Kubernetes project, I am excited to announce that the **in-place Pod resize** feature (also known as In-Place Pod Vertical Scaling), first introduced as alpha in Kubernetes v1.27, has graduated to **Beta** and will be enabled by default in the Kubernetes v1.33 release! This marks a significant milestone in making resource management for Kubernetes workloads more flexible and less disruptive.
|
||||
|
||||
## What is in-place Pod resize?
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Spotlight on Policy Working Group"
|
||||
slug: wg-policy-spotlight-2025
|
||||
draft: true
|
||||
date: 2025-05-22
|
||||
author: "Arujjwal Negi"
|
||||
---
|
||||
|
||||
In the complex world of Kubernetes, policies play a crucial role in managing and securing clusters. But have you ever wondered how these policies are developed, implemented, and standardized across the Kubernetes ecosystem? To answer that, let's put the spotlight on the Policy Working Group.
|
||||
|
||||
The Policy Working Group is dedicated to a critical mission: providing an overall architecture that encompasses both current policy-related implementations and future policy proposals in Kubernetes. Their goal is both ambitious and essential: to develop a universal policy architecture that benefits developers and end-users alike.
|
||||
|
||||
Through collaborative methods, this working group is striving to bring clarity and consistency to the often complex world of Kubernetes policies. By focusing on both existing implementations and future proposals, they're working to ensure that the policy landscape in Kubernetes remains coherent and accessible as the technology evolves.
|
||||
|
||||
In this blog post, I'll dive deeper into the work of the Policy Working Group, guided by insights from its co-chairs:
|
||||
|
||||
- [Jim Bugwadia](https://twitter.com/JimBugwadia)
|
||||
- [Poonam Lamba](https://twitter.com/poonam-lamba)
|
||||
- [Andy Suderman](https://twitter.com/sudermanjr)
|
||||
|
||||
_Interviewed by [Arujjwal Negi](https://twitter.com/arujjval)._
|
||||
|
||||
These co-chairs will explain what the Policy working group is all about.
|
||||
|
||||
## Introduction
|
||||
|
||||
**Hello, thank you for the time! Let’s start with some introductions, could you tell us a bit about yourself, your role, and how you got involved in Kubernetes?**
|
||||
|
||||
**Jim Bugwadia**: My name is Jim Bugwadia, and I am a co-founder and the CEO at Nirmata which provides solutions that automate security and compliance for cloud-native workloads. At Nirmata, we have been working with Kubernetes since it started in 2014. We initially built a Kubernetes policy engine in our commercial platform and later donated it to CNCF as the Kyverno project. I joined the CNCF Kubernetes Policy Working Group to help build and standardize various aspects of policy management for Kubernetes and later became a co-chair.
|
||||
|
||||
**Andy Suderman**: My name is Andy Suderman and I am the CTO of Fairwinds, a managed Kubernetes-as-a-Service provider. I began working with Kubernetes in 2016 building a web conferencing platform. I am an author and/or maintainer of several Kubernetes-related open-source projects such as Goldilocks, Pluto, and Polaris. Polaris is a JSON-schema-based policy engine, which started Fairwinds' journey into the policy space and my involvement in the Policy Working Group.
|
||||
|
||||
**Poonam Lamba**: My name is Poonam Lamba, and I currently work as a Product Manager for Google Kubernetes Engine (GKE) at Google. My journey with Kubernetes began back in 2017 when I was building an SRE platform for a large enterprise, using a private cloud built on Kubernetes. Intrigued by its potential to revolutionize the way we deployed and managed applications at the time, I dove headfirst into learning everything I could about it. Since then, I've had the opportunity to build the policy and compliance products for GKE. I lead and contribute to GKE CIS benchmarks, and I am also involved with the Gatekeeper project. I have contributed to the Policy WG for over 2 years, and I currently serve as a co-chair of the Kubernetes Policy WG.
|
||||
|
||||
*Responses to the following questions are presented as an amalgamation of answers from the co-chairs.*
|
||||
|
||||
## About Working Groups
|
||||
|
||||
**One thing even I am not aware of is the difference between a working group and a SIG. Can you help us understand what a working group is and how it is different from a SIG?**
|
||||
|
||||
Unlike SIGs, working groups are temporary and focused on tackling specific, cross-cutting issues or projects that may involve multiple SIGs. Their lifespan is defined, and they disband once they've achieved their objective. Generally, working groups don't own code or have long-term responsibility for managing a particular area of the Kubernetes project.
|
||||
|
||||
(To know more about SIGs, visit the [list of Special Interest Groups](https://github.com/kubernetes/community/blob/master/sig-list.md))
|
||||
|
||||
**You mentioned that Working Groups involve multiple SIGs. Which SIGs are you closely involved with, and how do you coordinate with them?**
|
||||
|
||||
We have collaborated closely with Kubernetes SIG Auth throughout our existence, and more recently, we've also been working with SIG Security since its formation. Our collaboration occurs in a few ways. We provide periodic updates during the SIG meetings to keep them informed of our progress and activities. Additionally, we utilize other community forums to maintain open lines of communication and ensure our work aligns with the broader Kubernetes ecosystem. This collaborative approach helps us stay coordinated with related efforts across the Kubernetes community.
|
||||
|
||||
## Policy WG
|
||||
|
||||
**Why was the Policy Working Group created?**
|
||||
|
||||
Kubernetes is powered by a highly declarative, fine-grained, and extensible configuration management system, which enables a broad set of use cases. We've observed that a Kubernetes configuration manifest may have different portions that are important to various stakeholders. For example, some parts may be crucial for developers, while others might be of particular interest to security teams or address operational concerns. Given this complexity, we believe that policies governing the usage of these intricate configurations are essential for success with Kubernetes.
|
||||
|
||||
Our Policy Working Group was created specifically to research the standardization of policy definitions and related artifacts. We saw a need to bring consistency and clarity to how policies are defined and implemented across the Kubernetes ecosystem, given the diverse requirements and stakeholders involved in Kubernetes deployments.
|
||||
|
||||
**Can you give me an idea of the work you are doing right now?**
|
||||
|
||||
We're currently working on several Kubernetes policy-related projects. Our ongoing initiatives include:
|
||||
|
||||
- We're developing a Kubernetes Enhancement Proposal (KEP) for the Kubernetes Policy Reports API. This aims to standardize how policy reports are generated and consumed within the Kubernetes ecosystem.
|
||||
- We're conducting a CNCF survey to better understand policy usage in the Kubernetes space. This will help us gauge current practices and needs across the community.
|
||||
- We're writing a paper that will guide users in achieving PCI-DSS compliance for containers. This is intended to help organizations meet important security standards in their Kubernetes environments.
|
||||
- We're also working on a paper highlighting how shifting security down can benefit organizations. This focuses on the advantages of implementing security measures earlier in the development and deployment process.
|
||||
|
||||
|
||||
**Can you tell us about the main objectives of the Policy Working Group and some of your key accomplishments so far? Also, what are your plans for the future?**
|
||||
|
||||
The charter of the Policy WG is to help standardize policy management for Kubernetes and educate the community on best practices.
|
||||
|
||||
To accomplish this we have updated the Kubernetes documentation ([Policies | Kubernetes](https://kubernetes.io/docs/concepts/policy)), produced several whitepapers ([Kubernetes Policy Management](https://github.com/kubernetes/sig-security/blob/main/sig-security-docs/papers/policy/CNCF_Kubernetes_Policy_Management_WhitePaper_v1.pdf), [Kubernetes GRC](https://github.com/kubernetes/sig-security/blob/main/sig-security-docs/papers/policy_grc/Kubernetes_Policy_WG_Paper_v1_101123.pdf)), and created the Policy Reports API ([API reference](https://htmlpreview.github.io/?https://github.com/kubernetes-sigs/wg-policy-prototypes/blob/master/policy-report/docs/index.html)) which standardizes reporting across various tools. Several popular tools such as Falco, Trivy, Kyverno, kube-bench, and others support the Policy Report API. A major milestone for the Policy WG will be to help promote the Policy Reports API to a SIG-level API or find another stable home for it.
|
||||
|
||||
Beyond that, as [ValidatingAdmissionPolicy](https://kubernetes.io/docs/reference/access-authn-authz/validating-admission-policy/) and [MutatingAdmissionPolicy](https://kubernetes.io/docs/reference/access-authn-authz/mutating-admission-policy/) become GA in Kubernetes, we intend to guide and educate the community on the tradeoffs and appropriate usage patterns for these built-in API objects and other CNCF policy management solutions like OPA/Gatekeeper and Kyverno.
|
||||
|
||||
## Challenges
|
||||
|
||||
**What are some of the major challenges that the Policy Working Group is working on or has worked on?**
|
||||
|
||||
During our work in the Policy Working Group, we've encountered several challenges:
|
||||
|
||||
- One of the main issues we've faced is finding time to consistently contribute. Given that many of us have other professional commitments, it can be difficult to dedicate regular time to the working group's initiatives.
|
||||
|
||||
- Another challenge we've experienced is related to our consensus-driven model. While this approach ensures that all voices are heard, it can sometimes lead to slower decision-making processes. We value thorough discussion and agreement, but this can occasionally delay progress on our projects.
|
||||
|
||||
- We've also encountered occasional differences of opinion among group members. These situations require careful navigation to ensure that we maintain a collaborative and productive environment while addressing diverse viewpoints.
|
||||
|
||||
- Lastly, we've noticed that newcomers to the group may find it difficult to contribute effectively without consistent attendance at our meetings. The complex nature of our work often requires ongoing context, which can be challenging for those who aren't able to participate regularly.
|
||||
|
||||
**Can you tell me more about those challenges? How did you discover each one? What has the impact been? Do you have ideas or strategies about how to address them?**
|
||||
|
||||
There are no easy answers, but having more contributors and maintainers greatly helps! Overall the CNCF community is great to work with and is very welcoming to beginners. So, if folks out there are hesitating to get involved, I highly encourage them to attend a WG or SIG meeting and just listen in.
|
||||
|
||||
It often takes a few meetings to fully understand the discussions, so don't feel discouraged if you don't grasp everything right away. We've started emphasizing this point and encourage new members to review documentation as a starting point for getting involved.
|
||||
|
||||
Additionally, differences of opinion are valued and encouraged within the Policy-WG. We adhere to the CNCF core values and resolve disagreements by maintaining respect for one another. We also strive to timebox our decisions and assign clear responsibilities to keep things moving forward.
|
||||
|
||||
|
||||
## New contributors
|
||||
|
||||
**What skills are expected from new contributors and how can they get involved with the Policy Working Group?**
|
||||
|
||||
The Policy WG is ideal for anyone who is passionate about Kubernetes security, governance, and compliance and wants to help shape the future of how we build, deploy, and operate cloud-native workloads.
|
||||
|
||||
Join the mailing list as described on our community [page](https://github.com/kubernetes/community/blob/master/wg-policy/README.md) and attend one of our upcoming [community meetings](https://github.com/kubernetes/community/tree/master/wg-policy#meetings).
|
||||
|
||||
|
||||
---
|
||||
|
||||
This is where our discussion about the Policy Working Group ends. The working group, and especially the people who took part in this article, hope this gave you some insights into the group's aims and workings. Of course, this is just the tip of the iceberg. To learn more and get involved with the Policy Working Group, consider attending their meetings. You can find the schedule and join their [discussions](https://github.com/kubernetes/community/tree/master/wg-policy).
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="900px" height="250px" style="shape-rendering:geometricPrecision; text-rendering:geometricPrecision; image-rendering:optimizeQuality; fill-rule:evenodd; clip-rule:evenodd" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||
<g><path style="opacity:0.908" fill="#fbfcfe" d="M 134.5,17.5 C 137.85,17.335 141.183,17.5017 144.5,18C 170.5,30.3333 196.5,42.6667 222.5,55C 226.894,58.0684 229.728,62.235 231,67.5C 236.872,93.8622 242.872,120.196 249,146.5C 249.61,150.236 249.277,153.903 248,157.5C 230.333,179.833 212.667,202.167 195,224.5C 192.441,227.531 189.274,229.698 185.5,231C 154.5,231.667 123.5,231.667 92.5,231C 88.7257,229.698 85.559,227.531 83,224.5C 66.9068,203.984 50.5734,183.651 34,163.5C 27.7798,155.497 26.7798,146.83 31,137.5C 36.6667,113.167 42.3333,88.8333 48,64.5C 49.7735,59.7271 52.9402,56.2271 57.5,54C 83.2576,41.7854 108.924,29.6188 134.5,17.5 Z"/></g>
|
||||
<g><path style="opacity:1" fill="#346de5" d="M 134.5,24.5 C 139.08,24.1134 143.414,24.9468 147.5,27C 171.045,38.606 194.712,49.9393 218.5,61C 222.491,63.7785 224.658,67.6119 225,72.5C 229.528,94.2768 234.528,115.943 240,137.5C 241.168,142.482 241.835,147.482 242,152.5C 241.439,154.725 240.439,156.725 239,158.5C 222.427,178.651 206.093,198.984 190,219.5C 188.269,221.617 186.102,223.117 183.5,224C 153.5,224.667 123.5,224.667 93.5,224C 73.0249,201.215 53.8582,177.382 36,152.5C 41.3608,123.356 47.6941,94.3556 55,65.5C 56.5,64 58,62.5 59.5,61C 84.8363,49.3308 109.836,37.1641 134.5,24.5 Z"/></g>
|
||||
<g><path style="opacity:1" fill="#fafbfe" d="M 133.5,45.5 C 137.167,45.5 140.833,45.5 144.5,45.5C 144.5,52.8333 144.5,60.1667 144.5,67.5C 158.146,68.9079 169.979,74.2412 180,83.5C 186.083,79.5376 191.917,75.2043 197.5,70.5C 199.493,72.6655 201.327,74.9989 203,77.5C 203.749,78.635 203.583,79.635 202.5,80.5C 197.179,84.489 192.179,88.8223 187.5,93.5C 194.894,105.411 198.061,118.411 197,132.5C 198.785,133.24 200.618,133.907 202.5,134.5C 203.471,131.879 204.804,129.546 206.5,127.5C 212.363,132.529 217.697,138.029 222.5,144C 222.355,144.772 222.022,145.439 221.5,146C 214.573,148.476 207.573,150.643 200.5,152.5C 200.5,149.833 200.5,147.167 200.5,144.5C 198.208,144.756 196.041,144.423 194,143.5C 188.976,155.86 180.976,165.86 170,173.5C 170.384,176.309 171.384,178.975 173,181.5C 174.897,179.984 177.064,179.317 179.5,179.5C 178.903,187.153 178.403,194.82 178,202.5C 177.439,203.022 176.772,203.355 176,203.5C 169.677,199.182 163.344,194.848 157,190.5C 156.312,189.668 156.479,189.002 157.5,188.5C 159.332,187.752 160.999,186.752 162.5,185.5C 161.42,183.004 160.086,180.67 158.5,178.5C 145.627,183.814 132.794,183.814 120,178.5C 118.833,180.833 117.667,183.167 116.5,185.5C 117.912,186.806 119.579,187.64 121.5,188C 122.451,188.718 122.617,189.551 122,190.5C 115.505,195.521 108.671,199.854 101.5,203.5C 100.745,195.178 100.078,186.845 99.5,178.5C 101.816,179.36 104.149,179.86 106.5,180C 107.627,178.247 108.627,176.413 109.5,174.5C 97.8509,166.691 89.3509,156.358 84,143.5C 81.9592,144.423 79.7925,144.756 77.5,144.5C 77.8333,147.167 78.1667,149.833 78.5,152.5C 71.0621,150.856 63.7288,148.689 56.5,146C 55.9781,145.439 55.6448,144.772 55.5,144C 60.3409,138.232 65.6742,132.899 71.5,128C 72.3317,127.312 72.9984,127.479 73.5,128.5C 74.3094,130.071 74.6427,131.738 74.5,133.5C 76.7925,133.756 78.9592,133.423 81,132.5C 80.115,118.45 83.2817,105.45 90.5,93.5C 85.5084,88.6769 80.3418,84.0102 75,79.5C 75.7298,75.4517 77.8965,72.6183 81.5,71C 87.0109,75.1809 92.5109,79.3475 98,83.5C 108.046,74.2274 119.879,68.8941 133.5,67.5C 133.5,60.1667 133.5,52.8333 133.5,45.5 Z"/></g>
|
||||
<g><path style="opacity:0.882" fill="#000000" d="M 858.5,74.5 C 867.424,74.3534 871.257,78.6868 870,87.5C 867.185,93.1691 862.685,95.0024 856.5,93C 850.261,88.7034 849.261,83.3701 853.5,77C 855.315,76.2432 856.981,75.4098 858.5,74.5 Z"/></g>
|
||||
<g><path style="opacity:1" fill="#356ee5" d="M 127.5,79.5 C 129.5,79.5 131.5,79.5 133.5,79.5C 133.666,89.1724 133.5,98.8391 133,108.5C 132.275,109.059 131.442,109.392 130.5,109.5C 122.292,104.225 114.625,98.2248 107.5,91.5C 113.265,85.9526 119.932,81.9526 127.5,79.5 Z"/></g>
|
||||
<g><path style="opacity:1" fill="#356de5" d="M 144.5,79.5 C 154.716,80.2764 163.382,84.2764 170.5,91.5C 163.172,97.9916 155.672,104.325 148,110.5C 147,109.833 146,109.167 145,108.5C 144.5,98.8391 144.334,89.1724 144.5,79.5 Z"/></g>
|
||||
<g><path style="opacity:0.928" fill="#000000" d="M 423.5,83.5 C 424.833,83.5 426.167,83.5 427.5,83.5C 427.5,88.8333 427.5,94.1667 427.5,99.5C 433.833,99.5 440.167,99.5 446.5,99.5C 446.5,104.167 446.5,108.833 446.5,113.5C 440.167,113.5 433.833,113.5 427.5,113.5C 427.13,121.903 427.63,130.236 429,138.5C 430.779,140.764 433.113,142.097 436,142.5C 439.478,141.671 442.978,141.004 446.5,140.5C 446.896,144.375 447.562,148.208 448.5,152C 448.095,152.945 447.428,153.612 446.5,154C 438.116,156.922 429.782,156.922 421.5,154C 415.996,151.16 412.829,146.66 412,140.5C 411.5,122.17 411.333,103.836 411.5,85.5C 415.733,85.4613 419.733,84.7947 423.5,83.5 Z"/></g>
|
||||
<g><path style="opacity:0.918" fill="#000000" d="M 311.5,98.5 C 321.347,97.9802 331.014,98.9802 340.5,101.5C 341.921,120.529 341.754,139.529 340,158.5C 337.742,166.389 332.575,171.222 324.5,173C 314.057,175.006 303.724,174.506 293.5,171.5C 294.111,166.892 295.111,162.392 296.5,158C 303.028,159.529 309.694,160.196 316.5,160C 322.554,158.957 325.054,155.457 324,149.5C 303.472,154.648 292.305,146.648 290.5,125.5C 291.084,111.263 298.084,102.263 311.5,98.5 Z M 316.5,111.5 C 319.119,111.232 321.619,111.565 324,112.5C 324.167,116.5 324.333,120.5 324.5,124.5C 327.333,136.731 323,140.564 311.5,136C 307.355,130.681 306.522,124.848 309,118.5C 310.767,115.228 313.267,112.895 316.5,111.5 Z"/></g>
|
||||
<g><path style="opacity:0.94" fill="#000000" d="M 364.5,98.5 C 371.175,98.3337 377.842,98.5004 384.5,99C 391.702,100.869 396.202,105.369 398,112.5C 398.5,126.163 398.667,139.829 398.5,153.5C 387.249,155.423 375.916,155.923 364.5,155C 353.152,151.144 348.985,143.31 352,131.5C 354.443,125.394 358.943,121.894 365.5,121C 371.528,120.83 377.528,120.33 383.5,119.5C 382.625,115.126 379.958,112.626 375.5,112C 369.805,111.623 364.305,112.456 359,114.5C 357.414,109.983 356.58,105.316 356.5,100.5C 359.373,100.198 362.039,99.531 364.5,98.5 Z M 372.5,131.5 C 376.167,131.5 379.833,131.5 383.5,131.5C 383.5,135.167 383.5,138.833 383.5,142.5C 378.728,143.929 374.061,143.595 369.5,141.5C 366.482,136.899 367.482,133.565 372.5,131.5 Z"/></g>
|
||||
<g><path style="opacity:0.928" fill="#000000" d="M 472.5,98.5 C 497.203,96.5548 507.87,107.888 504.5,132.5C 493.167,132.5 481.833,132.5 470.5,132.5C 470.79,136.961 473.123,139.795 477.5,141C 479.847,141.436 482.181,141.936 484.5,142.5C 489.581,141.61 494.581,140.776 499.5,140C 500.861,144.362 501.528,148.862 501.5,153.5C 491.612,156.456 481.612,156.956 471.5,155C 458.543,150.518 452.543,141.352 453.5,127.5C 453.103,113.266 459.436,103.599 472.5,98.5 Z M 477.5,111.5 C 483.988,111.484 487.988,114.651 489.5,121C 483.175,121.5 476.842,121.666 470.5,121.5C 470.873,116.742 473.206,113.409 477.5,111.5 Z"/></g>
|
||||
<g><path style="opacity:0.926" fill="#000000" d="M 605.5,98.5 C 612.175,98.3337 618.842,98.5004 625.5,99C 635.288,101.791 640.122,108.291 640,118.5C 640.5,130.162 640.667,141.829 640.5,153.5C 628.91,155.397 617.243,155.897 605.5,155C 594.473,151.455 590.306,143.955 593,132.5C 595.154,125.994 599.654,122.161 606.5,121C 612.491,120.501 618.491,120.334 624.5,120.5C 624.064,115.564 621.397,112.731 616.5,112C 610.805,111.623 605.305,112.456 600,114.5C 598.627,109.928 597.794,105.261 597.5,100.5C 600.373,100.198 603.039,99.531 605.5,98.5 Z M 613.5,131.5 C 617.167,131.5 620.833,131.5 624.5,131.5C 624.5,135.167 624.5,138.833 624.5,142.5C 619.728,143.929 615.061,143.595 610.5,141.5C 607.462,136.989 608.462,133.656 613.5,131.5 Z"/></g>
|
||||
<g><path style="opacity:0.925" fill="#000000" d="M 742.5,98.5 C 749.175,98.3337 755.842,98.5004 762.5,99C 771.815,101.649 776.649,107.816 777,117.5C 777.5,129.495 777.667,141.495 777.5,153.5C 766.244,155.386 754.911,155.886 743.5,155C 731.751,152.02 727.251,144.52 730,132.5C 732.154,125.994 736.654,122.161 743.5,121C 749.491,120.501 755.491,120.334 761.5,120.5C 761.064,115.564 758.397,112.731 753.5,112C 747.826,111.696 742.326,112.529 737,114.5C 735.627,109.928 734.794,105.261 734.5,100.5C 737.373,100.198 740.039,99.531 742.5,98.5 Z M 750.5,131.5 C 754.167,131.5 757.833,131.5 761.5,131.5C 761.5,135.167 761.5,138.833 761.5,142.5C 757.128,143.885 752.795,143.718 748.5,142C 744.299,137.629 744.966,134.129 750.5,131.5 Z"/></g>
|
||||
<g><path style="opacity:0.945" fill="#000000" d="M 802.5,98.5 C 832.848,95.8694 845.348,109.536 840,139.5C 837.5,147.333 832.333,152.5 824.5,155C 818.472,155.641 812.472,155.474 806.5,154.5C 806.5,160.833 806.5,167.167 806.5,173.5C 801.167,173.5 795.833,173.5 790.5,173.5C 790.333,149.498 790.5,125.498 791,101.5C 794.917,100.439 798.751,99.4392 802.5,98.5 Z M 806.5,112.5 C 818.841,110.485 824.841,115.652 824.5,128C 824.34,140.262 818.34,144.429 806.5,140.5C 806.5,131.167 806.5,121.833 806.5,112.5 Z"/></g>
|
||||
<g><path style="opacity:0.919" fill="#000000" d="M 509.5,99.5 C 515.5,99.5 521.5,99.5 527.5,99.5C 529.363,110.955 531.863,122.288 535,133.5C 538.352,122.28 541.186,110.947 543.5,99.5C 547.833,99.5 552.167,99.5 556.5,99.5C 558.225,110.401 560.892,121.068 564.5,131.5C 567.793,120.994 570.46,110.328 572.5,99.5C 578.167,99.5 583.833,99.5 589.5,99.5C 584.799,118.104 578.799,136.271 571.5,154C 567.129,154.828 562.795,154.661 558.5,153.5C 555.493,144.813 552.493,136.146 549.5,127.5C 546.671,136.14 543.838,144.806 541,153.5C 536.55,154.8 532.05,154.8 527.5,153.5C 520.497,135.824 514.497,117.824 509.5,99.5 Z"/></g>
|
||||
<g><path style="opacity:0.917" fill="#000000" d="M 645.5,99.5 C 651.425,99.1918 657.259,99.5251 663,100.5C 665.869,111.773 669.536,122.773 674,133.5C 677.886,122.345 681.053,111.011 683.5,99.5C 689.167,99.5 694.833,99.5 700.5,99.5C 694.611,121.996 686.445,143.663 676,164.5C 669.118,173.048 660.284,175.881 649.5,173C 647.616,172.784 645.949,172.117 644.5,171C 645.942,166.959 646.942,162.792 647.5,158.5C 651.796,159.463 656.129,159.629 660.5,159C 662.958,157.213 664.624,154.879 665.5,152C 657.154,135.128 650.488,117.628 645.5,99.5 Z"/></g>
|
||||
<g><path style="opacity:0.95" fill="#000000" d="M 852.5,99.5 C 857.833,99.5 863.167,99.5 868.5,99.5C 868.5,117.833 868.5,136.167 868.5,154.5C 863.167,154.5 857.833,154.5 852.5,154.5C 852.5,136.167 852.5,117.833 852.5,99.5 Z"/></g>
|
||||
<g><path style="opacity:1" fill="#386ee5" d="M 99.5,100.5 C 107.134,105.665 114.468,111.332 121.5,117.5C 122.833,119.167 122.833,120.833 121.5,122.5C 112.581,125.153 103.581,127.486 94.5,129.5C 92.1812,119.117 93.8478,109.45 99.5,100.5 Z"/></g>
|
||||
<g><path style="opacity:1" fill="#386fe5" d="M 177.5,100.5 C 184.058,109.086 186.058,118.752 183.5,129.5C 174.476,127.494 165.476,125.328 156.5,123C 155.24,121.186 155.24,119.353 156.5,117.5C 163.753,112.054 170.753,106.387 177.5,100.5 Z"/></g>
|
||||
<g><path style="opacity:1" fill="#4173e6" d="M 135.5,116.5 C 141.755,115.261 145.422,117.761 146.5,124C 144.602,131.278 140.269,133.111 133.5,129.5C 130.544,124.611 131.211,120.278 135.5,116.5 Z"/></g>
|
||||
<g><path style="opacity:1" fill="#386fe5" d="M 120.5,134.5 C 122.5,134.5 124.5,134.5 126.5,134.5C 123.684,144.464 119.517,153.797 114,162.5C 105.956,157.595 100.123,150.762 96.5,142C 96.9054,141.055 97.572,140.388 98.5,140C 105.962,138.134 113.295,136.301 120.5,134.5 Z"/></g>
|
||||
<g><path style="opacity:1" fill="#386ee5" d="M 152.5,133.5 C 161.379,136.092 170.379,138.259 179.5,140C 180.428,140.388 181.095,141.055 181.5,142C 178.209,150.792 172.542,157.626 164.5,162.5C 159.86,154.421 155.693,146.087 152,137.5C 151.421,136.072 151.588,134.738 152.5,133.5 Z"/></g>
|
||||
<g><path style="opacity:1" fill="#376ee5" d="M 136.5,141.5 C 138.604,141.201 140.604,141.534 142.5,142.5C 146.737,150.968 150.403,159.635 153.5,168.5C 148.384,169.489 143.218,170.156 138,170.5C 133.215,170.678 128.715,169.678 124.5,167.5C 129.059,159.051 133.059,150.384 136.5,141.5 Z"/></g>
|
||||
</svg>
|
After Width: | Height: | Size: 12 KiB |
|
@ -0,0 +1,395 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Gateway API v1.3.0: Advancements in Request Mirroring, CORS, Gateway Merging, and Retry Budgets"
|
||||
date: 2025-06-02T09:00:00-08:00
|
||||
draft: false
|
||||
slug: gateway-api-v1-3
|
||||
author: >
|
||||
[Candace Holman](https://github.com/candita) (Red Hat)
|
||||
---
|
||||
|
||||

|
||||
|
||||
Join us in the Kubernetes SIG Network community in celebrating the general
|
||||
availability of [Gateway API](https://gateway-api.sigs.k8s.io/) v1.3.0! We are
|
||||
also pleased to announce that there are already a number of conformant
|
||||
implementations to try, made possible by postponing this blog
|
||||
announcement. Version 1.3.0 of the API was released about a month ago on
|
||||
April 24, 2025.
|
||||
|
||||
Gateway API v1.3.0 brings a new feature to the _Standard_ channel
|
||||
(Gateway API's GA release channel): _percentage-based request mirroring_, and
|
||||
introduces three new experimental features: cross-origin resource sharing (CORS)
|
||||
filters, a standardized mechanism for listener and gateway merging, and retry
|
||||
budgets.
|
||||
|
||||
Also see the full
|
||||
[release notes](https://github.com/kubernetes-sigs/gateway-api/blob/54df0a899c1c5c845dd3a80f05dcfdf65576f03c/CHANGELOG/1.3-CHANGELOG.md)
|
||||
and applaud the
|
||||
[v1.3.0 release team](https://github.com/kubernetes-sigs/gateway-api/blob/54df0a899c1c5c845dd3a80f05dcfdf65576f03c/CHANGELOG/1.3-TEAM.md)
|
||||
next time you see them.
|
||||
|
||||
## Graduation to Standard channel
|
||||
|
||||
Graduation to the Standard channel is a notable achievement for Gateway API
|
||||
features, as inclusion in the Standard release channel denotes a high level of
|
||||
confidence in the API surface and provides guarantees of backward compatibility.
|
||||
Of course, as with any other Kubernetes API, Standard channel features can continue
|
||||
to evolve with backward-compatible additions over time, and we (SIG Network)
|
||||
certainly expect
|
||||
further refinements and improvements in the future. For more information on how
|
||||
all of this works, refer to the [Gateway API Versioning Policy](https://gateway-api.sigs.k8s.io/concepts/versioning/).
|
||||
|
||||
### Percentage-based request mirroring
|
||||
Leads: [Lior Lieberman](https://github.com/LiorLieberman), [Jake Bennert](https://github.com/jakebennert)
|
||||
|
||||
GEP-3171: [Percentage-Based Request Mirroring](https://github.com/kubernetes-sigs/gateway-api/blob/main/geps/gep-3171/index.md)
|
||||
|
||||
_Percentage-based request mirroring_ is an enhancement to the
|
||||
existing support for [HTTP request mirroring](https://gateway-api.sigs.k8s.io/guides/http-request-mirroring/), which allows HTTP requests to be duplicated to another backend using the
|
||||
RequestMirror filter type. Request mirroring is particularly useful in
|
||||
blue-green deployment. It can be used to assess the impact of request scaling on
|
||||
application performance without impacting responses to clients.
|
||||
|
||||
The previous mirroring capability worked on all the requests to a `backendRef`.
|
||||
Percentage-based request mirroring allows users to specify a subset of requests
|
||||
they want to be mirrored, either by percentage or fraction. This can be
|
||||
particularly useful when services are receiving a large volume of requests.
|
||||
Instead of mirroring all of those requests, this new feature can be used to
|
||||
mirror a smaller subset of them.
|
||||
|
||||
Here's an example with 42% of the requests to "foo-v1" being mirrored to "foo-v2":
|
||||
|
||||
```yaml
|
||||
apiVersion: gateway.networking.k8s.io/v1
|
||||
kind: HTTPRoute
|
||||
metadata:
|
||||
name: http-filter-mirror
|
||||
labels:
|
||||
gateway: mirror-gateway
|
||||
spec:
|
||||
parentRefs:
|
||||
- name: mirror-gateway
|
||||
hostnames:
|
||||
- mirror.example
|
||||
rules:
|
||||
- backendRefs:
|
||||
- name: foo-v1
|
||||
port: 8080
|
||||
filters:
|
||||
- type: RequestMirror
|
||||
requestMirror:
|
||||
backendRef:
|
||||
name: foo-v2
|
||||
port: 8080
|
||||
percent: 42 # This value must be an integer.
|
||||
```
|
||||
You can also configure partial mirroring using a fraction. Here is an example
|
||||
with 5 out of every 1000 requests to "foo-v1" being mirrored to "foo-v2".
|
||||
|
||||
```yaml
|
||||
rules:
|
||||
- backendRefs:
|
||||
- name: foo-v1
|
||||
port: 8080
|
||||
filters:
|
||||
- type: RequestMirror
|
||||
requestMirror:
|
||||
backendRef:
|
||||
name: foo-v2
|
||||
port: 8080
|
||||
fraction:
|
||||
numerator: 5
|
||||
denominator: 1000
|
||||
```
|
||||
|
||||
## Additions to Experimental channel
|
||||
|
||||
The Experimental channel is Gateway API's channel for experimenting with new
|
||||
features and gaining confidence with them before allowing them to graduate to
|
||||
standard. Please note: the experimental channel may include features that are
|
||||
changed or removed later.
|
||||
|
||||
Starting in release v1.3.0, in an effort to distinguish Experimental channel
|
||||
resources from Standard channel resources, any new experimental API kinds have the
|
||||
prefix "**X**". For the same reason, experimental resources are now added to the
|
||||
API group `gateway.networking.x-k8s.io` instead of `gateway.networking.k8s.io`.
|
||||
Bear in mind that using new experimental channel resources means they can coexist
|
||||
with standard channel resources, but migrating these resources to the standard
|
||||
channel will require recreating them with the standard channel names and API
|
||||
group (both of which lack the "x-k8s" designator or "X" prefix).
|
||||
|
||||
The v1.3 release introduces two new experimental API kinds: XBackendTrafficPolicy
|
||||
and XListenerSet. To be able to use experimental API kinds, you need to install
|
||||
the Experimental channel Gateway API YAMLs from the locations listed below.
|
||||
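If you would rather install the whole Experimental channel in one step instead of individual CRD files, each Gateway API release also ships a bundled manifest; assuming the v1.3.0 release assets follow the usual naming, the install would look roughly like this:

```console
# Install all Experimental channel CRDs (including the X-prefixed kinds) for Gateway API v1.3.0
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.3.0/experimental-install.yaml
```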
|
||||
### CORS filtering
|
||||
Leads: [Liang Li](https://github.com/liangli), [Eyal Pazz](https://github.com/EyalPazz), [Rob Scott](https://github.com/robscott)
|
||||
|
||||
GEP-1767: [CORS Filter](https://github.com/kubernetes-sigs/gateway-api/blob/main/geps/gep-1767/index.md)
|
||||
|
||||
Cross-origin resource sharing (CORS) is an HTTP-header based mechanism that allows
|
||||
a web page to access restricted resources from a server on an origin (domain,
|
||||
scheme, or port) different from the domain that served the web page. This feature
|
||||
adds a new HTTPRoute `filter` type, called "CORS", to configure the handling of
|
||||
cross-origin requests before the response is sent back to the client.
|
||||
|
||||
To be able to use experimental CORS filtering, you need to install the
|
||||
[Experimental channel Gateway API HTTPRoute yaml](https://github.com/kubernetes-sigs/gateway-api/blob/main/config/crd/experimental/gateway.networking.k8s.io_httproutes.yaml).
|
||||
|
||||
Here's an example of a simple cross-origin configuration:
|
||||
```yaml
|
||||
apiVersion: gateway.networking.k8s.io/v1
|
||||
kind: HTTPRoute
|
||||
metadata:
|
||||
name: http-route-cors
|
||||
spec:
|
||||
parentRefs:
|
||||
- name: http-gateway
|
||||
rules:
|
||||
- matches:
|
||||
- path:
|
||||
type: PathPrefix
|
||||
value: /resource/foo
|
||||
filters:
|
||||
    - type: CORS
|
||||
      cors:
|
||||
allowOrigins:
|
||||
        - "*"
|
||||
allowMethods:
|
||||
- GET
|
||||
- HEAD
|
||||
- POST
|
||||
allowHeaders:
|
||||
- Accept
|
||||
- Accept-Language
|
||||
- Content-Language
|
||||
- Content-Type
|
||||
- Range
|
||||
backendRefs:
|
||||
- kind: Service
|
||||
name: http-route-cors
|
||||
port: 80
|
||||
```
|
||||
In this case, the Gateway returns an _origin header_ (`Access-Control-Allow-Origin`) of "*", which means that the
|
||||
requested resource can be referenced from any origin, a _methods header_
|
||||
(`Access-Control-Allow-Methods`) that permits the `GET`, `HEAD`, and `POST`
|
||||
verbs, and a _headers header_ allowing `Accept`, `Accept-Language`,
|
||||
`Content-Language`, `Content-Type`, and `Range`.
|
||||
|
||||
```text
|
||||
HTTP/1.1 200 OK
|
||||
Access-Control-Allow-Origin: *
|
||||
Access-Control-Allow-Methods: GET, HEAD, POST
|
||||
Access-Control-Allow-Headers: Accept,Accept-Language,Content-Language,Content-Type,Range
|
||||
```
|
||||
The complete list of fields in the new CORS filter:
|
||||
* `allowOrigins`
|
||||
* `allowMethods`
|
||||
* `allowHeaders`
|
||||
* `allowCredentials`
|
||||
* `exposeHeaders`
|
||||
* `maxAge`
|
||||
|
||||
See [CORS protocol](https://fetch.spec.whatwg.org/#http-cors-protocol) for details.
|
||||
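As a rough way to sanity-check a CORS configuration like the one above, you could send a preflight request yourself and inspect the returned headers (the gateway address below is a placeholder, and the exact response depends on your implementation):

```console
# Send a CORS preflight request to the route and show the response headers
curl -i -X OPTIONS http://<gateway-address>/resource/foo \
  -H "Origin: https://client.example" \
  -H "Access-Control-Request-Method: GET"
```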
|
||||
### XListenerSets (standardized mechanism for Listener and Gateway merging) {#XListenerSet}
|
||||
Lead: [Dave Protasowski](https://github.com/dprotaso)
|
||||
|
||||
GEP-1713: [ListenerSets - Standard Mechanism to Merge Multiple Gateways](https://github.com/kubernetes-sigs/gateway-api/pull/3213)
|
||||
|
||||
This release adds a new experimental API kind, XListenerSet, that allows a
|
||||
shared list of _listeners_ to be attached to one or more parent Gateway(s). In
|
||||
addition, it expands upon the existing suggestion that Gateway API implementations
|
||||
may merge configuration from multiple Gateway objects. It also:
|
||||
|
||||
- adds a new field `allowedListeners` to the `.spec` of a Gateway. The
|
||||
`allowedListeners` field defines from which Namespaces to select XListenerSets
|
||||
that are allowed to attach to that Gateway: Same, All, None, or Selector based.
|
||||
- increases the previous maximum number (64) of listeners with the addition of
|
||||
XListenerSets.
|
||||
- allows the delegation of listener configuration, such as TLS, to applications in
|
||||
other namespaces.
|
||||
|
||||
To be able to use experimental XListenerSet, you need to install the
|
||||
[Experimental channel Gateway API XListenerSet yaml](https://github.com/kubernetes-sigs/gateway-api/blob/main/config/crd/experimental/gateway.networking.x-k8s.io_xlistenersets.yaml).
|
||||
|
||||
The following example shows a Gateway with an HTTP listener and two child HTTPS
|
||||
XListenerSets with unique hostnames and certificates. The combined set of listeners
|
||||
attached to the Gateway includes the two additional HTTPS listeners in the
|
||||
XListenerSets that attach to the Gateway. This example illustrates the
|
||||
delegation of listener TLS config to application owners in different namespaces
|
||||
("store" and "app"). The HTTPRoute has both the Gateway listener named "foo" and
|
||||
one XListenerSet listener named "second" as `parentRefs`.
|
||||
|
||||
```yaml
|
||||
apiVersion: gateway.networking.k8s.io/v1
|
||||
kind: Gateway
|
||||
metadata:
|
||||
name: prod-external
|
||||
namespace: infra
|
||||
spec:
|
||||
gatewayClassName: example
|
||||
allowedListeners:
|
||||
- from: All
|
||||
listeners:
|
||||
- name: foo
|
||||
hostname: foo.com
|
||||
protocol: HTTP
|
||||
port: 80
|
||||
---
|
||||
apiVersion: gateway.networking.x-k8s.io/v1alpha1
|
||||
kind: XListenerSet
|
||||
metadata:
|
||||
name: store
|
||||
namespace: store
|
||||
spec:
|
||||
parentRef:
|
||||
name: prod-external
|
||||
listeners:
|
||||
- name: first
|
||||
hostname: first.foo.com
|
||||
protocol: HTTPS
|
||||
port: 443
|
||||
tls:
|
||||
mode: Terminate
|
||||
certificateRefs:
|
||||
- kind: Secret
|
||||
group: ""
|
||||
name: first-workload-cert
|
||||
---
|
||||
apiVersion: gateway.networking.x-k8s.io/v1alpha1
|
||||
kind: XListenerSet
|
||||
metadata:
|
||||
name: app
|
||||
namespace: app
|
||||
spec:
|
||||
parentRef:
|
||||
name: prod-external
|
||||
listeners:
|
||||
- name: second
|
||||
hostname: second.foo.com
|
||||
protocol: HTTPS
|
||||
port: 443
|
||||
tls:
|
||||
mode: Terminate
|
||||
certificateRefs:
|
||||
- kind: Secret
|
||||
group: ""
|
||||
name: second-workload-cert
|
||||
---
|
||||
apiVersion: gateway.networking.k8s.io/v1
|
||||
kind: HTTPRoute
|
||||
metadata:
|
||||
name: httproute-example
|
||||
spec:
|
||||
parentRefs:
|
||||
- name: app
|
||||
kind: XListenerSet
|
||||
sectionName: second
|
||||
  - name: prod-external
|
||||
kind: Gateway
|
||||
sectionName: foo
|
||||
...
|
||||
```
|
||||
Each listener in a Gateway must have a unique combination of `port` and `protocol`
|
||||
(and `hostname` if supported by the protocol) in order for all listeners to be
|
||||
**compatible** and not conflicted over which traffic they should receive.
|
||||
|
||||
Furthermore, implementations can _merge_ separate Gateways into a single set of
|
||||
listener addresses if all listeners across those Gateways are compatible. The
|
||||
management of merged listeners was under-specified in releases prior to v1.3.0.
|
||||
|
||||
With the new feature, the specification on merging is expanded. Implementations
|
||||
must treat the parent Gateways as having the merged list of all listeners from
|
||||
itself and from attached XListenerSets, and validation of this list of listeners
|
||||
must behave the same as if the list were part of a single Gateway. Within a single
|
||||
Gateway, listeners are ordered using the following precedence:
|
||||
|
||||
1. Single Listeners (not a part of an XListenerSet) first,
|
||||
2. Remaining listeners ordered by:
|
||||
- object creation time (oldest first), and if two listeners are defined in
|
||||
objects that have the same timestamp, then
|
||||
- alphabetically based on "{namespace}/{name of listener}"
|
||||
|
||||
### Retry budgets (XBackendTrafficPolicy) {#XBackendTrafficPolicy}
|
||||
Leads: [Eric Bishop](https://github.com/ericdbishop), [Mike Morris](https://github.com/mikemorris)
|
||||
|
||||
GEP-3388: [Retry Budgets](https://gateway-api.sigs.k8s.io/geps/gep-3388)
|
||||
|
||||
This feature allows you to configure a _retry budget_ across all endpoints
|
||||
of a destination Service. This is used to limit additional client-side retries
|
||||
after reaching a configured threshold. When configuring the budget, you can specify
|
||||
the maximum percentage of active requests that may consist of retries, as well as
|
||||
the interval over which requests are considered when calculating the threshold
|
||||
for retries. The development of this specification changed the existing
|
||||
experimental API kind BackendLBPolicy into a new experimental API kind,
|
||||
XBackendTrafficPolicy, in the interest of reducing the proliferation of policy
|
||||
resources that had commonalities.
|
||||
|
||||
To be able to use experimental retry budgets, you need to install the
|
||||
[Experimental channel Gateway API XBackendTrafficPolicy yaml](https://github.com/kubernetes-sigs/gateway-api/blob/main/config/crd/experimental/gateway.networking.x-k8s.io_xbackendtrafficpolicies.yaml).
|
||||
|
||||
The following example shows an XBackendTrafficPolicy that applies a
|
||||
`retryConstraint` that represents a budget that limits the retries to a maximum
|
||||
of 20% of requests, over a duration of 10 seconds, and to a minimum of 3 retries
|
||||
over 1 second.
|
||||
|
||||
```yaml
|
||||
apiVersion: gateway.networking.x-k8s.io/v1alpha1
|
||||
kind: XBackendTrafficPolicy
|
||||
metadata:
|
||||
name: traffic-policy-example
|
||||
spec:
|
||||
retryConstraint:
|
||||
budget:
|
||||
percent: 20
|
||||
interval: 10s
|
||||
minRetryRate:
|
||||
count: 3
|
||||
interval: 1s
|
||||
...
|
||||
```
|
||||
|
||||
## Try it out
|
||||
|
||||
Unlike other Kubernetes APIs, you don't need to upgrade to the latest version of
|
||||
Kubernetes to get the latest version of Gateway API. As long as you're running
|
||||
Kubernetes 1.26 or later, you'll be able to get up and running with this version
|
||||
of Gateway API.
|
||||
|
||||
To try out the API, follow the [Getting Started Guide](https://gateway-api.sigs.k8s.io/guides/).
|
||||
As of this writing, four implementations are already conformant with Gateway API
|
||||
v1.3 experimental channel features. In alphabetical order:
|
||||
|
||||
- [Airlock Microgateway 4.6](https://github.com/airlock/microgateway/releases/tag/4.6.0)
|
||||
- [Cilium main](https://github.com/cilium/cilium)
|
||||
- [Envoy Gateway v1.4.0](https://github.com/envoyproxy/gateway/releases/tag/v1.4.0)
|
||||
- [Istio 1.27-dev](https://istio.io)
|
||||
|
||||
## Get involved
|
||||
|
||||
Wondering when a feature will be added? There are lots of opportunities to get
|
||||
involved and help define the future of Kubernetes routing APIs for both ingress
|
||||
and service mesh.
|
||||
|
||||
* Check out the [user guides](https://gateway-api.sigs.k8s.io/guides) to see what use-cases can be addressed.
|
||||
* Try out one of the [existing Gateway controllers](https://gateway-api.sigs.k8s.io/implementations/).
|
||||
* Or [join us in the community](https://gateway-api.sigs.k8s.io/contributing/)
|
||||
and help us build the future of Gateway API together!
|
||||
|
||||
The maintainers would like to thank _everyone_ who's contributed to Gateway
|
||||
API, whether in the form of commits to the repo, discussion, ideas, or general
|
||||
support. We could never have made this kind of progress without the support of
|
||||
this dedicated and active community.
|
||||
|
||||
## Related Kubernetes blog articles
|
||||
|
||||
* [Gateway API v1.2: WebSockets, Timeouts, Retries, and More](/blog/2024/11/21/gateway-api-v1-2/)
|
||||
(November 2024)
|
||||
* [Gateway API v1.1: Service mesh, GRPCRoute, and a whole lot more](/blog/2024/05/09/gateway-api-v1-1/)
|
||||
(May 2024)
|
||||
* [New Experimental Features in Gateway API v1.0](/blog/2023/11/28/gateway-api-ga/)
|
||||
(November 2023)
|
||||
* [Gateway API v1.0: GA Release](/blog/2023/10/31/gateway-api-ga/)
|
||||
(October 2023)
|
|
@ -0,0 +1,234 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Start Sidecar First: How To Avoid Snags"
|
||||
date: 2025-06-03
|
||||
draft: false
|
||||
slug: start-sidecar-first
|
||||
author: Agata Skorupka (The Scale Factory)
|
||||
---
|
||||
|
||||
From the [Kubernetes Multicontainer Pods: An Overview blog post](/blog/2025/04/22/multi-container-pods-overview/) you know what their job is, what the main architectural patterns are, and how they are implemented in Kubernetes. The main thing I’ll cover in this article is how to ensure that your sidecar containers start before the main app. It’s more complicated than you might think!
|
||||
|
||||
## A gentle refresher
|
||||
|
||||
I'd just like to remind readers that the [v1.29.0 release of Kubernetes](/blog/2023/12/13/kubernetes-v1-29-release/) added native support for
|
||||
[sidecar containers](/docs/concepts/workloads/pods/sidecar-containers/), which can now be defined within the `.spec.initContainers` field,
|
||||
but with `restartPolicy: Always`. You can see that illustrated in the following example Pod manifest snippet:
|
||||
|
||||
```yaml
|
||||
initContainers:
|
||||
- name: logshipper
|
||||
image: alpine:latest
|
||||
restartPolicy: Always # this is what makes it a sidecar container
|
||||
command: ['sh', '-c', 'tail -F /opt/logs.txt']
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /opt
|
||||
```
|
||||
|
||||
What are the specifics of defining sidecars with a `.spec.initContainers` block, rather than as a legacy multi-container pod with multiple `.spec.containers`?
|
||||
Well, all `.spec.initContainers` are always launched **before** the main application. If you define Kubernetes-native sidecars, those are terminated **after** the main application. Furthermore, when used with [Jobs](/docs/concepts/workloads/controllers/job/), a sidecar container should still be alive and could potentially even restart after the owning Job is complete; Kubernetes-native sidecar containers do not block pod completion.
|
||||
|
||||
To learn more, you can also read the official [Pod sidecar containers tutorial](/docs/tutorials/configuration/pod-sidecar-containers/).
|
||||
|
||||
## The problem
|
||||
|
||||
Now you know that defining a sidecar with this native approach will always start it before the main application. From the [kubelet source code](https://github.com/kubernetes/kubernetes/blob/537a602195efdc04cdf2cb0368792afad082d9fd/pkg/kubelet/kuberuntime/kuberuntime_manager.go#L827-L830), it's visible that this often means being started almost in parallel, and this is not always what an engineer wants to achieve. What I'm really interested in is whether I can delay the start of the main application until the sidecar is not just started, but fully running and ready to serve.
|
||||
It might be a bit tricky, because the problem with sidecars is that there’s no obvious success signal, unlike init containers, which are designed to run only for a specified period of time and then exit. With an init container, exit status 0 unambiguously means "I succeeded". With a sidecar, there are lots of points at which you can say "a thing is running".
|
||||
Starting one container only after the previous one is ready is part of a graceful deployment strategy, ensuring proper sequencing and stability during startup. It’s also how I’d expect sidecar containers to work, to cover the scenario where the main application is dependent on the sidecar. For example, an app may error out if the sidecar isn’t available to serve requests (e.g., logging with DataDog). Sure, one could change the application code (and it would actually be the “best practice” solution), but sometimes that isn’t possible - and this post focuses on that use case.
|
||||
|
||||
I'll explain some ways that you might try, and show you what approaches will really work.
|
||||
|
||||
## Readiness probe
|
||||
|
||||
To check whether Kubernetes native sidecar delays the start of the main application until the sidecar is ready, let’s simulate a short investigation. Firstly, I’ll simulate a sidecar container which will never be ready by implementing a readiness probe which will never succeed. As a reminder, a [readiness probe](/docs/concepts/configuration/liveness-readiness-startup-probes/) checks if the container is ready to start accepting traffic and therefore, if the pod can be used as a backend for services.
|
||||
|
||||
(Unlike standard init containers, sidecar containers can have [probes](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/) so that the kubelet can supervise the sidecar and intervene if there are problems. For example, restarting a sidecar container if it fails a health check.)
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: myapp
|
||||
labels:
|
||||
app: myapp
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: myapp
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: myapp
|
||||
spec:
|
||||
containers:
|
||||
- name: myapp
|
||||
image: alpine:latest
|
||||
command: ["sh", "-c", "sleep 3600"]
|
||||
initContainers:
|
||||
- name: nginx
|
||||
image: nginx:latest
|
||||
restartPolicy: Always
|
||||
ports:
|
||||
- containerPort: 80
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- exit 1 # this command always fails, keeping the container "Not Ready"
|
||||
periodSeconds: 5
|
||||
volumes:
|
||||
- name: data
|
||||
emptyDir: {}
|
||||
```
|
||||
|
||||
The result is:
|
||||
|
||||
```console
|
||||
controlplane $ kubectl get pods -w
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
myapp-db5474f45-htgw5 1/2 Running 0 9m28s
|
||||
|
||||
controlplane $ kubectl describe pod myapp-db5474f45-htgw5
|
||||
Name: myapp-db5474f45-htgw5
|
||||
Namespace: default
|
||||
(...)
|
||||
Events:
|
||||
Type Reason Age From Message
|
||||
---- ------ ---- ---- -------
|
||||
Normal Scheduled 17s default-scheduler Successfully assigned default/myapp-db5474f45-htgw5 to node01
|
||||
Normal Pulling 16s kubelet Pulling image "nginx:latest"
|
||||
Normal Pulled 16s kubelet Successfully pulled image "nginx:latest" in 163ms (163ms including waiting). Image size: 72080558 bytes.
|
||||
Normal Created 16s kubelet Created container nginx
|
||||
Normal Started 16s kubelet Started container nginx
|
||||
Normal Pulling 15s kubelet Pulling image "alpine:latest"
|
||||
Normal Pulled 15s kubelet Successfully pulled image "alpine:latest" in 159ms (160ms including waiting). Image size: 3652536 bytes.
|
||||
Normal Created 15s kubelet Created container myapp
|
||||
Normal Started 15s kubelet Started container myapp
|
||||
Warning Unhealthy 1s (x6 over 15s) kubelet Readiness probe failed:
|
||||
```
|
||||
|
||||
From this output and these events it’s evident that only one container is ready - and I know it can’t be the sidecar, because I’ve defined it so it’ll never be ready (you can also check container statuses with `kubectl get pod -o json`). I also saw that myapp was started before the sidecar was ready. That was not the result I wanted to achieve; in this case, the main app container has a hard dependency on its sidecar.
|
||||
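For example, a quick way to see the per-container readiness flags (using the Pod name from the output above) is a `jsonpath` query like this sketch:

```console
# Print name=ready for the app container and the sidecar (init) container
kubectl get pod myapp-db5474f45-htgw5 -o jsonpath='{range .status.containerStatuses[*]}{.name}={.ready}{"\n"}{end}{range .status.initContainerStatuses[*]}{.name}={.ready}{"\n"}{end}'
```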
|
||||
## Maybe a startup probe?
|
||||
|
||||
To ensure that the sidecar is ready before the main app container starts, I can define a `startupProbe`. It will delay the start of the main container until the probe succeeds (the command returns a `0` exit status). If you’re wondering why I’ve added it to my `initContainer`, let’s analyse what would happen if I’d added it to the myapp container instead: I wouldn’t have any guarantee that the probe would run before the main application code - and that code can potentially error out without the sidecar being up and running.
|
||||
|
||||
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: myapp
  labels:
    app: myapp
spec:
  replicas: 1
  selector:
    matchLabels:
      app: myapp
  template:
    metadata:
      labels:
        app: myapp
    spec:
      containers:
        - name: myapp
          image: alpine:latest
          command: ["sh", "-c", "sleep 3600"]
      initContainers:
        - name: nginx
          image: nginx:latest
          ports:
            - containerPort: 80
              protocol: TCP
          restartPolicy: Always
          startupProbe:
            httpGet:
              path: /
              port: 80
            initialDelaySeconds: 5
            periodSeconds: 30
            failureThreshold: 10
            timeoutSeconds: 20
      volumes:
        - name: data
          emptyDir: {}
```
This results in 2/2 containers being ready and running, and from the events it can be inferred that the main application started only after nginx had started. To confirm whether the main application actually waited for the sidecar to become ready, let's change the `startupProbe` to an `exec` type of command:
```yaml
startupProbe:
  exec:
    command:
      - /bin/sh
      - -c
      - sleep 15
```
and run `kubectl get pods -w` to watch in real time whether the readiness of both containers changes only after a 15 second delay. Again, the events confirm that the main application starts after the sidecar.
That means that using a `startupProbe` with a correct `startupProbe.httpGet` request helps to delay the main application start until the sidecar is ready. It's not optimal, but it works.

## What about the postStart lifecycle hook?

Fun fact: using the `postStart` lifecycle hook block will also do the job, but I'd have to write my own mini shell script, which is even less efficient.
```yaml
initContainers:
  - name: nginx
    image: nginx:latest
    restartPolicy: Always
    ports:
      - containerPort: 80
        protocol: TCP
    lifecycle:
      postStart:
        exec:
          command:
            - /bin/sh
            - -c
            - |
              echo "Waiting for readiness at http://localhost:80"
              until curl -sf http://localhost:80; do
                echo "Still waiting for http://localhost:80..."
                sleep 5
              done
              echo "Service is ready at http://localhost:80"
```
## Liveness probe

An interesting exercise would be to check the sidecar container behavior with a [liveness probe](/docs/concepts/configuration/liveness-readiness-startup-probes/).
A liveness probe behaves and is configured similarly to a readiness probe - the difference being that it doesn't affect the readiness of the container; instead, the kubelet restarts the container whenever the probe fails.
```yaml
livenessProbe:
  exec:
    command:
      - /bin/sh
      - -c
      - exit 1 # this command always fails, so the kubelet keeps restarting the container
  periodSeconds: 5
```
After adding a liveness probe configured just like the previous readiness probe and checking the pod's events with `kubectl describe pod`, it's visible that the sidecar has a restart count above 0. Nevertheless, the main application is neither restarted nor influenced at all, even though I'm aware that (in our imaginary worst-case scenario) it can error out when the sidecar is not there serving requests.

What if I used a `livenessProbe` without the `postStart` lifecycle hook? Both containers will be ready immediately: at the beginning, this behavior is no different from the one without any additional probes, since the liveness probe doesn't affect readiness at all. After a while, the sidecar will begin to restart itself, but it won't influence the main container.
## Findings summary

I'll summarize the startup behavior in the table below:

| Probe/Hook | Sidecar starts before the main app? | Main app waits for the sidecar to be ready? | What if the check doesn't pass? |
|------------------|----------------------------------------------------------------|--------------------------------------------------------|--------------------------------------------------|
| `readinessProbe` | **Yes**, but it's almost in parallel (effectively **no**) | **No** | Sidecar is not ready; main app continues running |
| `livenessProbe` | **Yes**, but it's almost in parallel (effectively **no**) | **No** | Sidecar is restarted, main app continues running |
| `startupProbe` | **Yes** | **Yes** | Main app is not started |
| `postStart` | **Yes**, main app container starts after `postStart` completes | **Yes**, but you have to provide custom logic for that | Main app is not started |
To summarize: with sidecars often being a dependency of the main application, you may want to delay the start of the latter until the sidecar is healthy.
The ideal pattern is to start both containers simultaneously and have the app container's own logic handle the wait for its sidecar at every level, but that's not always possible. If delaying the main application is what you need, you have to use the right kind of customization in the Pod definition. Thankfully, it's nice and quick, and you have the recipe ready above.

Happy deploying!
---
layout: blog
title: "Introducing Gateway API Inference Extension"
date: 2025-06-05
slug: introducing-gateway-api-inference-extension
draft: false
author: >
  Daneyon Hansen (Solo.io),
  Kaushik Mitra (Google),
  Jiaxin Shan (Bytedance),
  Kellen Swain (Google)
---

Modern generative AI and large language model (LLM) services create unique traffic-routing challenges on Kubernetes. Unlike typical short-lived, stateless web requests, LLM inference sessions are often long-running, resource-intensive, and partially stateful. For example, a single GPU-backed model server may keep multiple inference sessions active and maintain in-memory token caches.

Traditional load balancers focused on HTTP path or round-robin lack the specialized capabilities needed for these workloads. They also don't account for model identity or request criticality (e.g., interactive chat vs. batch jobs). Organizations often patch together ad-hoc solutions, but a standardized approach is missing.
## Gateway API Inference Extension

[Gateway API Inference Extension](https://gateway-api-inference-extension.sigs.k8s.io/) was created to address this gap by building on the existing [Gateway API](https://gateway-api.sigs.k8s.io/), adding inference-specific routing capabilities while retaining the familiar model of Gateways and HTTPRoutes. By adding an inference extension to your existing gateway, you effectively transform it into an **Inference Gateway**, enabling you to self-host GenAI/LLMs with a "model-as-a-service" mindset.

The project's goal is to improve and standardize routing to inference workloads across the ecosystem. Key objectives include enabling model-aware routing, supporting per-request criticalities, facilitating safe model roll-outs, and optimizing load balancing based on real-time model metrics. By achieving these, the project aims to reduce latency and improve accelerator (GPU) utilization for AI workloads.

## How it works

The design introduces two new Custom Resources (CRDs) with distinct responsibilities, each aligning with a specific user persona in the AI/ML serving workflow:

{{< figure src="inference-extension-resource-model.png" alt="Resource Model" class="diagram-large" clicktozoom="true" >}}
1. [InferencePool](https://gateway-api-inference-extension.sigs.k8s.io/api-types/inferencepool/)
   Defines a pool of pods (model servers) running on shared compute (e.g., GPU nodes). The platform admin can configure how these pods are deployed, scaled, and balanced. An InferencePool ensures consistent resource usage and enforces platform-wide policies. An InferencePool is similar to a Service but specialized for AI/ML serving needs and aware of the model-serving protocol.

2. [InferenceModel](https://gateway-api-inference-extension.sigs.k8s.io/api-types/inferencemodel/)
   A user-facing model endpoint managed by AI/ML owners. It maps a public name (e.g., "gpt-4-chat") to the actual model within an InferencePool. This lets workload owners specify which models (and optional fine-tuning) they want served, plus a traffic-splitting or prioritization policy.

In summary, the InferenceModel API lets AI/ML owners manage what is served, while the InferencePool lets platform operators manage where and how it's served.
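To make the split of responsibilities more concrete, here is a rough sketch of what these two resources can look like. It is based on the project's alpha API at the time of writing (`inference.networking.x-k8s.io/v1alpha2`); the resource names, labels, and weights are made up for illustration, and field names may change between releases, so treat the project documentation as the source of truth.

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  name: llama-pool            # hypothetical pool of model server Pods
spec:
  # Pods (model servers) that belong to this pool.
  selector:
    app: llama-vllm
  targetPortNumber: 8000
  # Endpoint picker (Endpoint Selection Extension) that chooses a pod per request.
  extensionRef:
    name: llama-pool-epp
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: gpt-4-chat
spec:
  # Public model name that clients reference in their requests.
  modelName: gpt-4-chat
  criticality: Critical
  poolRef:
    name: llama-pool
  # Optional traffic split across backing model versions or fine-tunes.
  targetModels:
    - name: llama-chat-v1
      weight: 90
    - name: llama-chat-v2
      weight: 10
```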
## Request flow

The flow of a request builds on the Gateway API model (Gateways and HTTPRoutes) with one or more extra inference-aware steps (extensions) in the middle. Here's a high-level example of the request flow with the [Endpoint Selection Extension (ESE)](https://gateway-api-inference-extension.sigs.k8s.io/#endpoint-selection-extension):

{{< figure src="inference-extension-request-flow.png" alt="Request Flow" class="diagram-large" clicktozoom="true" >}}

1. **Gateway Routing**
   A client sends a request (e.g., an HTTP POST to /completions). The Gateway (like Envoy) examines the HTTPRoute and identifies the matching InferencePool backend (a sketch of such an HTTPRoute is shown after the figure below).

2. **Endpoint Selection**
   Instead of simply forwarding to any available pod, the Gateway consults an inference-specific routing extension - the Endpoint Selection Extension - to pick the best of the available pods. This extension examines live pod metrics (queue lengths, memory usage, loaded adapters) to choose the ideal pod for the request.

3. **Inference-Aware Scheduling**
   The chosen pod is the one that can handle the request with the lowest latency or highest efficiency, given the user's criticality or resource needs. The Gateway then forwards traffic to that specific pod.

{{< figure src="inference-extension-epp-scheduling.png" alt="Endpoint Extension Scheduling" class="diagram-large" clicktozoom="true" >}}
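For step 1, the HTTPRoute looks like a regular Gateway API route whose backend is an InferencePool instead of a Service. A minimal, hypothetical sketch (resource names invented, API group as in the current alpha docs) might look like this:

```yaml
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: llm-route
spec:
  parentRefs:
    - name: inference-gateway   # the Gateway acting as an Inference Gateway
  rules:
    - matches:
        - path:
            type: PathPrefix
            value: /
      backendRefs:
        # The backend is an InferencePool rather than a core Service.
        - group: inference.networking.x-k8s.io
          kind: InferencePool
          name: llama-pool
```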
This extra step provides a smarter, model-aware routing mechanism that still feels like a normal single request to the client. Additionally, the design is extensible - any Inference Gateway can be enhanced with additional inference-specific extensions to handle new routing strategies, advanced scheduling logic, or specialized hardware needs. As the project continues to grow, contributors are encouraged to develop new extensions that are fully compatible with the same underlying Gateway API model, further expanding the possibilities for efficient and intelligent GenAI/LLM routing.

## Benchmarks

We evaluated this extension against a standard Kubernetes Service for a [vLLM](https://docs.vllm.ai/en/latest/)-based model serving deployment. The test environment consisted of multiple H100 (80 GB) GPU pods running vLLM ([version 1](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html)) on a Kubernetes cluster, with 10 Llama2 model replicas. The [Latency Profile Generator (LPG)](https://github.com/AI-Hypercomputer/inference-benchmark) tool was used to generate traffic and measure throughput, latency, and other metrics. The [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json) dataset served as the workload, and traffic was ramped from 100 Queries per Second (QPS) up to 1000 QPS.
### Key results

{{< figure src="inference-extension-benchmark.png" alt="Endpoint Extension Scheduling" class="diagram-large" clicktozoom="true" >}}

- **Comparable Throughput**: Throughout the tested QPS range, the ESE delivered throughput roughly on par with a standard Kubernetes Service.

- **Lower Latency**:
  - **Per-Output-Token Latency**: The ESE showed significantly lower p90 latency at higher QPS (500+), indicating that its model-aware routing decisions reduce queueing and resource contention as GPU memory approaches saturation.
  - **Overall p90 Latency**: Similar trends emerged, with the ESE reducing end-to-end tail latencies compared to the baseline, particularly as traffic increased beyond 400-500 QPS.

These results suggest that this extension's model-aware routing significantly reduced latency for GPU-backed LLM workloads. By dynamically selecting the least-loaded or best-performing model server, it avoids hotspots that can appear when using traditional load balancing methods for large, long-running inference requests.

## Roadmap

As the Gateway API Inference Extension heads toward GA, planned features include:
1. **Prefix-cache aware load balancing** for remote caches
2. **LoRA adapter pipelines** for automated rollout
3. **Fairness and priority** between workloads in the same criticality band
4. **HPA support** for scaling based on aggregate, per-model metrics
5. **Support for large multi-modal inputs/outputs**
6. **Additional model types** (e.g., diffusion models)
7. **Heterogeneous accelerators** (serving on multiple accelerator types with latency- and cost-aware load balancing)
8. **Disaggregated serving** for independently scaling pools

## Summary

By aligning model serving with Kubernetes-native tooling, Gateway API Inference Extension aims to simplify and standardize how AI/ML traffic is routed. With model-aware routing, criticality-based prioritization, and more, it helps ops teams deliver the right LLM services to the right users - smoothly and efficiently.

**Ready to learn more?** Visit the [project docs](https://gateway-api-inference-extension.sigs.k8s.io/) to dive deeper, give an Inference Gateway extension a try with a few [simple steps](https://gateway-api-inference-extension.sigs.k8s.io/guides/), and [get involved](https://gateway-api-inference-extension.sigs.k8s.io/contributing/) if you're interested in contributing to the project!
---
layout: blog
title: "Enhancing Kubernetes Event Management with Custom Aggregation"
date: 2025-06-10
draft: false
slug: enhancing-kubernetes-event-management-custom-aggregation
author: >
  [Rez Moss](https://github.com/rezmoss)
---
Kubernetes [Events](/docs/reference/kubernetes-api/cluster-resources/event-v1/) provide crucial insights into cluster operations, but as clusters grow, managing and analyzing these events becomes increasingly challenging. This blog post explores how to build custom event aggregation systems that help engineering teams better understand cluster behavior and troubleshoot issues more effectively.

## The challenge with Kubernetes events

In a Kubernetes cluster, events are generated for various operations - from pod scheduling and container starts to volume mounts and network configurations. While these events are invaluable for debugging and monitoring, several challenges emerge in production environments:

1. **Volume**: Large clusters can generate thousands of events per minute
2. **Retention**: Default event retention is limited to one hour
3. **Correlation**: Related events from different components are not automatically linked
4. **Classification**: Events lack standardized severity or category classifications
5. **Aggregation**: Similar events are not automatically grouped

To learn more about Events in Kubernetes, read the [Event](/docs/reference/kubernetes-api/cluster-resources/event-v1/) API reference.

## Real-world value
Consider a production environment with tens of microservices where users report intermittent transaction failures:

**Traditional event aggregation process:** Engineers waste hours sifting through thousands of standalone events spread across namespaces. By the time they investigate, the older events have long since been purged, and correlating pod restarts with node-level issues is practically impossible.

**With custom event aggregation:** The system groups events across resources, instantly surfacing correlation patterns such as volume mount timeouts preceding pod restarts. Retained history shows that the same pattern occurred during past record traffic spikes, pointing to a storage scalability issue in minutes rather than hours.

Organizations that implement this approach commonly cut their troubleshooting time significantly while increasing system reliability by detecting patterns early.
## Building an Event aggregation system

This post explores how to build a custom event aggregation system that addresses these challenges, aligned to Kubernetes best practices. I've picked the Go programming language for my example.

### Architecture overview

This event aggregation system consists of three main components:

1. **Event Watcher**: Monitors the Kubernetes API for new events
2. **Event Processor**: Processes, categorizes, and correlates events
3. **Storage Backend**: Stores processed events for longer retention

Here's a sketch for how to implement the event watcher:
```go
package main

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	eventsv1 "k8s.io/api/events/v1"
)

type EventWatcher struct {
	clientset *kubernetes.Clientset
}

func NewEventWatcher(config *rest.Config) (*EventWatcher, error) {
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		return nil, err
	}
	return &EventWatcher{clientset: clientset}, nil
}

func (w *EventWatcher) Watch(ctx context.Context) (<-chan *eventsv1.Event, error) {
	events := make(chan *eventsv1.Event)

	watcher, err := w.clientset.EventsV1().Events("").Watch(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}

	go func() {
		defer close(events)
		for {
			select {
			case event := <-watcher.ResultChan():
				if e, ok := event.Object.(*eventsv1.Event); ok {
					events <- e
				}
			case <-ctx.Done():
				watcher.Stop()
				return
			}
		}
	}()

	return events, nil
}
```
### Event processing and classification

The event processor enriches events with additional context and classification:
```go
type EventProcessor struct {
	categoryRules    []CategoryRule
	correlationRules []CorrelationRule
}

type ProcessedEvent struct {
	Event         *eventsv1.Event
	Category      string
	Severity      string
	CorrelationID string
	Metadata      map[string]string
}

func (p *EventProcessor) Process(event *eventsv1.Event) *ProcessedEvent {
	processed := &ProcessedEvent{
		Event:    event,
		Metadata: make(map[string]string),
	}

	// Apply classification rules
	processed.Category = p.classifyEvent(event)
	processed.Severity = p.determineSeverity(event)

	// Generate correlation ID for related events
	processed.CorrelationID = p.correlateEvent(event)

	// Add useful metadata
	processed.Metadata = p.extractMetadata(event)

	return processed
}
```
### Implementing Event correlation

One of the key features you could implement is a way of correlating related Events. Here's an example correlation strategy:
```go
func (p *EventProcessor) correlateEvent(event *eventsv1.Event) string {
	// Correlation strategies:
	// 1. Time-based: Events within a time window
	// 2. Resource-based: Events affecting the same resource
	// 3. Causation-based: Events with cause-effect relationships

	correlationKey := generateCorrelationKey(event)
	return correlationKey
}

func generateCorrelationKey(event *eventsv1.Event) string {
	// Example: Combine namespace, resource type, and name.
	// Note: events.k8s.io/v1 calls the referenced object "Regarding"
	// (the core/v1 Event API calls it "InvolvedObject").
	return fmt.Sprintf("%s/%s/%s",
		event.Regarding.Namespace,
		event.Regarding.Kind,
		event.Regarding.Name,
	)
}
```
## Event storage and retention

For long-term storage and analysis, you'll probably want a backend that supports:
- Efficient querying of large event volumes
- Flexible retention policies
- Support for aggregation queries

Here's a sample storage interface:
```go
type EventStorage interface {
	Store(context.Context, *ProcessedEvent) error
	Query(context.Context, EventQuery) ([]ProcessedEvent, error)
	Aggregate(context.Context, AggregationParams) ([]EventAggregate, error)
}

type EventQuery struct {
	TimeRange     TimeRange
	Categories    []string
	Severity      []string
	CorrelationID string
	Limit         int
}

type AggregationParams struct {
	GroupBy    []string
	TimeWindow string
	Metrics    []string
}
```
## Good practices for Event management

1. **Resource Efficiency**
   - Implement rate limiting for event processing
   - Use efficient filtering at the API server level (see the sketch after this list)
   - Batch events for storage operations

2. **Scalability**
   - Distribute event processing across multiple workers
   - Use leader election for coordination
   - Implement backoff strategies for API rate limits

3. **Reliability**
   - Handle API server disconnections gracefully
   - Buffer events during storage backend unavailability
   - Implement retry mechanisms with exponential backoff
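As a rough illustration of the first two points, the watcher from earlier could ask the API server to pre-filter events and then rate-limit local processing. This is only a sketch under assumptions: it reuses the `EventWatcher` type defined above, the `type=Warning` field selector is just one example of server-side filtering (check which field selectors your cluster version supports for the events API), and the limiter values are arbitrary.

```go
import (
	"context"

	"golang.org/x/time/rate"
	eventsv1 "k8s.io/api/events/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// WatchWarnings is a variant of Watch that filters on the server side and
// rate-limits how fast events are handed to downstream processing.
func (w *EventWatcher) WatchWarnings(ctx context.Context) (<-chan *eventsv1.Event, error) {
	// Let the API server drop everything except Warning events.
	opts := metav1.ListOptions{FieldSelector: "type=Warning"}
	watcher, err := w.clientset.EventsV1().Events("").Watch(ctx, opts)
	if err != nil {
		return nil, err
	}

	events := make(chan *eventsv1.Event)
	limiter := rate.NewLimiter(rate.Limit(50), 100) // ~50 events/s, bursts of 100 (illustrative)

	go func() {
		defer close(events)
		defer watcher.Stop()
		for {
			select {
			case ev, ok := <-watcher.ResultChan():
				if !ok {
					return
				}
				e, isEvent := ev.Object.(*eventsv1.Event)
				if !isEvent {
					continue
				}
				if err := limiter.Wait(ctx); err != nil {
					return
				}
				events <- e
			case <-ctx.Done():
				return
			}
		}
	}()

	return events, nil
}
```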
## Advanced features

### Pattern detection

Implement pattern detection to identify recurring issues:
```go
type PatternDetector struct {
	patterns  map[string]*Pattern
	threshold int
}

func (d *PatternDetector) Detect(events []ProcessedEvent) []Pattern {
	// Group similar events
	groups := groupSimilarEvents(events)

	// Analyze frequency and timing
	patterns := identifyPatterns(groups)

	return patterns
}

func groupSimilarEvents(events []ProcessedEvent) map[string][]ProcessedEvent {
	groups := make(map[string][]ProcessedEvent)

	for _, event := range events {
		// Create similarity key based on event characteristics
		// (Regarding is the events.k8s.io/v1 name for the involved object)
		similarityKey := fmt.Sprintf("%s:%s:%s",
			event.Event.Reason,
			event.Event.Regarding.Kind,
			event.Event.Regarding.Namespace,
		)

		// Group events with the same key
		groups[similarityKey] = append(groups[similarityKey], event)
	}

	return groups
}

func identifyPatterns(groups map[string][]ProcessedEvent) []Pattern {
	var patterns []Pattern

	for key, events := range groups {
		// Only consider groups with enough events to form a pattern
		if len(events) < 3 {
			continue
		}

		// Sort events by time (events.k8s.io/v1 exposes the classic timestamps
		// as DeprecatedFirstTimestamp / DeprecatedLastTimestamp)
		sort.Slice(events, func(i, j int) bool {
			return events[i].Event.DeprecatedLastTimestamp.Time.Before(events[j].Event.DeprecatedLastTimestamp.Time)
		})

		// Calculate time range and frequency
		firstSeen := events[0].Event.DeprecatedFirstTimestamp.Time
		lastSeen := events[len(events)-1].Event.DeprecatedLastTimestamp.Time
		duration := lastSeen.Sub(firstSeen).Minutes()

		var frequency float64
		if duration > 0 {
			frequency = float64(len(events)) / duration
		}

		// Create a pattern if it meets threshold criteria
		if frequency > 0.5 { // More than 1 event per 2 minutes
			pattern := Pattern{
				Type:         key,
				Count:        len(events),
				FirstSeen:    firstSeen,
				LastSeen:     lastSeen,
				Frequency:    frequency,
				EventSamples: events[:min(3, len(events))], // Keep up to 3 samples
			}
			patterns = append(patterns, pattern)
		}
	}

	return patterns
}
```
With this implementation, the system can identify recurring patterns such as node pressure events, pod scheduling failures, or networking issues that occur with a specific frequency.

### Real-time alerts

The following example provides a starting point for building an alerting system based on event patterns. It is not a complete solution but a conceptual sketch to illustrate the approach.
```go
type AlertManager struct {
	rules     []AlertRule
	notifiers []Notifier
}

func (a *AlertManager) EvaluateEvents(events []ProcessedEvent) {
	for _, rule := range a.rules {
		if rule.Matches(events) {
			alert := rule.GenerateAlert(events)
			a.notify(alert)
		}
	}
}
```
## Conclusion

A well-designed event aggregation system can significantly improve cluster observability and troubleshooting capabilities. By implementing custom event processing, correlation, and storage, operators can better understand cluster behavior and respond to issues more effectively.

The solutions presented here can be extended and customized based on specific requirements while maintaining compatibility with the Kubernetes API and following best practices for scalability and reliability.

## Next steps

Future enhancements could include:
- Machine learning for anomaly detection
- Integration with popular observability platforms
- Custom event APIs for application-specific events
- Enhanced visualization and reporting capabilities

For more information on Kubernetes events and custom [controllers](/docs/concepts/architecture/controller/), refer to the official Kubernetes [documentation](/docs/).
---
layout: blog
title: "Changes to Kubernetes Slack"
date: 2025-06-16
canonicalUrl: https://www.kubernetes.dev/blog/2025/06/16/changes-to-kubernetes-slack-2025/
slug: changes-to-kubernetes-slack
author: >
  [Josh Berkus](https://github.com/jberkus)
---
**UPDATE**: We've received notice from Salesforce that our Slack workspace **WILL NOT BE DOWNGRADED** on June 20th. Stand by for more details, but for now, there is no urgency to back up private channels or direct messages.

~~Kubernetes Slack will lose its special status and will be changing into a standard free Slack on June 20, 2025~~. Sometime later this year, our community may move to a new platform. If you are responsible for a channel or private channel, or a member of a User Group, you will need to take some actions as soon as you can.

For the last decade, Slack has supported our project with a free customized enterprise account. They have let us know that they can no longer do so, particularly since our Slack is one of the largest and most active ones on the platform. As such, they will be downgrading it to a standard free Slack while we decide on, and implement, other options.

On Friday, June 20, we will be subject to the [feature limitations of free Slack](https://slack.com/help/articles/27204752526611-Feature-limitations-on-the-free-version-of-Slack). The primary ones that will affect us are only retaining 90 days of history, and having to disable several apps and workflows which we are currently using. The Slack Admin team will do their best to manage these limitations.

Responsible channel owners, members of private channels, and members of User Groups should [take some actions](https://github.com/kubernetes/community/blob/master/communication/slack-migration-faq.md#what-actions-do-channel-owners-and-user-group-members-need-to-take-soon) to prepare for the change and preserve information as soon as possible.

The CNCF Projects Staff have proposed that our community look at migrating to Discord. Because of existing issues where we have been pushing the limits of Slack, they have already explored what a Kubernetes Discord would look like. Discord would allow us to implement new tools and integrations which would help the community, such as GitHub group membership synchronization. The Steering Committee will discuss and decide on our future platform.

Please see our [FAQ](https://github.com/kubernetes/community/blob/master/communication/slack-migration-faq.md), and check the [kubernetes-dev mailing list](https://groups.google.com/a/kubernetes.io/g/dev/) and the [#announcements channel](https://kubernetes.slack.com/archives/C9T0QMNG4) for further news. If you have specific feedback on our Slack status, join the [discussion on GitHub](https://github.com/kubernetes/community/issues/8490).
---
layout: blog
title: "Image Compatibility In Cloud Native Environments"
date: 2025-06-25
draft: false
slug: image-compatibility-in-cloud-native-environments
author: >
  Chaoyi Huang (Huawei),
  Marcin Franczyk (Huawei),
  Vanessa Sochat (Lawrence Livermore National Laboratory)
---
In industries where systems must run very reliably and meet strict performance criteria, such as telecommunications, high-performance computing, or AI, containerized applications often need a specific operating system configuration or hardware presence. It is common practice to require the use of specific versions of the kernel, its configuration, device drivers, or system components. Despite the existence of the [Open Container Initiative (OCI)](https://opencontainers.org/), a governing community that defines standards and specifications for container images, there has been a gap in the expression of such compatibility requirements. The need to address this issue has led to different proposals and, ultimately, an implementation in Kubernetes' [Node Feature Discovery (NFD)](https://kubernetes-sigs.github.io/node-feature-discovery/stable/get-started/index.html).

[NFD](https://kubernetes-sigs.github.io/node-feature-discovery/stable/get-started/index.html) is an open source Kubernetes project that automatically detects and reports [hardware and system features](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/usage/customization-guide.html#available-features) of cluster nodes. This information helps users to schedule workloads on nodes that meet specific system requirements, which is especially useful for applications with strict hardware or operating system dependencies.

## The need for image compatibility specification
### Dependencies between containers and host OS

A container image is built on a base image, which provides a minimal runtime environment - often a stripped-down Linux userland, completely empty, or distroless. When an application requires certain features from the host OS, compatibility issues arise. These dependencies can manifest in several ways:

- **Drivers**:
  Host driver versions must match the supported range of a library version inside the container to avoid compatibility problems. Examples include GPUs and network drivers.
- **Libraries or Software**:
  The container must come with a specific version or range of versions for a library or software to run optimally in the environment. Examples from high performance computing are MPI, EFA, or Infiniband.
- **Kernel Modules or Features**:
  Specific kernel features or modules must be present. Examples include support for write-protected huge page faults, or the presence of VFIO.
- And more…

While containers in Kubernetes are the most likely unit of abstraction for these needs, the definition of compatibility can extend further to include other container technologies such as Singularity and other OCI artifacts such as binaries from a spack binary cache.
### Multi-cloud and hybrid cloud challenges

Containerized applications are deployed across various Kubernetes distributions and cloud providers, where different host operating systems introduce compatibility challenges. Often those have to be pre-configured before workload deployment, or are immutable. For instance, different cloud providers will include different operating systems like:

- **RHCOS/RHEL**
- **Photon OS**
- **Amazon Linux 2**
- **Container-Optimized OS**
- **Azure Linux OS**
- And more...

Each OS comes with unique kernel versions, configurations, and drivers, making compatibility a non-trivial issue for applications requiring specific features. It must be possible to quickly assess a container for its suitability to run in any specific environment.
### Image compatibility initiative

An effort was made within the [Open Containers Initiative Image Compatibility](https://github.com/opencontainers/wg-image-compatibility) working group to introduce a standard for image compatibility metadata. A specification for compatibility would allow container authors to declare required host OS features, making compatibility requirements discoverable and programmable. The specification implemented in Kubernetes Node Feature Discovery is one of the discussed proposals. It aims to:

- **Define a structured way to express compatibility in OCI image manifests.**
- **Support a compatibility specification alongside container images in image registries.**
- **Allow automated validation of compatibility before scheduling containers.**

The concept has since been implemented in the Kubernetes Node Feature Discovery project.
### Implementation in Node Feature Discovery

The solution integrates compatibility metadata into Kubernetes via NFD features and the [NodeFeatureGroup](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/usage/custom-resources.html#nodefeaturegroup) API. This interface enables the user to match containers to nodes based on exposed features of hardware and software, allowing for intelligent scheduling and workload optimization.
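For orientation, a NodeFeatureGroup is itself a small custom resource that groups nodes by feature rules. The following is only a rough sketch based on the NFD v0.17 documentation; the group name is invented, and you should check the NFD docs for the exact API version and fields:

```yaml
apiVersion: nfd.k8s-sigs.io/v1alpha1
kind: NodeFeatureGroup
metadata:
  name: vfio-capable-nodes   # hypothetical group name
spec:
  featureGroupRules:
    - name: "kernel and cpu"
      matchFeatures:
        - feature: kernel.loadedmodule
          matchExpressions:
            vfio-pci: {op: Exists}
        - feature: cpu.model
          matchExpressions:
            vendor_id: {op: In, value: ["Intel", "AMD"]}
```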
### Compatibility specification

The compatibility specification is a structured list of compatibility objects containing *[Node Feature Groups](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/usage/custom-resources.html#nodefeaturegroup)*. These objects define image requirements and facilitate validation against host nodes. The feature requirements are described by using [the list of available features](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/usage/customization-guide.html#available-features) from the NFD project. The schema has the following structure:

- **version** (string) - Specifies the API version.
- **compatibilities** (array of objects) - List of compatibility sets, each with:
  - **rules** (object) - Specifies [NodeFeatureGroup](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/usage/custom-resources.html#nodefeaturegroup) rules to define image requirements.
  - **weight** (int, optional) - Node affinity weight.
  - **tag** (string, optional) - Categorization tag.
  - **description** (string, optional) - Short description.

An example might look like the following:
```yaml
version: v1alpha1
compatibilities:
- description: "My image requirements"
  rules:
  - name: "kernel and cpu"
    matchFeatures:
    - feature: kernel.loadedmodule
      matchExpressions:
        vfio-pci: {op: Exists}
    - feature: cpu.model
      matchExpressions:
        vendor_id: {op: In, value: ["Intel", "AMD"]}
  - name: "one of available nics"
    matchAny:
    - matchFeatures:
      - feature: pci.device
        matchExpressions:
          vendor: {op: In, value: ["0eee"]}
          class: {op: In, value: ["0200"]}
    - matchFeatures:
      - feature: pci.device
        matchExpressions:
          vendor: {op: In, value: ["0fff"]}
          class: {op: In, value: ["0200"]}
```
### Client implementation for node validation

To streamline compatibility validation, we implemented a [client tool](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/reference/node-feature-client-reference.html) that allows for node validation based on an image's compatibility artifact. In this workflow, the image author would generate a compatibility artifact that points to the image it describes in a registry via the referrers API. When a need arises to assess the fit of an image to a host, the tool can discover the artifact and verify compatibility of an image to a node before deployment. The client can validate nodes both inside and outside a Kubernetes cluster, extending the utility of the tool beyond the single Kubernetes use case. In the future, image compatibility could play a crucial role in creating specific workload profiles based on image compatibility requirements, aiding in more efficient scheduling. Additionally, it could potentially enable automatic node configuration to some extent, further optimizing resource allocation and ensuring seamless deployment of specialized workloads.
### Examples of usage

1. **Define image compatibility metadata**

   A [container image](/docs/concepts/containers/images) can have metadata that describes its requirements based on features discovered from nodes, like kernel modules or CPU models. The previous compatibility specification example in this article exemplified this use case.

2. **Attach the artifact to the image**

   The image compatibility specification is stored as an OCI artifact. You can attach this metadata to your container image using the [oras](https://oras.land/) tool. The registry only needs to support OCI artifacts; support for arbitrary types is not required. Keep in mind that the container image and the artifact must be stored in the same registry. Use the following command to attach the artifact to the image:

   ```bash
   oras attach \
   --artifact-type application/vnd.nfd.image-compatibility.v1alpha1 <image-url> \
   <path-to-spec>.yaml:application/vnd.nfd.image-compatibility.spec.v1alpha1+yaml
   ```
3. **Validate image compatibility**

   After attaching the compatibility specification, you can validate whether a node meets the image's requirements. This validation can be done using the [nfd client](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/reference/node-feature-client-reference.html):

   ```bash
   nfd compat validate-node --image <image-url>
   ```

4. **Read the output from the client**

   Finally, you can read the report generated by the tool, or use your own tooling to act on the generated JSON report.

   ![Validation Report](validation-report.png)
## Conclusion

The addition of image compatibility to Kubernetes through Node Feature Discovery underscores the growing importance of addressing compatibility in cloud native environments. It is only a start, as further work is needed to integrate compatibility into scheduling of workloads within and outside of Kubernetes. However, by integrating this feature into Kubernetes, mission-critical workloads can now define and validate host OS requirements more efficiently. Moving forward, the adoption of compatibility metadata within Kubernetes ecosystems will significantly enhance the reliability and performance of specialized containerized applications, ensuring they meet the stringent requirements of industries like telecommunications, high-performance computing, or any environment that requires special hardware or host OS configuration.

## Get involved

Join the [Kubernetes Node Feature Discovery](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/contributing/) project if you're interested in getting involved with the design and development of the Image Compatibility API and tools. We always welcome new contributors.
---
layout: blog
title: "Navigating Failures in Pods With Devices"
date: 2025-07-03
slug: navigating-failures-in-pods-with-devices
draft: false
author: >
  Sergey Kanzhelev (Google),
  Mrunal Patel (RedHat)
---
Kubernetes is the de facto standard for container orchestration, but when it comes to handling specialized hardware like GPUs and other accelerators, things get a bit complicated. This blog post dives into the challenges of managing failure modes when operating pods with devices in Kubernetes, based on insights from [Sergey Kanzhelev and Mrunal Patel's talk at KubeCon NA 2024](https://sched.co/1i7pT). You can follow the links to [slides](https://static.sched.com/hosted_files/kccncna2024/b9/KubeCon%20NA%202024_%20Navigating%20Failures%20in%20Pods%20With%20Devices_%20Challenges%20and%20Solutions.pptx.pdf?_gl=1*191m4j5*_gcl_au*MTU1MDM0MTM1My4xNzMwOTE4ODY5LjIxNDI4Nzk1NDIuMTczMTY0ODgyMC4xNzMxNjQ4ODIy*FPAU*MTU1MDM0MTM1My4xNzMwOTE4ODY5) and [recording](https://www.youtube.com/watch?v=-YCnOYTtVO8&list=PLj6h78yzYM2Pw4mRw4S-1p_xLARMqPkA7&index=150).

## The AI/ML boom and its impact on Kubernetes

The rise of AI/ML workloads has brought new challenges to Kubernetes. These workloads often rely heavily on specialized hardware, and any device failure can significantly impact performance and lead to frustrating interruptions. As highlighted in the 2024 [Llama paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/), hardware issues, particularly GPU failures, are a major cause of disruption in AI/ML training. You can also learn how much effort NVIDIA spends on handling device failures and maintenance in the KubeCon talk by [Ryan Hallisey and Piotr Prokop All-Your-GPUs-Are-Belong-to-Us: An Inside Look at NVIDIA's Self-Healing GeForce NOW Infrastructure](https://kccncna2024.sched.com/event/1i7kJ/all-your-gpus-are-belong-to-us-an-inside-look-at-nvidias-self-healing-geforce-now-infrastructure-ryan-hallisey-piotr-prokop-pl-nvidia) ([recording](https://www.youtube.com/watch?v=iLnHtKwmu2I)), as they see 19 remediation requests per 1000 nodes a day! We also see data centers offering spot consumption models and overcommitting on power, making device failures commonplace and a part of the business model.

However, Kubernetes's view on resources is still very static. The resource is either there or not. And if it is there, the assumption is that it will stay there, fully functional - Kubernetes lacks good support for handling full or partial hardware failures. These long-existing assumptions, combined with the overall complexity of a setup, lead to a variety of failure modes, which we discuss here.
### Understanding AI/ML workloads

Generally, all AI/ML workloads require specialized hardware, have challenging scheduling requirements, and are expensive when idle. AI/ML workloads typically fall into two categories - training and inference. Here is an oversimplified view of those categories' characteristics, which are different from traditional workloads like web services:

Training
: These workloads are resource-intensive, often consuming entire machines and running as gangs of pods. Training jobs are usually "run to completion" - but that could be days, weeks or even months. Any failure in a single pod can necessitate restarting the entire step across all the pods.

Inference
: These workloads are usually long-running or run indefinitely, and can be small enough to consume a subset of a Node's devices or large enough to span multiple nodes. They often require downloading huge files with the model weights.

These workload types specifically break many past assumptions:
{{< table caption="Workload assumptions before and now" >}}
| Before | Now |
| :---- | :---- |
| Can get a better CPU and the app will work faster. | Require a **specific** device (or **class of devices**) to run. |
| When something doesn't work, just recreate it. | Allocation or reallocation is expensive. |
| Any node will work. No need to coordinate between Pods. | Scheduled in a special way - devices often connected in a cross-node topology. |
| Each Pod can be plug-and-play replaced if failed. | Pods are a part of a larger task. Lifecycle of an entire task depends on each Pod. |
| Container images are slim and easily available. | Container images may be so big that they require special handling. |
| Long initialization can be offset by slow rollout. | Initialization may be long and should be optimized, sometimes across many Pods together. |
| Compute nodes are commoditized and relatively inexpensive, so some idle time is acceptable. | Nodes with specialized hardware can be an order of magnitude more expensive than those without, so idle time is very wasteful. |
{{< /table >}}

The existing failure model was relying on old assumptions. It may still work for the new workload types, but it has limited knowledge about devices and is very expensive for them. In some cases, even prohibitively expensive. You will see more examples later in this article.
### Why Kubernetes still reigns supreme

This article does not go deeper into the question of why not to start fresh for AI/ML workloads, since they are so different from traditional Kubernetes workloads. Despite many challenges, Kubernetes remains the platform of choice for AI/ML workloads. Its maturity, security, and rich ecosystem of tools make it a compelling option. While alternatives exist, they often lack the years of development and refinement that Kubernetes offers. And the Kubernetes developers are actively addressing the gaps identified in this article and beyond.

## The current state of device failure handling

This section outlines different failure modes and the best practices and DIY (Do-It-Yourself) solutions used today. The next section will describe a roadmap for improving things for those failure modes.
### Failure modes: K8s infrastructure

In order to understand the failures related to the Kubernetes infrastructure, you need to understand how many moving parts are involved in scheduling a Pod on the node. The sequence of events when the Pod is scheduled on the Node is as follows:

1. *Device plugin* is scheduled on the Node
1. *Device plugin* is registered with the *kubelet* via local gRPC
1. *Kubelet* uses *device plugin* to watch for devices and updates capacity of the node
1. *Scheduler* places a *user Pod* on a Node based on the updated capacity
1. *Kubelet* asks *Device plugin* to **Allocate** devices for a *User Pod*
1. *Kubelet* creates a *User Pod* with the allocated devices attached to it

This diagram shows some of those actors involved:

{{< figure src="k8s-infra-devices.svg" alt="The diagram shows relationships between the kubelet, Device plugin, and a user Pod. It shows that kubelet connects to the Device plugin named my-device, kubelet reports the node status with the my-device availability, and the user Pod requesting the 2 of my-device." >}}
As there are so many actors interconnected, every one of them and every connection may experience interruptions. This leads to many exceptional situations that are often considered failures, and may cause serious workload interruptions:

* Pods failing admission at various stages of their lifecycle
* Pods unable to run on perfectly fine hardware
* Scheduling taking an unexpectedly long time

{{< figure src="k8s-infra-failures.svg" alt="The same diagram as one above it, however it has an overlayed orange bang drawings over individual components with the text indicating what can break in that component. Over the kubelet text reads: 'kubelet restart: looses all devices info before re-Watch'. Over the Device plugin text reads: 'device plugin update, evictIon, restart: kubelet cannot Allocate devices or loses all devices state'. Over the user Pod text reads: 'slow pod termination: devices are unavailable'." >}}
The goal for Kubernetes is to make the interruption between these components as reliable as possible. Kubelet already implements retries, grace periods, and other techniques to improve it. The roadmap section goes into details on other edge cases that the Kubernetes project tracks. However, all these improvements only work when these best practices are followed:

* Configure and restart kubelet and the container runtime (such as containerd or CRI-O) as early as possible to not interrupt the workload.
* Monitor device plugin health and carefully plan for upgrades.
* Do not overload the node with less-important workloads to prevent interruption of device plugin and other components.
* Configure user pods tolerations to handle node readiness flakes (a sketch follows this list).
* Configure and code graceful termination logic carefully to not block devices for too long.
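As a small illustration of the tolerations point, a pod can tolerate short node readiness flakes instead of being evicted immediately. This is only a sketch; the 300-second values are illustrative and should be tuned to how long your nodes typically take to recover:

```yaml
# Fragment of a Pod (or pod template) spec
spec:
  tolerations:
    # Keep the pod bound for up to 5 minutes if the node briefly goes NotReady.
    - key: "node.kubernetes.io/not-ready"
      operator: "Exists"
      effect: "NoExecute"
      tolerationSeconds: 300
    - key: "node.kubernetes.io/unreachable"
      operator: "Exists"
      effect: "NoExecute"
      tolerationSeconds: 300
```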
Another class of Kubernetes infra-related issues is driver-related. With traditional resources like CPU and memory, no compatibility checks between the application and hardware were needed. With special devices like hardware accelerators, there are new failure modes. Device drivers installed on the node:

* Must match the hardware
* Must be compatible with the app
* Must work with other drivers (like [nccl](https://developer.nvidia.com/nccl), etc.)

Best practices for handling driver versions:

* Monitor driver installer health
* Plan upgrades of infrastructure and Pods to match the version
* Have canary deployments whenever possible

Following the best practices in this section and using device plugins and device driver installers from trusted and reliable sources generally eliminates this class of failures. Kubernetes is tracking work to make this space even better.
### Failure modes: device failed

There is very little handling of device failure in Kubernetes today. Device plugins report the device failure only by changing the count of allocatable devices. And Kubernetes relies on standard mechanisms like liveness probes or container failures to allow Pods to communicate the failure condition to the kubelet. However, Kubernetes does not correlate device failures with container crashes and does not offer any mitigation beyond restarting the container while being attached to the same device.

This is why many plugins and DIY solutions exist to handle device failures based on various signals.
#### Health controller

In many cases a failed device will result in unrecoverable and very expensive nodes doing nothing. A simple DIY solution is a _node health controller_. The controller could compare the device allocatable count with the capacity and, if the capacity is greater, it starts a timer. Once the timer reaches a threshold, the health controller kills and recreates a node.
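A bare-bones sketch of such a controller loop is shown below. It assumes a hypothetical device resource name (`example.com/gpu`), a pre-built client-go clientset, and that simply deleting the Node object triggers replacement in your environment (for example via a cloud provider node group); a real controller would also need leader election, informers, and workload awareness.

```go
package main

import (
	"context"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

const deviceResource = corev1.ResourceName("example.com/gpu") // hypothetical device resource

// checkNodes flags nodes whose allocatable device count dropped below capacity
// and recreates them once they have been broken for longer than threshold.
func checkNodes(ctx context.Context, cs kubernetes.Interface, brokenSince map[string]time.Time, threshold time.Duration) error {
	nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}
	for _, node := range nodes.Items {
		capacity := node.Status.Capacity[deviceResource]
		allocatable := node.Status.Allocatable[deviceResource]
		if allocatable.Cmp(capacity) < 0 {
			// Some devices are no longer allocatable: start (or check) the timer.
			since, seen := brokenSince[node.Name]
			if !seen {
				brokenSince[node.Name] = time.Now()
				continue
			}
			if time.Since(since) > threshold {
				// Threshold exceeded: delete the Node object so it gets recreated.
				if err := cs.CoreV1().Nodes().Delete(ctx, node.Name, metav1.DeleteOptions{}); err != nil {
					return err
				}
				delete(brokenSince, node.Name)
			}
		} else {
			delete(brokenSince, node.Name)
		}
	}
	return nil
}
```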
There are problems with the _health controller_ approach:

* Root cause of the device failure is typically not known
* The controller is not workload aware
* Failed device might not be in use and you want to keep other devices running
* The detection may be too slow as it is very generic
* The node may be part of a bigger set of nodes and simply cannot be deleted in isolation without other nodes

There are variations of the health controller solving some of the problems above. The overall theme here, though, is that to best handle failed devices, you need customized handling for the specific workload. Kubernetes doesn't yet offer enough abstraction to express how critical the device is for a node, for the cluster, and for the Pod it is assigned to.
#### Pod failure policy

Another DIY approach for device failure handling is a per-pod reaction on a failed device. This approach is applicable for *training* workloads that are implemented as Jobs.

A Pod can define special exit codes for device failures. For example, whenever unexpected device behavior is encountered, the Pod exits with a special exit code. Then the Pod failure policy can handle the device failure in a special way. Read more on [Handling retriable and non-retriable pod failures with Pod failure policy](/docs/concepts/workloads/controllers/job/#pod-failure-policy).
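A minimal sketch of what that can look like follows. The container name, image, and the exit code `42` are hypothetical; the idea is simply that a device-failure exit code is treated differently from an ordinary application error:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: training-job
spec:
  backoffLimit: 3
  template:
    spec:
      restartPolicy: Never   # required for pod failure policy
      containers:
        - name: trainer
          image: registry.example/trainer:latest   # hypothetical training image
          # The wrapper around the training code exits with 42 on device failure.
  podFailurePolicy:
    rules:
      # Device failure (exit code 42): retry without counting against backoffLimit.
      - action: Ignore
        onExitCodes:
          containerName: trainer
          operator: In
          values: [42]
      # Node disruptions (drain, preemption) are also not counted as failures.
      - action: Ignore
        onPodConditions:
          - type: DisruptionTarget
```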
There are some problems with the _Pod failure policy_ approach for Jobs:

* There is no well-known _device failed_ condition, so this approach does not work for the generic Pod case
* Exit codes must be coded carefully and in some cases are hard to guarantee
* It only works with Jobs with `restartPolicy: Never`, due to the limitations of the pod failure policy feature

So, this solution has limited applicability.
#### Custom pod watcher
|
||||
|
||||
A little more generic approach is to implement the Pod watcher as a DIY solution
|
||||
or use some third party tools offering this functionality. The pod watcher is
|
||||
most often used to handle device failures for inference workloads.
|
||||
|
||||
Since Kubernetes just keeps a pod assigned to a device, even if the device is
|
||||
reportedly unhealthy, the idea is to detect this situation with the pod watcher
|
||||
and apply some remediation. It often involves obtaining device health status and
|
||||
its mapping to the Pod using Pod Resources API on the node. If a device fails,
|
||||
it can then delete the attached Pod as a remediation. The replica set will
|
||||
handle the Pod recreation on a healthy device.
|
||||
|
||||
The other reasons to implement this watcher:
|
||||
|
||||
* Without it, the Pod will keep being assigned to the failed device forever.
|
||||
* There is no _descheduling_ for a pod with `restartPolicy=Always`.
|
||||
* There are no built-in controllers that delete Pods in CrashLoopBackoff.
|
||||
|
||||
Problems with the _custom pod watcher_:
|
||||
|
||||
* The signal for the pod watcher is expensive to get, and involves some
|
||||
privileged actions.
|
||||
* It is a custom solution, and it has to make assumptions about how important a device is for a given Pod.
|
||||
* The pod watcher relies on external controllers to reschedule a Pod.
|
||||
|
||||
There are more variations of DIY solutions for handling device failures or
|
||||
upcoming maintenance. Overall, Kubernetes has enough extension points to
|
||||
implement these solutions. However, some extension points require higher
|
||||
privilege than users may be comfortable with, or are too disruptive. The
roadmap section goes into more detail on specific improvements in handling
device failures.
|
||||
|
||||
### Failure modes: container code failed
|
||||
|
||||
When the container code fails or something bad happens to it, such as an
out-of-memory condition, Kubernetes knows how to handle those cases. The
container is either restarted or, if the Pod has `restartPolicy: Never`, the Pod
fails and gets rescheduled onto another node. Kubernetes has limited
expressiveness about what counts as a failure (for example, a non-zero exit code
or a liveness probe failure) and how to react to such a failure (mostly either
always restarting or immediately failing the Pod).
|
||||
|
||||
This level of expressiveness is often not enough for complicated AI/ML
workloads. AI/ML Pods are better rescheduled locally, or even in-place, as that
saves on image pulling time and device allocation. AI/ML Pods are often
interconnected and need to be restarted together, which adds another level of
complexity; optimizing this often brings major savings when running AI/ML
workloads.
|
||||
|
||||
There are various DIY solutions to handle Pod failure orchestration. The most
typical one is to wrap the main executable in a container with some
orchestrator, which can then restart the main executable whenever the job needs
to be restarted because some other Pod has failed.
|
||||
|
||||
Solutions like this are very fragile and elaborate. They are often worth the
money saved compared to a regular JobSet delete/recreate cycle when used in
large training jobs. Making these solutions less fragile and more streamlined,
by developing new hooks and extension points in Kubernetes, will make them easy
to apply to smaller jobs, benefiting everybody.
|
||||
|
||||
### Failure modes: device degradation
|
||||
|
||||
Not all device failures are terminal for the overall workload or batch job.
|
||||
As the hardware stack gets more and more
|
||||
complex, misconfiguration on one of the hardware stack layers, or driver
|
||||
failures, may result in devices that are functional, but lagging on performance.
|
||||
One device that is lagging behind can slow down the whole training job.
|
||||
|
||||
We see reports of such cases more and more often. Kubernetes has no way to
express this type of failure today, and since it is the newest failure mode,
there is not much in the way of best practices offered by hardware vendors for
detection, or of third-party tooling for remediation of these situations.
|
||||
|
||||
Typically, these failures are detected based on observed workload
characteristics, for example the expected speed of AI/ML training steps on
particular hardware. Remediation for those issues depends heavily on the
workload's needs.
|
||||
|
||||
## Roadmap
|
||||
|
||||
As outlined in a section above, Kubernetes offers a lot of extension points
|
||||
which are used to implement various DIY solutions. The space of AI/ML is
|
||||
developing very fast, with changing requirements and usage patterns. SIG Node is
|
||||
taking a measured approach, preferring to enable more extension points for
implementing workload-specific scenarios over introducing new semantics to
support specific scenarios. This means prioritizing making information about failures
|
||||
readily available over implementing automatic remediations for those failures
|
||||
that might only be suitable for a subset of workloads.
|
||||
|
||||
This approach ensures there are no drastic changes for workload handling which
|
||||
may break existing, well-oiled DIY solutions or experiences with the existing
|
||||
more traditional workloads.
|
||||
|
||||
Many error handling techniques used today work for AI/ML, but are very
|
||||
expensive. SIG Node will invest in extension points to make those cheaper, with
|
||||
the understanding that cutting costs for AI/ML is critical.
|
||||
|
||||
The following is the set of specific investments we envision for various failure
|
||||
modes.
|
||||
|
||||
### Roadmap for failure modes: K8s infrastructure
|
||||
|
||||
The area of Kubernetes infrastructure is the easiest to understand and very
|
||||
important to make right for the upcoming transition from Device Plugins to DRA.
|
||||
SIG Node is tracking many work items in this area, most notably the following:
|
||||
|
||||
* [integrate kubelet with the systemd watchdog · Issue
|
||||
#127460](https://github.com/kubernetes/kubernetes/issues/127460)
|
||||
* [DRA: detect stale DRA plugin sockets · Issue
|
||||
#128696](https://github.com/kubernetes/kubernetes/issues/128696)
|
||||
* [Support takeover for devicemanager/device-plugin · Issue
|
||||
#127803](https://github.com/kubernetes/kubernetes/issues/127803)
|
||||
* [Kubelet plugin registration reliability · Issue
|
||||
#127457](https://github.com/kubernetes/kubernetes/issues/127457)
|
||||
* [Recreate the Device Manager gRPC server if failed · Issue
|
||||
#128167](https://github.com/kubernetes/kubernetes/issues/128167)
|
||||
* [Retry pod admission on device plugin grpc failures · Issue
|
||||
#128043](https://github.com/kubernetes/kubernetes/issues/128043)
|
||||
|
||||
Basically, every interaction between Kubernetes components must be made
reliable, either through kubelet improvements or through best practices in
plugin development and deployment.
|
||||
|
||||
### Roadmap for failure modes: device failed
|
||||
|
||||
For device failures, some patterns are already emerging in common scenarios
that Kubernetes can support. However, the very first step is to make information
about failed devices more readily available. The initial work here is
[KEP 4680](https://kep.k8s.io/4680) (Add Resource Health Status to the Pod
Status for Device Plugin and DRA).
|
||||
|
||||
Longer term ideas that are yet to be tested include:
|
||||
|
||||
* Integrate device failures into Pod Failure Policy.
|
||||
* Node-local retry policies, enabling pod failure policies for Pods with
|
||||
restartPolicy=OnFailure and possibly beyond that.
|
||||
* Ability to _deschedule_ a Pod, including one with `restartPolicy: Always`, so it can
get a new device allocated.
|
||||
* Add device health to the ResourceSlice used to represent devices in DRA,
|
||||
rather than simply withdrawing an unhealthy device from the ResourceSlice.
|
||||
|
||||
### Roadmap for failure modes: container code failed
|
||||
|
||||
The main improvements for handling container code failures in AI/ML workloads
all target cheaper error handling and recovery. The savings mostly come from
reusing pre-allocated resources as much as possible: reusing Pods by restarting
containers in-place, restarting containers node-locally instead of rescheduling
whenever possible, supporting snapshotting, and prioritizing the same node
during re-scheduling to save on image pulls.
|
||||
|
||||
Consider this scenario: a big training job needs 512 Pods to run, and one of
those Pods fails. This means that all Pods need to be interrupted and synced up
to restart the failed step. The most efficient way to achieve this is generally
to reuse as many Pods as possible by restarting them in-place, while replacing
the failed Pod to clear the error from it, as demonstrated in this picture:
|
||||
|
||||
{{< figure src="inplace-pod-restarts.svg" alt="The picture shows 512 Pods, most of them green with a recycle sign next to them indicating that they can be reused, and one Pod drawn in red with a new green replacement Pod next to it indicating that it needs to be replaced." >}}
|
||||
|
||||
It is possible to implement this scenario, but all solutions implementing it are
|
||||
fragile due to lack of certain extension points in Kubernetes. Adding these
|
||||
extension points to implement this scenario is on the Kubernetes roadmap.
|
||||
|
||||
### Roadmap for failure modes: device degradation
|
||||
|
||||
Very little has been done in this area. There is no clear detection signal,
very limited troubleshooting tooling, and no built-in semantics to express a
"degraded" device in Kubernetes. There has been discussion of adding data on
|
||||
device performance or degradation in the ResourceSlice used by DRA to represent
|
||||
devices, but it is not yet clearly defined. There are also projects like
|
||||
[node-healthcheck-operator](https://github.com/medik8s/node-healthcheck-operator)
|
||||
that can be used for some scenarios.
|
||||
|
||||
We expect developments in this area from hardware vendors and cloud providers,
and we expect to see mostly DIY solutions in the near future. As more users are
exposed to AI/ML workloads, this is a space that needs feedback on the patterns
being used.
|
||||
|
||||
## Join the conversation
|
||||
|
||||
The Kubernetes community encourages feedback and participation in shaping the
|
||||
future of device failure handling. Join SIG Node and contribute to the ongoing
|
||||
discussions!
|
||||
|
||||
This blog post provides a high-level overview of the challenges and future
|
||||
directions for device failure management in Kubernetes. By addressing these
|
||||
issues, Kubernetes can solidify its position as the leading platform for AI/ML
|
||||
workloads, ensuring resilience and reliability for applications that depend on
|
||||
specialized hardware.
|
|
@ -0,0 +1,216 @@
|
|||
---
|
||||
layout: blog
|
||||
title: "Post-Quantum Cryptography in Kubernetes"
|
||||
slug: pqc-in-k8s
|
||||
date: 2025-07-18
|
||||
canonicalUrl: https://www.kubernetes.dev/blog/2025/07/18/pqc-in-k8s/
|
||||
author: "Fabian Kammel (ControlPlane)"
|
||||
draft: false
|
||||
---
|
||||
|
||||
The world of cryptography is on the cusp of a major shift with the advent of
|
||||
quantum computing. While powerful quantum computers are still largely
|
||||
theoretical for many applications, their potential to break current
|
||||
cryptographic standards is a serious concern, especially for long-lived
|
||||
systems. This is where _Post-Quantum Cryptography_ (PQC) comes in. In this
|
||||
article, I'll dive into what PQC means for TLS and, more specifically, for the
Kubernetes ecosystem. I'll explain what the (surprising) state of PQC in
Kubernetes is and what the implications are for current and future clusters.
|
||||
|
||||
## What is Post-Quantum Cryptography
|
||||
|
||||
Post-Quantum Cryptography refers to cryptographic algorithms that are thought to
|
||||
be secure against attacks by both classical and quantum computers. The primary
|
||||
concern is that quantum computers, using algorithms like [Shor's Algorithm],
|
||||
could efficiently break widely used public-key cryptosystems such as RSA and
|
||||
Elliptic Curve Cryptography (ECC), which underpin much of today's secure
|
||||
communication, including TLS. The industry is actively working on standardizing
|
||||
and adopting PQC algorithms. One of the first to be standardized by [NIST] is
|
||||
the Module-Lattice Key Encapsulation Mechanism (`ML-KEM`), formerly known as
|
||||
Kyber, and now standardized as [FIPS-203] (PDF download).
|
||||
|
||||
It is difficult to predict when quantum computers will be able to break
|
||||
classical algorithms. However, it is clear that we need to start migrating to
|
||||
PQC algorithms now, as the next section shows. To get a feeling for the
|
||||
predicted timeline we can look at a [NIST report] covering the transition to
|
||||
post-quantum cryptography standards. It declares that systems using classical
cryptography should be deprecated after 2030 and disallowed after 2035.
|
||||
|
||||
## Key exchange vs. digital signatures: different needs, different timelines {#timelines}
|
||||
|
||||
In TLS, there are two main cryptographic operations we need to secure:
|
||||
|
||||
**Key Exchange**: This is how the client and server agree on a shared secret to
|
||||
encrypt their communication. If an attacker records encrypted traffic today,
|
||||
they could decrypt it in the future, if they gain access to a quantum computer
|
||||
capable of breaking the key exchange. This makes migrating KEMs to PQC an
|
||||
immediate priority.
|
||||
|
||||
**Digital Signatures**: These are primarily used to authenticate the server (and
|
||||
sometimes the client) via certificates. The authenticity of a server is
|
||||
verified at the time of connection. While important, the risk of an attack
|
||||
today is much lower, because the decision of trusting a server cannot be abused
|
||||
after the fact. Additionally, current PQC signature schemes often come with
|
||||
significant computational overhead and larger key/signature sizes compared to
|
||||
their classical counterparts.
|
||||
|
||||
Another significant hurdle in the migration to PQ certificates is the upgrade
|
||||
of root certificates. These certificates have long validity periods and are
|
||||
installed in many devices and operating systems as trust anchors.
|
||||
|
||||
Given these differences, the focus for immediate PQC adoption in TLS has been
|
||||
on hybrid key exchange mechanisms. These combine a classical algorithm (such as
|
||||
Elliptic Curve Diffie-Hellman Ephemeral (ECDHE)) with a PQC algorithm (such as
|
||||
`ML-KEM`). The resulting shared secret is secure as long as at least one of the
|
||||
component algorithms remains unbroken. The `X25519MLKEM768` hybrid scheme is the
|
||||
most widely supported one.
|
||||
|
||||
## State of PQC key exchange mechanisms (KEMs) today {#state-of-kems}
|
||||
|
||||
Support for PQC KEMs is rapidly improving across the ecosystem.
|
||||
|
||||
**Go**: The Go standard library's `crypto/tls` package introduced support for
`X25519MLKEM768` in version 1.24 (released February 2025). Crucially, it's
enabled by default when there is no explicit configuration, i.e.,
`Config.CurvePreferences` is `nil`.
|
||||
|
||||
**Browsers & OpenSSL**: Major browsers like Chrome (version 131, November 2024)
|
||||
and Firefox (version 135, February 2025), as well as OpenSSL (version 3.5.0,
|
||||
April 2025), have also added support for the `ML-KEM` based hybrid scheme.
|
||||
|
||||
Apple is also [rolling out support][ApplePQC] for `X25519MLKEM768` in version
|
||||
26 of their operating systems. Given the proliferation of Apple devices, this
|
||||
will have a significant impact on the global PQC adoption.
|
||||
|
||||
For a more detailed overview of the state of PQC in the wider industry,
|
||||
see [this blog post by Cloudflare][PQC2024].
|
||||
|
||||
## Post-quantum KEMs in Kubernetes: an unexpected arrival
|
||||
|
||||
So, what does this mean for Kubernetes? Kubernetes components, including the
|
||||
API server and kubelet, are built with Go.
|
||||
|
||||
As of Kubernetes v1.33, released in April 2025, the project uses Go 1.24. A
|
||||
quick check of the Kubernetes codebase reveals that `Config.CurvePreferences`
|
||||
is not explicitly set. This leads to a fascinating conclusion: Kubernetes
|
||||
v1.33, by virtue of using Go 1.24, supports hybrid post-quantum
|
||||
`X25519MLKEM768` for TLS connections by default!
|
||||
|
||||
You can test this yourself. If you set up a Minikube cluster running Kubernetes
|
||||
v1.33.0, you can connect to the API server using a recent OpenSSL client:
|
||||
|
||||
```console
|
||||
$ minikube start --kubernetes-version=v1.33.0
|
||||
$ kubectl cluster-info
|
||||
Kubernetes control plane is running at https://127.0.0.1:<PORT>
|
||||
$ kubectl config view --minify --raw -o jsonpath='{.clusters[0].cluster.certificate-authority-data}' | base64 -d > ca.crt
|
||||
$ openssl version
|
||||
OpenSSL 3.5.0 8 Apr 2025 (Library: OpenSSL 3.5.0 8 Apr 2025)
|
||||
$ echo -n "Q" | openssl s_client -connect 127.0.0.1:<PORT> -CAfile ca.crt
|
||||
[...]
|
||||
Negotiated TLS1.3 group: X25519MLKEM768
|
||||
[...]
|
||||
DONE
|
||||
```
|
||||
|
||||
Lo and behold, the negotiated group is `X25519MLKEM768`! This is a significant
|
||||
step towards making Kubernetes quantum-safe, seemingly without a major
|
||||
announcement or dedicated KEP (Kubernetes Enhancement Proposal).
|
||||
|
||||
## The Go version mismatch pitfall
|
||||
|
||||
An interesting wrinkle emerged with Go versions 1.23 and 1.24. Go 1.23
|
||||
included experimental support for a draft version of `ML-KEM`, identified as
|
||||
`X25519Kyber768Draft00`. This was also enabled by default if
|
||||
`Config.CurvePreferences` was `nil`. Kubernetes v1.32 used Go 1.23. However,
|
||||
Go 1.24 removed the draft support and replaced it with the standardized version
|
||||
`X25519MLKEM768`.
|
||||
|
||||
What happens if a client and server are using mismatched Go versions (one on
|
||||
1.23, the other on 1.24)? They won't have a common PQC KEM to negotiate, and
|
||||
the handshake will fall back to classical ECC curves (e.g., `X25519`). How
|
||||
could this happen in practice?
|
||||
|
||||
Consider a scenario:
|
||||
|
||||
A Kubernetes cluster is running v1.32 (using Go 1.23 and thus
|
||||
`X25519Kyber768Draft00`). A developer upgrades their `kubectl` to v1.33,
|
||||
compiled with Go 1.24, only supporting `X25519MLKEM768`. Now, when `kubectl`
|
||||
communicates with the v1.32 API server, they no longer share a common PQC
|
||||
algorithm. The connection will downgrade to classical cryptography, silently
|
||||
losing the PQC protection that has been in place. This highlights the
|
||||
importance of understanding the implications of Go version upgrades, and the
|
||||
details of the TLS stack.
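
One way to check for such a silent downgrade (a suggestion, not something from
the original article) is to force the hybrid group on the client side and see
whether the server still negotiates it; the host, port, and CA file below are
placeholders.

```shell
# Requires OpenSSL 3.5+. If the server does not support the hybrid group, the
# handshake fails and no "Negotiated TLS1.3 group" line is printed.
echo -n "Q" | openssl s_client -connect <HOST>:<PORT> -CAfile ca.crt \
  -groups X25519MLKEM768 2>/dev/null | grep "Negotiated TLS1.3 group"
```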
|
||||
|
||||
## Limitations: packet size {#limitation-packet-size}
|
||||
|
||||
One practical consideration with `ML-KEM` is the size of its public keys: the
encoded key is around 1.2 kilobytes for `ML-KEM-768`.
|
||||
This can cause the initial TLS `ClientHello` message not to fit inside
|
||||
a single TCP/IP packet, given the typical networking constraints
|
||||
(most commonly, the standard Ethernet frame size limit of 1500
|
||||
bytes). Some TLS libraries or network appliances might not handle this
|
||||
gracefully, assuming the Client Hello always fits in one packet. This issue
|
||||
has been observed in some Kubernetes-related projects and networking
|
||||
components, potentially leading to connection failures when PQC KEMs are used.
|
||||
More details can be found at [tldr.fail].
|
||||
|
||||
## State of Post-Quantum Signatures
|
||||
|
||||
While KEMs are seeing broader adoption, PQC digital signatures are further
|
||||
behind in terms of widespread integration into standard toolchains. NIST has
|
||||
published standards for PQC signatures, such as `ML-DSA` (`FIPS-204`) and
|
||||
`SLH-DSA` (`FIPS-205`). However, implementing these in a way that's broadly
|
||||
usable (e.g., for PQC Certificate Authorities) [presents challenges]:
|
||||
|
||||
**Larger Keys and Signatures**: PQC signature schemes often have significantly
|
||||
larger public keys and signature sizes compared to classical algorithms like
|
||||
Ed25519 or RSA. For instance, Dilithium2 keys can be 30 times larger than
|
||||
Ed25519 keys, and certificates can be 12 times larger.
|
||||
|
||||
**Performance**: Signing and verification operations [can be substantially slower].
|
||||
While some algorithms are on par with classical algorithms, others may have a
|
||||
much higher overhead, sometimes on the order of 10x to 1000x worse performance.
|
||||
To improve this situation, NIST is running a
|
||||
[second round of standardization][NIST2ndRound] for PQC signatures.
|
||||
|
||||
**Toolchain Support**: Mainstream TLS libraries and CA software do not yet have
|
||||
mature, built-in support for these new signature algorithms. The Go team, for
|
||||
example, has indicated that `ML-DSA` support is a high priority, but the
|
||||
soonest it might appear in the standard library is Go 1.26 [(as of May 2025)].
|
||||
|
||||
[Cloudflare's CIRCL] (Cloudflare Interoperable Reusable Cryptographic Library)
|
||||
library implements some PQC signature schemes like variants of Dilithium, and
|
||||
they maintain a [fork of Go (cfgo)] that integrates CIRCL. Using `cfgo`, it's
|
||||
possible to experiment with generating certificates signed with PQC algorithms
|
||||
like Ed25519-Dilithium2. However, this requires using a custom Go toolchain and
|
||||
is not yet part of the mainstream Kubernetes or Go distributions.
|
||||
|
||||
## Conclusion
|
||||
|
||||
The journey to a post-quantum secure Kubernetes is underway, and perhaps
|
||||
further along than many realize, thanks to the proactive adoption of `ML-KEM`
|
||||
in Go. With Kubernetes v1.33, users are already benefiting from hybrid post-quantum key
|
||||
exchange in many TLS connections by default.
|
||||
|
||||
However, awareness of potential pitfalls, such as Go version mismatches leading
|
||||
to downgrades and issues with Client Hello packet sizes, is crucial. While PQC
|
||||
for KEMs is becoming a reality, PQC for digital signatures and certificate
|
||||
hierarchies is still in earlier stages of development and adoption for
|
||||
mainstream use. As Kubernetes maintainers and contributors, staying informed
|
||||
about these developments will be key to ensuring the long-term security of the
|
||||
platform.
|
||||
|
||||
[Shor's Algorithm]: https://en.wikipedia.org/wiki/Shor%27s_algorithm
|
||||
[NIST]: https://www.nist.gov/
|
||||
[FIPS-203]: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.203.pdf
|
||||
[NIST report]: https://nvlpubs.nist.gov/nistpubs/ir/2024/NIST.IR.8547.ipd.pdf
|
||||
[tldr.fail]: https://tldr.fail/
|
||||
[presents challenges]: https://blog.cloudflare.com/another-look-at-pq-signatures/#the-algorithms
|
||||
[can be substantially slower]: https://pqshield.github.io/nist-sigs-zoo/
|
||||
[(as of May 2025)]: https://github.com/golang/go/issues/64537#issuecomment-2877714729
|
||||
[Cloudflare's CIRCL]: https://github.com/cloudflare/circl
|
||||
[fork of Go (cfgo)]: https://github.com/cloudflare/go
|
||||
[PQC2024]: https://blog.cloudflare.com/pq-2024/
|
||||
[NIST2ndRound]: https://csrc.nist.gov/news/2024/pqc-digital-signature-second-round-announcement
|
||||
[ApplePQC]: https://support.apple.com/en-lb/122756
|
|
@ -0,0 +1,23 @@
|
|||
---
|
||||
title: Careers
|
||||
bigheader: Careers in Kubernetes
|
||||
abstract: Jobs focused on Kubernetes and Cloud Native Patterns
|
||||
class: gridPage
|
||||
cid: careers
|
||||
body_class: careers
|
||||
menu:
|
||||
main:
|
||||
weight: 70
|
||||
---
|
||||
|
||||
<div class="d-flex flex-column justify-content-center mt-4 mt-md-5 px-2 px-md-3 px-lg-0">
|
||||
<iframe id="gitjobs" class="mx-auto" src="https://gitjobs.dev/embed?ts_query=kubernetes" style="width:100%;max-width:870px;height:100%;display:block;border:none;"></iframe>
|
||||
<div class="mb-4 mb-md-5 mt-1 mx-auto gitjobs-legend">
|
||||
Powered by <a href="https://gitjobs.dev" target="_blank">GitJobs</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script type="module">
|
||||
import { initialize } from "https://cdn.jsdelivr.net/npm/@open-iframe-resizer/core@latest/dist/index.js";
|
||||
initialize({}, "#gitjobs");
|
||||
</script>
|
|
@ -27,7 +27,7 @@ case_study_details:
|
|||
<p>"Every single product, every decision we make at Ancestry, focuses on delighting our customers with intimate, sometimes life-changing discoveries about themselves and their families," says MacKay. "As the company continues to grow, the increased productivity gains from using Kubernetes has helped Ancestry make customer discoveries faster. With the move to Dockerization for example, instead of taking between 20 to 50 minutes to deploy a new piece of code, we can now deploy in under a minute for much of our code. We've truly experienced significant time savings in addition to the various features and benefits from cloud native and Kubernetes-type technologies."</p>
|
||||
|
||||
{{< case-studies/quote author="PAUL MACKAY, SOFTWARE ENGINEER AND ARCHITECT AT ANCESTRY" >}}
|
||||
"At a certain point, you have to step back if you're going to push a new technology and get key thought leaders with engineers within the organization to become your champions for new technology adoption. At training sessions, the development teams were always the ones that were saying, 'Kubernetes saved our time tremendously; it's an enabler. It really is incredible.'"
|
||||
"At a certain point, you have to step back if you're going to push a new technology and get key thought leaders with engineers within the organization to become your champions for new technology adoption. At training sessions, the development teams were always the ones that were saying, 'Kubernetes saved our time tremendously; it's an enabler. It really is incredible.'"
|
||||
{{< /case-studies/quote >}}
|
||||
|
||||
{{< case-studies/lead >}}
|
||||
|
@ -48,7 +48,7 @@ It started with a Shaky Leaf.
|
|||
|
||||
<p>That need led them in 2015 to explore containerization. Ancestry engineers had already been using technology like <a href="https://www.java.com/en/">Java</a> and <a href="https://www.python.org">Python</a> on Linux, so part of the decision was about making the infrastructure more Linux-friendly. They quickly decided that they wanted to go with Docker for containerization, "but it always comes down to the orchestration part of it to make it really work," says MacKay.</p>
|
||||
|
||||
<p>His team looked at orchestration platforms offered by <a href="https://docs.docker.com/compose/">Docker Compose</a>, <a href="http://mesos.apache.org">Mesos</a> and <a href="https://www.openstack.org/software/">OpenStack</a>, and even started to prototype some homegrown solutions. And then they started hearing rumblings of the imminent release of Kubernetes v1.0. "At the forefront, we were looking at the secret store, so we didn't have to manage that all ourselves, the config maps, the methodology of seamless deployment strategy," he says. "We found that how Kubernetes had done their resources, their types, their labels and just their interface was so much further advanced than the other things we had seen. It was a feature fit."</p>
|
||||
<p>His team looked at orchestration platforms offered by <a href="https://docs.docker.com/compose/">Docker Compose</a>, <a href="https://mesos.apache.org">Mesos</a> and <a href="https://www.openstack.org/software/">OpenStack</a>, and even started to prototype some homegrown solutions. And then they started hearing rumblings of the imminent release of Kubernetes v1.0. "At the forefront, we were looking at the secret store, so we didn't have to manage that all ourselves, the config maps, the methodology of seamless deployment strategy," he says. "We found that how Kubernetes had done their resources, their types, their labels and just their interface was so much further advanced than the other things we had seen. It was a feature fit."</p>
|
||||
|
||||
{{< case-studies/lead >}}
|
||||
Plus, MacKay says, "I just believed in the confidence that comes with the history that Google has with containerization. So we started out right on the leading edge of it. And we haven't looked back since."
|
||||
|
|
|
@ -42,9 +42,9 @@ With its end-to-end commerce platform for cloud-based products and services, <a
|
|||
|
||||
<p>When Director of Software Development Pierre-Alexandre Lacerte started working there in 2014, the company had a monolith application deployed on a "tomcat infrastructure, and the whole release process was complex for what it should be," he says. "There were a lot of manual steps involved, with one engineer building a feature then creating a pull request, and a QA or another engineer validating the feature. Then it gets merged and someone else will take care of the deployment. So we had bottlenecks in the pipeline to ship a feature to production."</p>
|
||||
|
||||
<p>At the same time, the engineering team of 40 was growing, and the company wanted to add an increasing number of features to its products. As a member of the platform team, Lacerte began hearing from multiple teams that wanted to deploy applications using different frameworks and languages, from <a href="https://nodejs.org/">Node.js</a> to <a href="http://spring.io/projects/spring-boot">Spring Boot Java</a>. He soon realized that in order to both support growth and increase velocity, the company needed a better infrastructure, and a system in which teams are autonomous, can do their own deploys, and be responsible for their services in production.</p>
|
||||
<p>At the same time, the engineering team of 40 was growing, and the company wanted to add an increasing number of features to its products. As a member of the platform team, Lacerte began hearing from multiple teams that wanted to deploy applications using different frameworks and languages, from <a href="https://nodejs.org/">Node.js</a> to <a href="https://spring.io/projects/spring-boot">Spring Boot Java</a>. He soon realized that in order to both support growth and increase velocity, the company needed a better infrastructure, and a system in which teams are autonomous, can do their own deploys, and be responsible for their services in production.</p>
|
||||
|
||||
{{< case-studies/quote
|
||||
{{< case-studies/quote
|
||||
image="/images/case-studies/appdirect/banner3.jpg"
|
||||
author="Alexandre Gervais, Staff Software Developer, AppDirect"
|
||||
>}}
|
||||
|
@ -61,7 +61,7 @@ With its end-to-end commerce platform for cloud-based products and services, <a
|
|||
|
||||
<p>Lacerte's strategy ultimately worked because of the very real impact the Kubernetes platform has had to deployment time. Due to less dependency on custom-made, brittle shell scripts with SCP commands, time to deploy a new version has shrunk from 4 hours to a few minutes. Additionally, the company invested a lot of effort to make things self-service for developers. "Onboarding a new service doesn't require <a href="https://www.atlassian.com/software/jira">Jira</a> tickets or meeting with three different teams," says Lacerte. Today, the company sees 1,600 deployments per week, compared to 1-30 before.</p>
|
||||
|
||||
{{< case-studies/quote
|
||||
{{< case-studies/quote
|
||||
image="/images/case-studies/appdirect/banner4.jpg"
|
||||
author="Pierre-Alexandre Lacerte, Director of Software Development, AppDirect"
|
||||
>}}
|
||||
|
|
|
@ -20,7 +20,7 @@ case_study_details:
|
|||
|
||||
<h2>Solution</h2>
|
||||
|
||||
<p>Opting not to shift to cloud virtualization or use a private cloud on their own servers, the BlaBlaCar team became early adopters of containerization, using the CoreOs runtime <a href="https://coreos.com/rkt">rkt</a>, initially deployed using <a href="https://coreos.com/fleet/docs/latest/launching-containers-fleet.html">fleet</a> cluster manager. Last year, the company switched to <a href="http://kubernetes.io/">Kubernetes</a> orchestration, and now also uses <a href="https://prometheus.io/">Prometheus</a> for monitoring.</p>
|
||||
<p>Opting not to shift to cloud virtualization or use a private cloud on their own servers, the BlaBlaCar team became early adopters of containerization, using the CoreOs runtime <a href="https://coreos.com/rkt">rkt</a>, initially deployed using <a href="https://coreos.com/fleet/docs/latest/launching-containers-fleet.html">fleet</a> cluster manager. Last year, the company switched to <a href="https://kubernetes.io/">Kubernetes</a> orchestration, and now also uses <a href="https://prometheus.io/">Prometheus</a> for monitoring.</p>
|
||||
|
||||
<h2>Impact</h2>
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ case_study_details:
|
|||
|
||||
<h2>Solution</h2>
|
||||
|
||||
<p>Turning to microservices and containerization, GolfNow began moving its applications and databases from third-party services to its own clusters running on <a href="https://www.docker.com/">Docker</a> and <a href="http://kubernetes.io/">Kubernetes.</a></p>
|
||||
<p>Turning to microservices and containerization, GolfNow began moving its applications and databases from third-party services to its own clusters running on <a href="https://www.docker.com/">Docker</a> and <a href="https://kubernetes.io/">Kubernetes.</a></p>
|
||||
|
||||
<h2>Impact</h2>
|
||||
|
||||
|
@ -50,7 +50,7 @@ It's not every day that you can say you've slashed an operating expense by half.
|
|||
|
||||
<p>GolfNow's dev team ran an "internal, low-key" proof of concept and were won over. "We really liked how easy it was to be able to pass containers around to each other and have them up and running in no time, exactly the way it was running on my machine," says Sheriff. "Because that is always the biggest gripe that Ops has with developers, right? 'It worked on my machine!' But then we started getting to the point of, 'How do we make sure that these things stay up and running?'"</p>
|
||||
|
||||
<p>That led the team on a quest to find the right orchestration system for the company's needs. Sheriff says the first few options they tried were either too heavy or "didn't feel quite right." In late summer 2015, they discovered the just-released <a href="http://kubernetes.io/">Kubernetes</a>, which Sheriff immediately liked for its ease of use. "We did another proof of concept," he says, "and Kubernetes won because of the fact that the community backing was there, built on top of what Google had already done."</p>
|
||||
<p>That led the team on a quest to find the right orchestration system for the company's needs. Sheriff says the first few options they tried were either too heavy or "didn't feel quite right." In late summer 2015, they discovered the just-released <a href="https://kubernetes.io/">Kubernetes</a>, which Sheriff immediately liked for its ease of use. "We did another proof of concept," he says, "and Kubernetes won because of the fact that the community backing was there, built on top of what Google had already done."</p>
|
||||
|
||||
<p>But before they could go with Kubernetes, <a href="http://www.nbc.com/">NBC</a>, GolfNow's parent company, also asked them to comparison shop with another company. Sheriff and his team liked the competing company's platform user interface, but didn't like that its platform would not allow containers to run natively on Docker. With no clear decision in sight, Sheriff's VP at GolfNow, Steve McElwee, set up a three-month trial during which a GolfNow team (consisting of Sheriff and Josh, who's now Lead Architect, Open Platforms) would build out a Kubernetes environment, and a large NBC team would build out one with the other company's platform.</p>
|
||||
|
||||
|
|
|
@ -53,6 +53,9 @@ menu:
|
|||
<div class="community-nav-item">
|
||||
<a href="/releases">Releases</a>
|
||||
</div>
|
||||
<div class="community-nav-item">
|
||||
<a href="/case-studies">Case Studies</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="community-section" id="gallery">
|
||||
|
@ -124,12 +127,12 @@ menu:
|
|||
troubleshooting, and so much more.</p>
|
||||
</div>
|
||||
|
||||
<div id="twitter" class="community-resource">
|
||||
<a href="https://twitter.com/kubernetesio">
|
||||
<img src="/images/community/x-org.png" alt="𝕏.org">
|
||||
<div id="bluesky" class="community-resource">
|
||||
<a href="https://bsky.app/profile/kubernetes.io">
|
||||
<img src="/images/community/bluesky.png" alt="Bluesky">
|
||||
</a>
|
||||
<a href="https://twitter.com/kubernetesio">𝕏 ▶</a>
|
||||
<p><em>#kubernetesio</em></p>
|
||||
<a href="https://bsky.app/profile/kubernetes.io">Bluesky ▶</a>
|
||||
<p><em>@kubernetes.io</em></p>
|
||||
<p>Real-time announcements of blog posts, events, news, ideas.</p>
|
||||
</div>
|
||||
|
||||
|
@ -159,6 +162,15 @@ menu:
|
|||
Visit <a href="https://slack.k8s.io/">https://slack.k8s.io/</a>
|
||||
for an invitation.</details>
|
||||
</div>
|
||||
|
||||
<div id="twitter" class="community-resource">
|
||||
<a href="https://x.com/kubernetesio">
|
||||
<img src="/images/community/x-org.png" alt="X">
|
||||
</a>
|
||||
<a href="https://x.com/kubernetesio">𝕏 ▶</a>
|
||||
<p><em>@kubernetesio</em></p>
|
||||
<p>Real-time announcements of blog posts, events, news, ideas.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
|
@ -26,25 +26,22 @@ each Node in your cluster, so that the
|
|||
The kubelet acts as a client when connecting to the container runtime via gRPC.
|
||||
The runtime and image service endpoints have to be available in the container
|
||||
runtime, which can be configured separately within the kubelet by using the
|
||||
`--image-service-endpoint` [command line flags](/docs/reference/command-line-tools-reference/kubelet).
|
||||
`--container-runtime-endpoint`
|
||||
[command line flag](/docs/reference/command-line-tools-reference/kubelet/).
|
||||
|
||||
For Kubernetes v{{< skew currentVersion >}}, the kubelet prefers to use CRI `v1`.
|
||||
If a container runtime does not support `v1` of the CRI, then the kubelet tries to
|
||||
negotiate any older supported version.
|
||||
The v{{< skew currentVersion >}} kubelet can also negotiate CRI `v1alpha2`, but
|
||||
this version is considered as deprecated.
|
||||
If the kubelet cannot negotiate a supported CRI version, the kubelet gives up
|
||||
and doesn't register as a node.
|
||||
For Kubernetes v1.26 and later, the kubelet requires that the container runtime
|
||||
supports the `v1` CRI API. If a container runtime does not support the `v1` API,
|
||||
the kubelet will not register the node.
|
||||
|
||||
## Upgrading
|
||||
|
||||
When upgrading Kubernetes, the kubelet tries to automatically select the
|
||||
latest CRI version on restart of the component. If that fails, then the fallback
|
||||
will take place as mentioned above. If a gRPC re-dial was required because the
|
||||
container runtime has been upgraded, then the container runtime must also
|
||||
support the initially selected version or the redial is expected to fail. This
|
||||
requires a restart of the kubelet.
|
||||
When upgrading the Kubernetes version on a node, the kubelet restarts. If the
|
||||
container runtime does not support the `v1` CRI API, the kubelet will fail to
|
||||
register and report an error. If a gRPC re-dial is required because the container
|
||||
runtime has been upgraded, the runtime must support the `v1` CRI API for the
|
||||
connection to succeed. This might require a restart of the kubelet after the
|
||||
container runtime is correctly configured.
|
||||
|
||||
## {{% heading "whatsnext" %}}
|
||||
|
||||
- Learn more about the CRI [protocol definition](https://github.com/kubernetes/cri-api/blob/c75ef5b/pkg/apis/runtime/v1/api.proto)
|
||||
- Learn more about the CRI [protocol definition](https://github.com/kubernetes/cri-api/blob/v0.33.1/pkg/apis/runtime/v1/api.proto)
|
||||
|
|
|
@ -296,63 +296,6 @@ the kubelet can use topology hints when making resource assignment decisions.
|
|||
See [Control Topology Management Policies on a Node](/docs/tasks/administer-cluster/topology-manager/)
|
||||
for more information.
|
||||
|
||||
## Swap memory management {#swap-memory}
|
||||
|
||||
{{< feature-state feature_gate_name="NodeSwap" >}}
|
||||
|
||||
To enable swap on a node, the `NodeSwap` feature gate must be enabled on
|
||||
the kubelet (default is true), and the `--fail-swap-on` command line flag or `failSwapOn`
|
||||
[configuration setting](/docs/reference/config-api/kubelet-config.v1beta1/)
|
||||
must be set to false.
|
||||
To allow Pods to utilize swap, `swapBehavior` should not be set to `NoSwap` (which is the default behavior) in the kubelet config.
|
||||
|
||||
{{< warning >}}
|
||||
When the memory swap feature is turned on, Kubernetes data such as the content
|
||||
of Secret objects that were written to tmpfs now could be swapped to disk.
|
||||
{{< /warning >}}
|
||||
|
||||
A user can also optionally configure `memorySwap.swapBehavior` in order to
|
||||
specify how a node will use swap memory. For example,
|
||||
|
||||
```yaml
|
||||
memorySwap:
|
||||
swapBehavior: LimitedSwap
|
||||
```
|
||||
|
||||
- `NoSwap` (default): Kubernetes workloads will not use swap.
|
||||
- `LimitedSwap`: The utilization of swap memory by Kubernetes workloads is subject to limitations.
|
||||
Only Pods of Burstable QoS are permitted to employ swap.
|
||||
|
||||
If configuration for `memorySwap` is not specified and the feature gate is
|
||||
enabled, by default the kubelet will apply the same behaviour as the
|
||||
`NoSwap` setting.
|
||||
|
||||
With `LimitedSwap`, Pods that do not fall under the Burstable QoS classification (i.e.
|
||||
`BestEffort`/`Guaranteed` Qos Pods) are prohibited from utilizing swap memory.
|
||||
To maintain the aforementioned security and node health guarantees, these Pods
|
||||
are not permitted to use swap memory when `LimitedSwap` is in effect.
|
||||
|
||||
Prior to detailing the calculation of the swap limit, it is necessary to define the following terms:
|
||||
|
||||
* `nodeTotalMemory`: The total amount of physical memory available on the node.
|
||||
* `totalPodsSwapAvailable`: The total amount of swap memory on the node that is available for use by Pods
|
||||
(some swap memory may be reserved for system use).
|
||||
* `containerMemoryRequest`: The container's memory request.
|
||||
|
||||
Swap limitation is configured as:
|
||||
`(containerMemoryRequest / nodeTotalMemory) * totalPodsSwapAvailable`.
|
||||
|
||||
It is important to note that, for containers within Burstable QoS Pods, it is possible to
|
||||
opt-out of swap usage by specifying memory requests that are equal to memory limits.
|
||||
Containers configured in this manner will not have access to swap memory.
|
||||
|
||||
Swap is supported only with **cgroup v2**, cgroup v1 is not supported.
|
||||
|
||||
For more information, and to assist with testing and provide feedback, please
|
||||
see the blog-post about [Kubernetes 1.28: NodeSwap graduates to Beta1](/blog/2023/08/24/swap-linux-beta/),
|
||||
[KEP-2400](https://github.com/kubernetes/enhancements/issues/4128) and its
|
||||
[design proposal](https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md).
|
||||
|
||||
## {{% heading "whatsnext" %}}
|
||||
|
||||
Learn more about the following:
|
||||
|
|
|
@ -1,7 +1,14 @@
|
|||
---
|
||||
title: Kubernetes Self-Healing
|
||||
content_type: concept
|
||||
Weight: 50
|
||||
weight: 50
|
||||
feature:
|
||||
title: Self-healing
|
||||
anchor: Automated recovery from damage
|
||||
description: >
|
||||
Kubernetes restarts containers that crash, replaces entire Pods where needed,
|
||||
reattaches storage in response to wider failures, and can integrate with
|
||||
node autoscalers to self-heal even at the node level.
|
||||
---
|
||||
<!-- overview -->
|
||||
|
||||
|
|
|
@ -82,6 +82,9 @@ installation instructions. The list does not try to be exhaustive.
|
|||
* [Spiderpool](https://github.com/spidernet-io/spiderpool) is an underlay and RDMA
|
||||
networking solution for Kubernetes. Spiderpool is supported on bare metal, virtual machines,
|
||||
and public cloud environments.
|
||||
* [Terway](https://github.com/AliyunContainerService/terway/) is a suite of CNI plugins
|
||||
based on AlibabaCloud's VPC and ECS network products. It provides native VPC networking
|
||||
and network policies in AlibabaCloud environments.
|
||||
* [Weave Net](https://github.com/rajch/weave#using-weave-on-kubernetes)
|
||||
provides networking and network policy, will carry on working on both sides
|
||||
of a network partition, and does not require an external database.
|
||||
|
|
|
@ -0,0 +1,402 @@
|
|||
---
|
||||
title: Swap memory management
|
||||
content_type: concept
|
||||
weight: 10
|
||||
---
|
||||
|
||||
<!-- overview -->
|
||||
|
||||
Kubernetes can be configured to use swap memory on a {{< glossary_tooltip text="node" term_id="node" >}},
|
||||
allowing the kernel to free up physical memory by swapping out pages to backing storage.
|
||||
This is useful for multiple use cases.
For example, it helps nodes running workloads that can benefit from using swap,
such as those that have large memory footprints but only access a portion of that memory at any given time.
It also helps prevent Pods from being terminated during memory pressure spikes,
shields the node from system-level memory spikes that might compromise its stability,
allows for more flexible memory management on the node, and much more.
|
||||
|
||||
<!-- body -->
|
||||
|
||||
## How to use it?
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Swap must be enabled and provisioned on the node.
|
||||
- The node must run a Linux operating system.
|
||||
- The node must use cgroup v2. Kubernetes does not support swap on cgroup v1 nodes.
|
||||
|
||||
## Enabling swap for Kubernetes Workloads
|
||||
|
||||
To allow Kubernetes workloads to use swap,
|
||||
you must disable the kubelet's default behavior of failing when swap is detected,
|
||||
and specify memory-swap behavior as `LimitedSwap`:
|
||||
|
||||
**Update kubelet configuration:**
|
||||
```yaml
|
||||
# this fragment goes into the kubelet's configuration file
|
||||
failSwapOn: false
|
||||
memorySwap:
|
||||
swapBehavior: LimitedSwap
|
||||
```
|
||||
|
||||
The available choices for `swapBehavior` are:
|
||||
- `NoSwap` (default): Kubernetes workloads cannot use swap. However, processes
|
||||
outside of Kubernetes' scope, like system daemons (such as kubelet itself!) can utilize swap.
|
||||
This behavior is beneficial for protecting the node from system-level memory spikes,
|
||||
but it does not safeguard the workloads themselves from such spikes.
|
||||
- `LimitedSwap`: Kubernetes workloads can utilize swap memory.
|
||||
The amount of swap available to a Pod is determined automatically.
|
||||
For more details, see the [section below](#how-is-the-swap-limit-being-determined-with-limitedswap).
|
||||
|
||||
If configuration for `memorySwap` is not specified,
|
||||
by default the kubelet will apply the same behaviour as the `NoSwap` setting.
|
||||
|
||||
Bear in mind that the following pods would be excluded from swap access
|
||||
(see more info in the [section below](#how-is-the-swap-limit-being-determined-with-limitedswap)):
|
||||
- Pods that are not classified as Burstable QoS.
|
||||
- High-priority Pods.
- Containers whose memory limit is equal to their memory request.
|
||||
|
||||
{{< note >}}
|
||||
|
||||
Kubernetes only supports swap for Linux nodes.
|
||||
|
||||
{{< /note >}}
|
||||
|
||||
## How does it work?
|
||||
|
||||
There are a number of possible ways that one could envision swap use on a node.
|
||||
If the kubelet is already running on a node, it needs to be restarted after swap is provisioned in order for it to detect the swap.
|
||||
|
||||
When kubelet starts on a node in which swap is provisioned and available
|
||||
(with the `failSwapOn: false` configuration), kubelet will:
|
||||
- Be able to start on this swap-enabled node.
|
||||
- Direct the Container Runtime Interface (CRI) implementation, often referred to as the container runtime,
|
||||
to allocate zero swap memory to Kubernetes workloads by default.
|
||||
|
||||
Swap configuration on a node is exposed to a cluster admin via the
|
||||
[`memorySwap` in the KubeletConfiguration](/docs/reference/config-api/kubelet-config.v1).
|
||||
As a cluster administrator, you can specify the node's behaviour in the
|
||||
presence of swap memory by setting `memorySwap.swapBehavior`.
|
||||
|
||||
The kubelet uses the container runtime API, and directs the container runtime to
|
||||
apply specific configuration (for example, in the cgroup v2 case, `memory.swap.max`) in a manner that will
|
||||
enable the desired swap configuration for a container. For runtimes that use control groups, or cgroups,
|
||||
the container runtime is then responsible for writing these settings to the container-level cgroup.
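
If you want to see the effect of this on a node, one way (the paths are
illustrative and depend on your container runtime, cgroup driver, and Pod) is
to read the container's `memory.swap.max` file directly:

```shell
# Illustrative only: inspect the swap limit the container runtime wrote for a
# container's cgroup on a cgroup v2 node; the slice and scope names vary.
cat /sys/fs/cgroup/kubepods.slice/kubepods-burstable.slice/<pod-slice>/<container-scope>/memory.swap.max
```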
|
||||
|
||||
## Observability for swap use
|
||||
|
||||
### Node and container level metric statistics
|
||||
|
||||
The kubelet now collects node- and container-level metric statistics,
which can be accessed at the `/metrics/resource` (used mainly by monitoring
tools like Prometheus) and `/stats/summary` (used mainly by autoscalers) kubelet HTTP endpoints.
This allows clients that can directly query the kubelet to
monitor swap usage and remaining swap memory when using `LimitedSwap`.
|
||||
Additionally, a `machine_swap_bytes` metric has been added to cadvisor to show
|
||||
the total physical swap capacity of the machine.
|
||||
See [this page](/docs/reference/instrumentation/node-metrics/) for more info.
|
||||
|
||||
For example, these metrics are exposed at `/metrics/resource` (a query example follows this list):
|
||||
- `node_swap_usage_bytes`: Current swap usage of the node in bytes.
|
||||
- `container_swap_usage_bytes`: Current amount of the container swap usage in bytes.
|
||||
- `container_swap_limit_bytes`: Current amount of the container swap limit in bytes.
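
For example, you can read these metrics through the API server's node proxy,
assuming your credentials allow access to the node proxy subresource; the node
name is a placeholder.

```shell
# Fetch the kubelet resource metrics for a node via the API server proxy and
# filter for the swap metrics listed above.
kubectl get --raw "/api/v1/nodes/<node-name>/proxy/metrics/resource" | grep swap
```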
|
||||
|
||||
### Using `kubectl top --show-swap`
|
||||
|
||||
Querying metrics is valuable, but somewhat cumbersome, as these metrics
|
||||
are designed to be used by software rather than humans.
|
||||
In order to consume this data in a more user-friendly way,
|
||||
the `kubectl top` command has been extended to support swap metrics, using the `--show-swap` flag.
|
||||
|
||||
In order to receive information about swap usage on nodes, `kubectl top nodes --show-swap` can be used:
|
||||
```shell
|
||||
kubectl top nodes --show-swap
|
||||
```
|
||||
|
||||
This will result in an output similar to:
|
||||
```
|
||||
NAME CPU(cores) CPU(%) MEMORY(bytes) MEMORY(%) SWAP(bytes) SWAP(%)
|
||||
node1 1m 10% 2Mi 10% 1Mi 0%
|
||||
node2 5m 10% 6Mi 10% 2Mi 0%
|
||||
node3 3m 10% 4Mi 10% <unknown> <unknown>
|
||||
```
|
||||
|
||||
In order to receive information about swap usage by Pods, `kubectl top pod --show-swap` can be used:
|
||||
```shell
|
||||
kubectl top pod -n kube-system --show-swap
|
||||
```
|
||||
|
||||
This will result in an output similar to:
|
||||
```
|
||||
NAME CPU(cores) MEMORY(bytes) SWAP(bytes)
|
||||
coredns-58d5bc5cdb-5nbk4 2m 19Mi 0Mi
|
||||
coredns-58d5bc5cdb-jsh26 3m 37Mi 0Mi
|
||||
etcd-node01 51m 143Mi 5Mi
|
||||
kube-apiserver-node01 98m 824Mi 16Mi
|
||||
kube-controller-manager-node01 20m 135Mi 9Mi
|
||||
kube-proxy-ffgs2 1m 24Mi 0Mi
|
||||
kube-proxy-fhvwx 1m 39Mi 0Mi
|
||||
kube-scheduler-node01 13m 69Mi 0Mi
|
||||
metrics-server-8598789fdb-d2kcj 5m 26Mi 0Mi
|
||||
```
|
||||
|
||||
### Nodes to report swap capacity as part of node status
|
||||
|
||||
A new node status field is now added, `node.status.nodeInfo.swap.capacity`, to report the swap capacity of a node.
|
||||
|
||||
As an example, the following command can be used to retrieve the swap capacity of the nodes in a cluster:
|
||||
```shell
|
||||
kubectl get nodes -o go-template='{{range .items}}{{.metadata.name}}: {{if .status.nodeInfo.swap.capacity}}{{.status.nodeInfo.swap.capacity}}{{else}}<unknown>{{end}}{{"\n"}}{{end}}'
|
||||
```
|
||||
|
||||
This will result in an output similar to:
|
||||
```
|
||||
node1: 21474836480
|
||||
node2: 42949664768
|
||||
node3: <unknown>
|
||||
```
|
||||
|
||||
{{< note >}}
|
||||
|
||||
The `<unknown>` value indicates that the `.status.nodeInfo.swap.capacity` field is not set for that Node.
|
||||
This probably means that the node does not have swap provisioned, or less likely,
|
||||
that the kubelet is not able to determine the swap capacity of the node.
|
||||
|
||||
{{< /note >}}
|
||||
|
||||
### Swap discovery using Node Feature Discovery (NFD) {#node-feature-discovery}
|
||||
|
||||
[Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery)
|
||||
is a Kubernetes addon for detecting hardware features and configuration.
|
||||
It can be utilized to discover which nodes are provisioned with swap.
|
||||
|
||||
As an example, to figure out which nodes are provisioned with swap,
|
||||
use the following command:
|
||||
```shell
|
||||
kubectl get nodes -o jsonpath='{range .items[?(@.metadata.labels.feature\.node\.kubernetes\.io/memory-swap)]}{.metadata.name}{"\t"}{.metadata.labels.feature\.node\.kubernetes\.io/memory-swap}{"\n"}{end}'
|
||||
```
|
||||
|
||||
This will result in an output similar to:
|
||||
```
|
||||
k8s-worker1: true
|
||||
k8s-worker2: true
|
||||
k8s-worker3: false
|
||||
```
|
||||
|
||||
In this example, swap is provisioned on nodes `k8s-worker1` and `k8s-worker2`, but not on `k8s-worker3`.
|
||||
|
||||
## Risks and caveats
|
||||
|
||||
{{< caution >}}
|
||||
|
||||
It is strongly encouraged to encrypt the swap space.
See the [memory-backed volumes](#memory-backed-volumes) section for more info.
|
||||
|
||||
{{< /caution >}}
|
||||
|
||||
Having swap available on a system reduces predictability.
|
||||
While swap can enhance performance by making more RAM available, swapping data
|
||||
back to memory is a heavy operation, sometimes slower by many orders of magnitude,
|
||||
which can cause unexpected performance regressions.
|
||||
Furthermore, swap changes a system's behaviour under memory pressure.
|
||||
Enabling swap increases the risk of noisy neighbors,
|
||||
where Pods that frequently use their RAM may cause other Pods to swap.
|
||||
In addition, since swap allows for greater memory usage for workloads in Kubernetes that cannot be predictably accounted for,
|
||||
and due to unexpected packing configurations,
|
||||
the scheduler currently does not account for swap memory usage.
|
||||
This heightens the risk of noisy neighbors.
|
||||
|
||||
The performance of a node with swap memory enabled depends on the underlying physical storage.
|
||||
When swap memory is in use, performance will be significantly worse in an I/O
|
||||
operations per second (IOPS) constrained environment, such as a cloud VM with
|
||||
I/O throttling, when compared to faster storage mediums like solid-state drives
|
||||
or NVMe.
|
||||
As swap might cause I/O pressure, it is recommended to give a higher I/O
latency priority to system-critical daemons. See the relevant part of the
[good practices](#good-practice-for-using-swap-in-a-kubernetes-cluster) section below.
|
||||
|
||||
### Memory-backed volumes
|
||||
|
||||
On Linux nodes, memory-backed volumes (such as [`secret`](/docs/concepts/configuration/secret/)
|
||||
volume mounts, or [`emptyDir`](/docs/concepts/storage/volumes/#emptydir) with `medium: Memory`)
|
||||
are implemented with a `tmpfs` filesystem.
|
||||
The contents of such volumes should remain in memory at all times, hence should
|
||||
not be swapped to disk.
|
||||
To ensure the contents of such volumes remain in memory, the `noswap` tmpfs option
|
||||
is being used.
|
||||
|
||||
The Linux kernel officially supports the `noswap` option from version 6.3 (more info
|
||||
can be found in [Linux Kernel Version Requirements](/docs/reference/node/kernel-version-requirements/#requirements-other)).
|
||||
However, the different distributions often choose to backport this mount option to older
|
||||
Linux versions as well.
|
||||
|
||||
In order to verify whether the node supports the `noswap` option, the kubelet will do the following:
|
||||
* If the kernel's version is above 6.3 then the `noswap` option will be assumed to be supported.
|
||||
* Otherwise, kubelet would try to mount a dummy tmpfs with the `noswap` option at startup.
|
||||
If the kubelet fails with an error indicating an unknown option, `noswap` will
be assumed not to be supported and hence will not be used.
A kubelet log entry will be emitted to warn the user that memory-backed volumes might swap to disk.
|
||||
If kubelet succeeds, the dummy tmpfs will be deleted and the `noswap` option will be used.
|
||||
* If the `noswap` option is not supported, kubelet will emit a warning log entry,
|
||||
then continue its execution.
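As a rough manual check (a sketch only; the mount point below is arbitrary and this is not
something the kubelet requires you to run), you can try the same dummy mount yourself:

```shell
# Print the kernel version, then attempt a throwaway tmpfs mount with noswap.
uname -r
mkdir -p /tmp/noswap-check
if mount -t tmpfs -o noswap tmpfs /tmp/noswap-check; then
  echo "noswap is supported"
  umount /tmp/noswap-check
else
  echo "noswap is not supported on this kernel"
fi
rmdir /tmp/noswap-check
```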
|
||||
|
||||
See the [section above](#setting-up-encrypted-swap) for an example of setting up encrypted swap.
|
||||
However, handling encrypted swap is not within the scope of kubelet;
|
||||
rather, it is a general OS configuration concern and should be addressed at that level.
|
||||
It is the administrator's responsibility to provision encrypted swap to mitigate this risk.
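As one possible OS-level approach (a sketch only; the device name is an example and the exact
mechanism varies by distribution), swap can be encrypted with a fresh random key on every boot
using dm-crypt via `/etc/crypttab`:

```shell
# Assumes /dev/sdb1 is a dedicated partition reserved for swap.
# crypttab: map it as 'cryptswap' with a random key regenerated at each boot.
echo 'cryptswap /dev/sdb1 /dev/urandom swap,cipher=aes-xts-plain64,size=512' >> /etc/crypttab

# fstab: use the encrypted mapping as swap space.
echo '/dev/mapper/cryptswap none swap sw 0 0' >> /etc/fstab
```

Because the key is discarded at shutdown, anything swapped out is unrecoverable after a reboot.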
|
||||
|
||||
### Evictions
|
||||
|
||||
Configuring memory eviction thresholds for swap-enabled nodes can be tricky.
|
||||
|
||||
With swap disabled, it is reasonable to configure the kubelet's eviction thresholds
to be a bit lower than the node's memory capacity.
The rationale is that we want Kubernetes to start evicting Pods before the node runs out of memory
and invokes the Out Of Memory (OOM) killer, since the OOM killer is not Kubernetes-aware
and therefore does not consider things like QoS, pod priority, or other Kubernetes-specific factors.
|
||||
|
||||
With swap enabled, the situation is more complex.
|
||||
In Linux, the `vm.min_free_kbytes` parameter defines the memory threshold for the kernel
|
||||
to start aggressively reclaiming memory, which includes swapping out pages.
|
||||
If the kubelet's eviction thresholds are set in a way that eviction would take place
|
||||
before the kernel starts reclaiming memory, it could lead to workloads never
|
||||
being able to swap out during node memory pressure.
|
||||
However, if the eviction thresholds are set so that eviction takes place too late,
the node could run out of memory and invoke the OOM killer, which is not ideal either.
|
||||
|
||||
To address this, it is recommended to set the kubelet's eviction thresholds
|
||||
to be slightly lower than the `vm.min_free_kbytes` value.
|
||||
This way, the node can start swapping before kubelet would start evicting Pods,
|
||||
allowing workloads to swap out unused data and preventing evictions from happening.
|
||||
On the other hand, since it is just slightly lower, kubelet is likely to start evicting Pods
|
||||
before the node runs out of memory, thus avoiding the OOM killer.
|
||||
|
||||
The value of `vm.min_free_kbytes` can be determined by running the following command on the node:
|
||||
```shell
|
||||
cat /proc/sys/vm/min_free_kbytes
|
||||
```
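As an illustrative sketch (the numbers are hypothetical; substitute the value you read from your
node), if `vm.min_free_kbytes` is 67584 KiB (66 MiB), a matching kubelet configuration could set
the hard memory eviction threshold just below it:

```yaml
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
evictionHard:
  # Slightly below vm.min_free_kbytes (66 MiB in this hypothetical example),
  # so the kernel starts reclaiming and swapping before the kubelet evicts.
  memory.available: "64Mi"
```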
|
||||
|
||||
### Unutilized swap space
|
||||
|
||||
Under the `LimitedSwap` behavior, the amount of swap available to a Pod is determined automatically,
|
||||
based on the proportion of the memory requested relative to the node's total memory
|
||||
(For more details, see the [section below](#how-is-the-swap-limit-being-determined-with-limitedswap)).
|
||||
|
||||
This design means that some portion of the swap space will usually remain
off-limits to Kubernetes workloads.
|
||||
For example, since Guaranteed QoS pods are currently not permitted to use swap,
|
||||
the amount of swap that's proportional to the memory request will remain unused
|
||||
by Kubernetes workloads.
|
||||
|
||||
This behavior carries some risk of swap space going unused in situations where many pods are not eligible for swapping.
|
||||
On the other hand, it effectively keeps some system-reserved amount of swap memory that can be used by processes
|
||||
outside of Kubernetes' scope, such as system daemons and even kubelet itself.
|
||||
|
||||
## Good practice for using swap in a Kubernetes cluster
|
||||
|
||||
### Disable swap for system-critical daemons
|
||||
|
||||
During the testing phase and based on user feedback, it was observed that the performance
of system-critical daemons and services might degrade when swap is in use.
In other words, system daemons, including the kubelet, could operate slower than usual.
If you encounter this issue, it is advisable to configure the cgroup of the system slice
to prevent swapping (that is, set `memory.swap.max=0`); one way to do this is sketched below.
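On a systemd node with cgroup v2, a minimal sketch of this configuration (assuming the kubelet
and container runtime run under `system.slice`, which is the usual default) is:

```shell
# Prevent everything under system.slice (kubelet, container runtime, sshd, ...)
# from swapping; systemd writes memory.swap.max=0 into the slice's cgroup.
systemctl set-property system.slice MemorySwapMax=0
```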
|
||||
|
||||
### Protect system-critical daemons for I/O latency
|
||||
|
||||
Swap can increase the I/O load on a node.
|
||||
When memory pressure causes the kernel to rapidly swap pages in and out,
|
||||
system-critical daemons and services that rely on I/O operations may
|
||||
experience performance degradation.
|
||||
|
||||
To mitigate this, it is recommended for systemd users to prioritize the system slice in terms of I/O latency.
|
||||
For non-systemd users,
|
||||
setting up a dedicated cgroup for system daemons and processes and prioritizing I/O latency in the same way is advised.
|
||||
This can be achieved by setting `io.latency` for the system slice,
|
||||
thereby granting it higher I/O priority.
|
||||
See [cgroup's documentation](https://www.kernel.org/doc/Documentation/admin-guide/cgroup-v2.rst) for more info.
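On systemd, for example, this could be a drop-in that gives `system.slice` a per-device latency
target, which systemd translates into the cgroup v2 `io.latency` setting (a sketch; the device
path and the 50ms target are assumptions to adapt to your hardware, and a daemon reload or reboot
may be needed for it to take effect):

```shell
# Drop-in for system.slice: target ~50ms average I/O latency on /dev/sda,
# which systemd programs into the cgroup v2 io.latency controller.
mkdir -p /etc/systemd/system/system.slice.d
cat <<'EOF' >/etc/systemd/system/system.slice.d/99-io-latency.conf
[Slice]
IODeviceLatencyTargetSec=/dev/sda 50ms
EOF
systemctl daemon-reload
```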
|
||||
|
||||
### Swap and control plane nodes
|
||||
|
||||
The Kubernetes project recommends running control plane nodes without any swap space configured.
|
||||
The control plane primarily hosts Guaranteed QoS Pods, so swap can generally be disabled.
|
||||
The main concern is that swapping critical services on the control plane could negatively impact performance.
|
||||
|
||||
### Use of a dedicated disk for swap
|
||||
|
||||
The Kubernetes project recommends using encrypted swap whenever you run nodes with swap enabled.
|
||||
If swap resides on a partition or the root filesystem, workloads may interfere
|
||||
with system processes that need to write to disk.
|
||||
When they share the same disk, processes can overwhelm swap,
|
||||
disrupting the I/O of kubelet, container runtime, and systemd, which would impact other workloads.
|
||||
Since swap space is located on a disk, it is crucial to ensure the disk is fast enough for the intended use cases.
|
||||
Alternatively, one can configure I/O priorities between different mapped areas of a single backing device.
|
||||
|
||||
### Swap-aware scheduling
|
||||
|
||||
Kubernetes {{< skew currentVersion >}} does not support allocating Pods to nodes in a way that accounts
|
||||
for swap memory usage. The scheduler typically uses _requests_ for infrastructure resources
|
||||
to guide Pod placement, and Pods do not request swap space; they just request `memory`.
|
||||
This means that the scheduler does not consider swap memory when making scheduling decisions.
|
||||
While this is something we are actively working on, it is not yet implemented.
|
||||
|
||||
To ensure that Pods are not scheduled on nodes with swap memory unless they are
specifically intended to use it, administrators can taint the nodes on which swap is available.
Taints ensure that workloads which do not tolerate swap are not scheduled onto swap-enabled nodes.
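As a sketch (the taint key `example.com/swap` is an arbitrary example, not a name defined by
Kubernetes), a swap-enabled node could be tainted so that only workloads carrying a matching
toleration land on it:

```shell
# Repel Pods that do not explicitly tolerate swap from this swap-enabled node.
kubectl taint nodes k8s-worker1 example.com/swap=enabled:NoSchedule
```

Workloads that are meant to use swap would then add a toleration for that taint
(and, if needed, node affinity for swap-enabled nodes) to their Pod spec.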
|
||||
|
||||
### Selecting storage for optimal performance
|
||||
|
||||
The storage device designated for swap space is critical to maintaining system responsiveness
|
||||
during high memory usage.
|
||||
Rotational hard disk drives (HDDs) are ill-suited for this task as their mechanical nature introduces significant latency,
|
||||
leading to severe performance degradation and system thrashing.
|
||||
For modern performance needs, a device such as a Solid State Drive (SSD) is probably the appropriate choice for swap,
|
||||
as its low-latency electronic access minimizes the slowdown.
|
||||
|
||||
|
||||
## Swap behavior details
|
||||
|
||||
### How is the swap limit being determined with LimitedSwap?
|
||||
|
||||
The configuration of swap memory, including its limitations, presents a significant
|
||||
challenge. Not only is it prone to misconfiguration, but as a system-level property, any
|
||||
misconfiguration could potentially compromise the entire node rather than just a specific
|
||||
workload. To mitigate this risk and ensure the health of the node, we implemented
swap support with automatically configured limits.
|
||||
|
||||
With `LimitedSwap`, Pods that do not fall under the Burstable QoS classification (i.e.
|
||||
`BestEffort`/`Guaranteed` QoS Pods) are prohibited from utilizing swap memory.
|
||||
`BestEffort` QoS Pods exhibit unpredictable memory consumption patterns and lack
|
||||
information regarding their memory usage, making it difficult to determine a safe
|
||||
allocation of swap memory.
|
||||
Conversely, `Guaranteed` QoS Pods are typically employed for applications that rely on the
|
||||
precise allocation of resources specified by the workload, with memory being immediately available.
|
||||
To maintain the aforementioned security and node health guarantees,
|
||||
these Pods are not permitted to use swap memory when `LimitedSwap` is in effect.
|
||||
In addition, high-priority pods are not permitted to use swap, in order to ensure that the memory
they consume always stays resident in RAM and is hence immediately ready to use.
|
||||
|
||||
Prior to detailing the calculation of the swap limit, it is necessary to define the following terms:
|
||||
* `nodeTotalMemory`: The total amount of physical memory available on the node.
|
||||
* `totalPodsSwapAvailable`: The total amount of swap memory on the node that is available for use by Pods (some swap memory may be reserved for system use).
|
||||
* `containerMemoryRequest`: The container's memory request.
|
||||
|
||||
Swap limitation is configured as:
|
||||
( `containerMemoryRequest` / `nodeTotalMemory` ) × `totalPodsSwapAvailable`
|
||||
|
||||
In other words, the amount of swap that a container is able to use is proportionate to its
|
||||
memory request, the node's total physical memory and the total amount of swap memory on
|
||||
the node that is available for use by Pods.
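For illustration (the numbers are hypothetical): on a node with 64 GiB of physical memory and
16 GiB of swap available for Pods, a Burstable QoS container requesting 8 GiB of memory would
receive a swap limit of (8 GiB / 64 GiB) × 16 GiB = 2 GiB.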
|
||||
|
||||
It is important to note that, for containers within Burstable QoS Pods, it is possible to
opt out of swap usage by specifying memory requests that are equal to memory limits.
|
||||
Containers configured in this manner will not have access to swap memory.
|
||||
|
||||
|
||||
## {{% heading "whatsnext" %}}
|
||||
|
||||
- You can check out a [blog post about Kubernetes and swap](/blog/2025/03/25/swap-linux-improvements/)
|
||||
- For more information, please see the original KEP, [KEP-2400](https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2400-node-swap),
|
||||
and its [design](https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md).
|
|
@ -41,13 +41,14 @@ receivers:
|
|||
grpc:
|
||||
exporters:
|
||||
# Replace this exporter with the exporter for your backend
|
||||
logging:
|
||||
logLevel: debug
|
||||
exporters:
|
||||
debug:
|
||||
verbosity: detailed
|
||||
service:
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
exporters: [logging]
|
||||
exporters: [debug]
|
||||
```
|
||||
|
||||
To directly emit traces to a backend without utilizing a collector,
|
||||
|
|
|
@ -12,7 +12,7 @@ hide_summary: true # Listed separately in section index
|
|||
|
||||
A container image represents binary data that encapsulates an application and all its
|
||||
software dependencies. Container images are executable software bundles that can run
|
||||
standalone and that make very well defined assumptions about their runtime environment.
|
||||
standalone and that make very well-defined assumptions about their runtime environment.
|
||||
|
||||
You typically create a container image of your application and push it to a registry
|
||||
before referring to it in a {{< glossary_tooltip text="Pod" term_id="pod" >}}.
|
||||
|
@ -34,7 +34,7 @@ Images can also include a registry hostname; for example: `fictional.registry.ex
|
|||
and possibly a port number as well; for example: `fictional.registry.example:10443/imagename`.
|
||||
|
||||
If you don't specify a registry hostname, Kubernetes assumes that you mean the [Docker public registry](https://hub.docker.com/).
|
||||
You can change this behaviour by setting default image registry in
|
||||
You can change this behavior by setting a default image registry in the
|
||||
[container runtime](/docs/setup/production-environment/container-runtimes/) configuration.
|
||||
|
||||
After the image name part you can add a _tag_ or _digest_ (in the same way you would when using with commands
|
||||
|
@ -43,40 +43,45 @@ Digests are a unique identifier for a specific version of an image. Digests are
|
|||
and are immutable. Tags can be moved to point to different images, but digests are fixed.
|
||||
|
||||
Image tags consist of lowercase and uppercase letters, digits, underscores (`_`),
|
||||
periods (`.`), and dashes (`-`). It can be up to 128 characters long. And must follow the
|
||||
next regex pattern: `[a-zA-Z0-9_][a-zA-Z0-9._-]{0,127}`
|
||||
You can read more about and find validation regex in the
|
||||
periods (`.`), and dashes (`-`). A tag can be up to 128 characters long, and must
|
||||
conform to the following regex pattern: `[a-zA-Z0-9_][a-zA-Z0-9._-]{0,127}`.
|
||||
You can read more about it and find the validation regex in the
|
||||
[OCI Distribution Specification](https://github.com/opencontainers/distribution-spec/blob/master/spec.md#workflow-categories).
|
||||
If you don't specify a tag, Kubernetes assumes you mean the tag `latest`.
|
||||
|
||||
Image digests consists of a hash algorithm (such as `sha256`) and a hash value. For example:
|
||||
`sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07`
|
||||
You can find more information about digests format in the
|
||||
`sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07`.
|
||||
You can find more information about the digest format in the
|
||||
[OCI Image Specification](https://github.com/opencontainers/image-spec/blob/master/descriptor.md#digests).
|
||||
|
||||
Some image name examples that Kubernetes can use are:
|
||||
|
||||
- `busybox` - Image name only, no tag or digest. Kubernetes will use Docker public registry and latest tag. (Same as `docker.io/library/busybox:latest`)
|
||||
- `busybox:1.32.0` - Image name with tag. Kubernetes will use Docker public registry. (Same as `docker.io/library/busybox:1.32.0`)
|
||||
- `registry.k8s.io/pause:latest` - Image name with a custom registry and latest tag.
|
||||
- `registry.k8s.io/pause:3.5` - Image name with a custom registry and non-latest tag.
|
||||
- `registry.k8s.io/pause@sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07` - Image name with digest.
|
||||
- `registry.k8s.io/pause:3.5@sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07` - Image name with tag and digest. Only digest will be used for pulling.
|
||||
- `busybox` — Image name only, no tag or digest. Kubernetes will use the Docker
|
||||
public registry and latest tag. Equivalent to `docker.io/library/busybox:latest`.
|
||||
- `busybox:1.32.0` — Image name with tag. Kubernetes will use the Docker
|
||||
public registry. Equivalent to `docker.io/library/busybox:1.32.0`.
|
||||
- `registry.k8s.io/pause:latest` — Image name with a custom registry and latest tag.
|
||||
- `registry.k8s.io/pause:3.5` — Image name with a custom registry and non-latest tag.
|
||||
- `registry.k8s.io/pause@sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07` —
|
||||
Image name with digest.
|
||||
- `registry.k8s.io/pause:3.5@sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07` —
|
||||
Image name with tag and digest. Only the digest will be used for pulling.
|
||||
|
||||
## Updating images
|
||||
|
||||
When you first create a {{< glossary_tooltip text="Deployment" term_id="deployment" >}},
|
||||
{{< glossary_tooltip text="StatefulSet" term_id="statefulset" >}}, Pod, or other
|
||||
object that includes a Pod template, then by default the pull policy of all
|
||||
containers in that pod will be set to `IfNotPresent` if it is not explicitly
|
||||
specified. This policy causes the
|
||||
object that includes a PodTemplate, and a pull policy was not explicitly specified,
|
||||
then by default the pull policy of all containers in that Pod will be set to
|
||||
`IfNotPresent`. This policy causes the
|
||||
{{< glossary_tooltip text="kubelet" term_id="kubelet" >}} to skip pulling an
|
||||
image if it already exists.
|
||||
|
||||
### Image pull policy
|
||||
|
||||
The `imagePullPolicy` for a container and the tag of the image affect when the
|
||||
[kubelet](/docs/reference/command-line-tools-reference/kubelet/) attempts to pull (download) the specified image.
|
||||
The `imagePullPolicy` for a container and the tag of the image both affect _when_ the
|
||||
[kubelet](/docs/reference/command-line-tools-reference/kubelet/) attempts to pull
|
||||
(download) the specified image.
|
||||
|
||||
Here's a list of the values you can set for `imagePullPolicy` and the effects
|
||||
these values have:
|
||||
|
@ -119,12 +124,12 @@ When using image tags, if the image registry were to change the code that the ta
|
|||
represents, you might end up with a mix of Pods running the old and new code. An image digest
|
||||
uniquely identifies a specific version of the image, so Kubernetes runs the same code every time
|
||||
it starts a container with that image name and digest specified. Specifying an image by digest
|
||||
fixes the code that you run so that a change at the registry cannot lead to that mix of versions.
|
||||
pins the code that you run so that a change at the registry cannot lead to that mix of versions.
|
||||
|
||||
There are third-party [admission controllers](/docs/reference/access-authn-authz/admission-controllers/)
|
||||
that mutate Pods (and pod templates) when they are created, so that the
|
||||
that mutate Pods (and PodTemplates) when they are created, so that the
|
||||
running workload is defined based on an image digest rather than a tag.
|
||||
That might be useful if you want to make sure that all your workload is
|
||||
That might be useful if you want to make sure that your entire workload is
|
||||
running the same code no matter what tag changes happen at the registry.
|
||||
|
||||
#### Default image pull policy {#imagepullpolicy-defaulting}
|
||||
|
@ -135,11 +140,11 @@ When you (or a controller) submit a new Pod to the API server, your cluster sets
|
|||
- if you omit the `imagePullPolicy` field, and you specify the digest for the
|
||||
container image, the `imagePullPolicy` is automatically set to `IfNotPresent`.
|
||||
- if you omit the `imagePullPolicy` field, and the tag for the container image is
|
||||
`:latest`, `imagePullPolicy` is automatically set to `Always`;
|
||||
`:latest`, `imagePullPolicy` is automatically set to `Always`.
|
||||
- if you omit the `imagePullPolicy` field, and you don't specify the tag for the
|
||||
container image, `imagePullPolicy` is automatically set to `Always`;
|
||||
- if you omit the `imagePullPolicy` field, and you specify the tag for the
|
||||
container image that isn't `:latest`, the `imagePullPolicy` is automatically set to
|
||||
container image, `imagePullPolicy` is automatically set to `Always`.
|
||||
- if you omit the `imagePullPolicy` field, and you specify a tag for the container
|
||||
image that isn't `:latest`, the `imagePullPolicy` is automatically set to
|
||||
`IfNotPresent`.
|
||||
|
||||
{{< note >}}
|
||||
|
@ -184,16 +189,18 @@ which is 300 seconds (5 minutes).
|
|||
Kubernetes includes alpha support for performing image pulls based on the RuntimeClass of a Pod.
|
||||
|
||||
If you enable the `RuntimeClassInImageCriApi` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/),
|
||||
the kubelet references container images by a tuple of (image name, runtime handler) rather than just the
|
||||
image name or digest. Your {{< glossary_tooltip text="container runtime" term_id="container-runtime" >}}
|
||||
the kubelet references container images by a tuple of image name and runtime handler
|
||||
rather than just the image name or digest. Your
|
||||
{{< glossary_tooltip text="container runtime" term_id="container-runtime" >}}
|
||||
may adapt its behavior based on the selected runtime handler.
|
||||
Pulling images based on runtime class will be helpful for VM based containers like windows hyperV containers.
|
||||
Pulling images based on runtime class is useful for VM-based containers, such as
|
||||
Windows Hyper-V containers.
|
||||
|
||||
## Serial and parallel image pulls
|
||||
|
||||
By default, kubelet pulls images serially. In other words, kubelet sends only
|
||||
one image pull request to the image service at a time. Other image pull requests
|
||||
have to wait until the one being processed is complete.
|
||||
By default, the kubelet pulls images serially. In other words, the kubelet sends
|
||||
only one image pull request to the image service at a time. Other image pull
|
||||
requests have to wait until the one being processed is complete.
|
||||
|
||||
Nodes make image pull decisions in isolation. Even when you use serialized image
|
||||
pulls, two different nodes can pull the same image in parallel.
|
||||
|
@ -203,72 +210,91 @@ If you would like to enable parallel image pulls, you can set the field
|
|||
With `serializeImagePulls` set to false, image pull requests will be sent to the image service immediately,
|
||||
and multiple images will be pulled at the same time.
|
||||
|
||||
When enabling parallel image pulls, please make sure the image service of your
|
||||
container runtime can handle parallel image pulls.
|
||||
When enabling parallel image pulls, ensure that the image service of your container
|
||||
runtime can handle parallel image pulls.
|
||||
|
||||
The kubelet never pulls multiple images in parallel on behalf of one Pod. For example,
|
||||
if you have a Pod that has an init container and an application container, the image
|
||||
pulls for the two containers will not be parallelized. However, if you have two
|
||||
Pods that use different images, the kubelet pulls the images in parallel on
|
||||
behalf of the two different Pods, when parallel image pulls is enabled.
|
||||
Pods that use different images, and the parallel image pull feature is enabled,
|
||||
the kubelet will pull the images in parallel on behalf of the two different Pods.
|
||||
|
||||
### Maximum parallel image pulls
|
||||
|
||||
{{< feature-state for_k8s_version="v1.32" state="beta" >}}
|
||||
|
||||
When `serializeImagePulls` is set to false, the kubelet defaults to no limit on the
|
||||
maximum number of images being pulled at the same time. If you would like to
|
||||
When `serializeImagePulls` is set to false, the kubelet defaults to no limit on
|
||||
the maximum number of images being pulled at the same time. If you would like to
|
||||
limit the number of parallel image pulls, you can set the field `maxParallelImagePulls`
|
||||
in kubelet configuration. With `maxParallelImagePulls` set to _n_, only _n_ images
|
||||
can be pulled at the same time, and any image pull beyond _n_ will have to wait
|
||||
until at least one ongoing image pull is complete.
|
||||
in the kubelet configuration. With `maxParallelImagePulls` set to _n_, only _n_
|
||||
images can be pulled at the same time, and any image pull beyond _n_ will have to
|
||||
wait until at least one ongoing image pull is complete.
|
||||
|
||||
Limiting the number parallel image pulls would prevent image pulling from consuming
|
||||
Limiting the number of parallel image pulls prevents image pulling from consuming
|
||||
too much network bandwidth or disk I/O, when parallel image pulling is enabled.
|
||||
|
||||
You can set `maxParallelImagePulls` to a positive number that is greater than or
|
||||
equal to 1. If you set `maxParallelImagePulls` to be greater than or equal to 2, you
|
||||
must set the `serializeImagePulls` to false. The kubelet will fail to start with invalid
|
||||
`maxParallelImagePulls` settings.
|
||||
equal to 1. If you set `maxParallelImagePulls` to be greater than or equal to 2,
|
||||
you must set `serializeImagePulls` to false. The kubelet will fail to start
|
||||
with an invalid `maxParallelImagePulls` setting.
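As a sketch of how these two fields fit together in the kubelet configuration file
(the limit of 5 is an arbitrary example):

```yaml
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
# Allow parallel pulls, but cap them to limit network bandwidth and disk I/O usage.
serializeImagePulls: false
maxParallelImagePulls: 5
```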
|
||||
|
||||
## Multi-architecture images with image indexes
|
||||
|
||||
As well as providing binary images, a container registry can also serve a
|
||||
[container image index](https://github.com/opencontainers/image-spec/blob/master/image-index.md).
|
||||
An image index can point to multiple [image manifests](https://github.com/opencontainers/image-spec/blob/master/manifest.md)
|
||||
for architecture-specific versions of a container. The idea is that you can have a name for an image
|
||||
(for example: `pause`, `example/mycontainer`, `kube-apiserver`) and allow different systems to
|
||||
fetch the right binary image for the machine architecture they are using.
|
||||
for architecture-specific versions of a container. The idea is that you can have
|
||||
a name for an image (for example: `pause`, `example/mycontainer`, `kube-apiserver`)
|
||||
and allow different systems to fetch the right binary image for the machine
|
||||
architecture they are using.
|
||||
|
||||
Kubernetes itself typically names container images with a suffix `-$(ARCH)`. For backward
|
||||
compatibility, please generate the older images with suffixes. The idea is to generate say `pause`
|
||||
image which has the manifest for all the arch(es) and say `pause-amd64` which is backwards
|
||||
compatible for older configurations or YAML files which may have hard coded the images with
|
||||
suffixes.
|
||||
The Kubernetes project typically creates container images for its releases with
|
||||
names that include the suffix `-$(ARCH)`. For backward compatibility, generate
|
||||
older images with suffixes. For instance, an image named as `pause` would be a
|
||||
multi-architecture image containing manifests for all supported architectures,
|
||||
while `pause-amd64` would be a backward-compatible version for older configurations,
|
||||
or for YAML files with hardcoded image names containing suffixes.
|
||||
|
||||
## Using a private registry
|
||||
|
||||
Private registries may require keys to read images from them.
|
||||
Private registries may require authentication to be able to discover and/or pull
|
||||
images from them.
|
||||
Credentials can be provided in several ways:
|
||||
|
||||
- Configuring Nodes to Authenticate to a Private Registry
|
||||
- all pods can read any configured private registries
|
||||
- requires node configuration by cluster administrator
|
||||
- Kubelet Credential Provider to dynamically fetch credentials for private registries
|
||||
- kubelet can be configured to use credential provider exec plugin
|
||||
for the respective private registry.
|
||||
- Pre-pulled Images
|
||||
- all pods can use any images cached on a node
|
||||
- requires root access to all nodes to set up
|
||||
- Specifying ImagePullSecrets on a Pod
|
||||
- only pods which provide their own keys can access the private registry
|
||||
- [Specifying `imagePullSecrets` when you define a Pod](#specifying-imagepullsecrets-on-a-pod)
|
||||
|
||||
Only Pods which provide their own keys can access the private registry.
|
||||
|
||||
- [Configuring Nodes to Authenticate to a Private Registry](#configuring-nodes-to-authenticate-to-a-private-registry)
|
||||
- All Pods can read any configured private registries.
|
||||
- Requires node configuration by cluster administrator.
|
||||
- Using a _kubelet credential provider_ plugin to [dynamically fetch credentials for private registries](#kubelet-credential-provider)
|
||||
|
||||
The kubelet can be configured to use credential provider exec plugin for the
|
||||
respective private registry.
|
||||
|
||||
- [Pre-pulled Images](#pre-pulled-images)
|
||||
- All Pods can use any images cached on a node.
|
||||
- Requires root access to all nodes to set up.
|
||||
- Vendor-specific or local extensions
|
||||
- if you're using a custom node configuration, you (or your cloud
|
||||
provider) can implement your mechanism for authenticating the node
|
||||
to the container registry.
|
||||
|
||||
If you're using a custom node configuration, you (or your cloud provider) can
|
||||
implement your mechanism for authenticating the node to the container registry.
|
||||
|
||||
These options are explained in more detail below.
|
||||
|
||||
### Specifying `imagePullSecrets` on a Pod
|
||||
|
||||
{{< note >}}
|
||||
This is the recommended approach to run containers based on images
|
||||
in private registries.
|
||||
{{< /note >}}
|
||||
|
||||
Kubernetes supports specifying container image registry keys on a Pod.
|
||||
All `imagePullSecrets` must be Secrets that exist in the same
|
||||
{{< glossary_tooltip term_id="namespace" >}} as the
|
||||
Pod. These Secrets must be of type `kubernetes.io/dockercfg` or `kubernetes.io/dockerconfigjson`.
|
||||
|
||||
### Configuring nodes to authenticate to a private registry
|
||||
|
||||
Specific instructions for setting credentials depends on the container runtime and registry you
|
||||
|
@ -280,13 +306,17 @@ task. That example uses a private registry in Docker Hub.
|
|||
|
||||
### Kubelet credential provider for authenticated image pulls {#kubelet-credential-provider}
|
||||
|
||||
{{< note >}}
|
||||
This approach is especially suitable when kubelet needs to fetch registry credentials dynamically.
|
||||
Most commonly used for registries provided by cloud providers where auth tokens are short-lived.
|
||||
{{< /note >}}
|
||||
You can configure the kubelet to invoke a plugin binary to dynamically fetch
|
||||
registry credentials for a container image. This is the most robust and versatile
|
||||
way to fetch credentials for private registries, but also requires kubelet-level
|
||||
configuration to enable.
|
||||
|
||||
You can configure the kubelet to invoke a plugin binary to dynamically fetch registry credentials for a container image.
|
||||
This is the most robust and versatile way to fetch credentials for private registries, but also requires kubelet-level configuration to enable.
|
||||
This technique can be especially useful for running {{< glossary_tooltip term_id="static-pod" text="static Pods" >}}
|
||||
that require container images hosted in a private registry.
|
||||
Using a {{< glossary_tooltip term_id="service-account" >}} or a
|
||||
{{< glossary_tooltip term_id="secret" >}} to provide private registry credentials
|
||||
is not possible in the specification of a static Pod, because it _cannot_
|
||||
have references to other API resources in its specification.
|
||||
|
||||
See [Configure a kubelet image credential provider](/docs/tasks/administer-cluster/kubelet-credential-provider/) for more details.
|
||||
|
||||
|
@ -299,55 +329,57 @@ prefix-matched paths. The only limitation is that glob patterns (`*`) have to
|
|||
include the dot (`.`) for each subdomain. The amount of matched subdomains has
|
||||
to be equal to the amount of glob patterns (`*.`), for example:
|
||||
|
||||
- `*.kubernetes.io` will *not* match `kubernetes.io`, but `abc.kubernetes.io`
|
||||
- `*.*.kubernetes.io` will *not* match `abc.kubernetes.io`, but `abc.def.kubernetes.io`
|
||||
- `prefix.*.io` will match `prefix.kubernetes.io`
|
||||
- `*-good.kubernetes.io` will match `prefix-good.kubernetes.io`
|
||||
- `*.kubernetes.io` will *not* match `kubernetes.io`, but will match
|
||||
`abc.kubernetes.io`.
|
||||
- `*.*.kubernetes.io` will *not* match `abc.kubernetes.io`, but will match
|
||||
`abc.def.kubernetes.io`.
|
||||
- `prefix.*.io` will match `prefix.kubernetes.io`.
|
||||
- `*-good.kubernetes.io` will match `prefix-good.kubernetes.io`.
|
||||
|
||||
This means that a `config.json` like this is valid:
|
||||
|
||||
```json
|
||||
{
|
||||
"auths": {
|
||||
"my-registry.io/images": { "auth": "…" },
|
||||
"*.my-registry.io/images": { "auth": "…" }
|
||||
"my-registry.example/images": { "auth": "…" },
|
||||
"*.my-registry.example/images": { "auth": "…" }
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Image pull operations would now pass the credentials to the CRI container
|
||||
runtime for every valid pattern. For example the following container image names
|
||||
would match successfully:
|
||||
Image pull operations pass the credentials to the CRI container runtime for every
|
||||
valid pattern. For example, the following container image names would match
|
||||
successfully:
|
||||
|
||||
- `my-registry.io/images`
|
||||
- `my-registry.io/images/my-image`
|
||||
- `my-registry.io/images/another-image`
|
||||
- `sub.my-registry.io/images/my-image`
|
||||
- `my-registry.example/images`
|
||||
- `my-registry.example/images/my-image`
|
||||
- `my-registry.example/images/another-image`
|
||||
- `sub.my-registry.example/images/my-image`
|
||||
|
||||
But not:
|
||||
However, these container image names would *not* match:
|
||||
|
||||
- `a.sub.my-registry.io/images/my-image`
|
||||
- `a.b.sub.my-registry.io/images/my-image`
|
||||
- `a.sub.my-registry.example/images/my-image`
|
||||
- `a.b.sub.my-registry.example/images/my-image`
|
||||
|
||||
The kubelet performs image pulls sequentially for every found credential. This
|
||||
means, that multiple entries in `config.json` for different paths are possible, too:
|
||||
means that multiple entries in `config.json` for different paths are possible, too:
|
||||
|
||||
```json
|
||||
{
|
||||
"auths": {
|
||||
"my-registry.io/images": {
|
||||
"my-registry.example/images": {
|
||||
"auth": "…"
|
||||
},
|
||||
"my-registry.io/images/subpath": {
|
||||
"my-registry.example/images/subpath": {
|
||||
"auth": "…"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
If now a container specifies an image `my-registry.io/images/subpath/my-image`
|
||||
to be pulled, then the kubelet will try to download them from both
|
||||
authentication sources if one of them fails.
|
||||
If now a container specifies an image `my-registry.example/images/subpath/my-image`
|
||||
to be pulled, then the kubelet will try to download it using both authentication
|
||||
sources if one of them fails.
|
||||
|
||||
### Pre-pulled images
|
||||
|
||||
|
@ -364,40 +396,34 @@ then a local image is used (preferentially or exclusively, respectively).
|
|||
If you want to rely on pre-pulled images as a substitute for registry authentication,
|
||||
you must ensure all nodes in the cluster have the same pre-pulled images.
|
||||
|
||||
This can be used to preload certain images for speed or as an alternative to authenticating to a
|
||||
private registry.
|
||||
This can be used to preload certain images for speed or as an alternative to
|
||||
authenticating to a private registry.
|
||||
|
||||
Similar to the usage of the [kubelet credential provider](#kubelet-credential-provider),
|
||||
pre-pulled images are also suitable for launching
|
||||
{{< glossary_tooltip text="static Pods" term_id="static-pod" >}} that depend
|
||||
on images hosted in a private registry.
|
||||
|
||||
{{< note >}}
|
||||
{{< feature-state feature_gate_name="KubeletEnsureSecretPulledImages" >}}
|
||||
Access to pre-pulled images may be authorized according to [image pull credential verification](#ensureimagepullcredentialverification)
|
||||
Access to pre-pulled images may be authorized according to [image pull credential verification](#ensureimagepullcredentialverification).
|
||||
{{< /note >}}
|
||||
|
||||
### Specifying imagePullSecrets on a Pod
|
||||
|
||||
{{< note >}}
|
||||
This is the recommended approach to run containers based on images
|
||||
in private registries.
|
||||
{{< /note >}}
|
||||
|
||||
Kubernetes supports specifying container image registry keys on a Pod.
|
||||
`imagePullSecrets` must all be in the same namespace as the Pod. The referenced
|
||||
Secrets must be of type `kubernetes.io/dockercfg` or `kubernetes.io/dockerconfigjson`.
|
||||
|
||||
#### Ensure Image Pull Credential Verification {#ensureimagepullcredentialverification}
|
||||
#### Ensure image pull credential verification {#ensureimagepullcredentialverification}
|
||||
|
||||
{{< feature-state feature_gate_name="KubeletEnsureSecretPulledImages" >}}
|
||||
|
||||
If the `KubeletEnsureSecretPulledImages` feature gate is enabled, Kubernetes will validate
|
||||
image credentials for every image that requires credentials to be pulled,
|
||||
even if that image is already present on the node.
|
||||
This validation ensures that images in a pod request which have not been successfully pulled
|
||||
If the `KubeletEnsureSecretPulledImages` feature gate is enabled for your cluster,
|
||||
Kubernetes will validate image credentials for every image that requires credentials
|
||||
to be pulled, even if that image is already present on the node. This validation
|
||||
ensures that images in a Pod request which have not been successfully pulled
|
||||
with the provided credentials must re-pull the images from the registry.
|
||||
Additionally, image pulls that re-use the same credentials
|
||||
which previously resulted in a successful image pull will not need to re-pull from the registry
|
||||
and are instead validated locally without accessing the registry
|
||||
which previously resulted in a successful image pull will not need to re-pull from
|
||||
the registry and are instead validated locally without accessing the registry
|
||||
(provided the image is available locally).
|
||||
This is controlled by the`imagePullCredentialsVerificationPolicy` field in the
|
||||
[Kubelet configuration](/docs/reference/config-api/kubelet-config.v1beta1#ImagePullCredentialsVerificationPolicy).
|
||||
[Kubelet configuration](/docs/reference/config-api/kubelet-config.v1beta1/#kubelet-config-k8s-io-v1beta1-ImagePullCredentialsVerificationPolicy).
|
||||
|
||||
This configuration controls when image pull credentials must be verified if the
|
||||
image is already present on the node:
|
||||
|
@ -406,13 +432,13 @@ image is already present on the node:
|
|||
If the image is present locally, image pull credentials are not verified.
|
||||
* `NeverVerifyPreloadedImages`: Images pulled outside the kubelet are not verified,
|
||||
but all other images will have their credentials verified. This is the default behavior.
|
||||
* `NeverVerifyAllowListedImages`: Images pulled outside the kubelet and mentioned within the
|
||||
* `NeverVerifyAllowListedImages`: Images pulled outside the kubelet and mentioned within the
|
||||
`preloadedImagesVerificationAllowlist` specified in the kubelet config are not verified.
|
||||
* `AlwaysVerify`: All images will have their credentials verified
|
||||
before they can be used.
|
||||
|
||||
This verification applies to [pre-pulled images](#pre-pulled-images),
|
||||
images pulled using node-wide secrets, and images pulled using pod-level secrets.
|
||||
images pulled using node-wide secrets, and images pulled using Pod-level secrets.
|
||||
|
||||
{{< note >}}
|
||||
In the case of credential rotation, the credentials previously used to pull the image
|
||||
|
@ -424,19 +450,19 @@ will require the image to be re-pulled from the registry.
|
|||
|
||||
You need to know the username, registry password and client email address for authenticating
|
||||
to the registry, as well as its hostname.
|
||||
Run the following command, substituting the appropriate uppercase values:
|
||||
Run the following command, substituting placeholders with the appropriate values:
|
||||
|
||||
```shell
|
||||
kubectl create secret docker-registry <name> \
|
||||
--docker-server=DOCKER_REGISTRY_SERVER \
|
||||
--docker-username=DOCKER_USER \
|
||||
--docker-password=DOCKER_PASSWORD \
|
||||
--docker-email=DOCKER_EMAIL
|
||||
--docker-server=<docker-registry-server> \
|
||||
--docker-username=<docker-user> \
|
||||
--docker-password=<docker-password> \
|
||||
--docker-email=<docker-email>
|
||||
```
|
||||
|
||||
If you already have a Docker credentials file then, rather than using the above
|
||||
command, you can import the credentials file as a Kubernetes
|
||||
{{< glossary_tooltip text="Secrets" term_id="secret" >}}.
|
||||
{{< glossary_tooltip text="Secret" term_id="secret" >}}.
|
||||
[Create a Secret based on existing Docker credentials](/docs/tasks/configure-pod-container/pull-image-private-registry/#registry-secret-existing-credentials)
|
||||
explains how to set this up.
|
||||
|
||||
|
@ -449,11 +475,11 @@ Pods can only reference image pull secrets in their own namespace,
|
|||
so this process needs to be done one time per namespace.
|
||||
{{< /note >}}
|
||||
|
||||
#### Referring to an imagePullSecrets on a Pod
|
||||
#### Referring to `imagePullSecrets` on a Pod
|
||||
|
||||
Now, you can create pods which reference that secret by adding an `imagePullSecrets`
|
||||
Now, you can create pods which reference that secret by adding the `imagePullSecrets`
|
||||
section to a Pod definition. Each item in the `imagePullSecrets` array can only
|
||||
reference a Secret in the same namespace.
|
||||
reference one Secret in the same namespace.
|
||||
|
||||
For example:
|
||||
|
||||
|
@ -478,15 +504,14 @@ resources:
|
|||
EOF
|
||||
```
|
||||
|
||||
This needs to be done for each pod that is using a private registry.
|
||||
This needs to be done for each Pod that is using a private registry.
|
||||
|
||||
However, setting of this field can be automated by setting the imagePullSecrets
|
||||
in a [ServiceAccount](/docs/tasks/configure-pod-container/configure-service-account/) resource.
|
||||
|
||||
Check [Add ImagePullSecrets to a Service Account](/docs/tasks/configure-pod-container/configure-service-account/#add-imagepullsecrets-to-a-service-account)
|
||||
However, you can automate this process by specifying the `imagePullSecrets` section
|
||||
in a [ServiceAccount](/docs/tasks/configure-pod-container/configure-service-account/)
|
||||
resource. See [Add ImagePullSecrets to a Service Account](/docs/tasks/configure-pod-container/configure-service-account/#add-imagepullsecrets-to-a-service-account)
|
||||
for detailed instructions.
|
||||
|
||||
You can use this in conjunction with a per-node `.docker/config.json`. The credentials
|
||||
You can use this in conjunction with a per-node `.docker/config.json`. The credentials
|
||||
will be merged.
|
||||
|
||||
## Use cases
|
||||
|
@ -502,7 +527,7 @@ common use cases and suggested solutions.
|
|||
1. Cluster running some proprietary images which should be hidden to those outside the company, but
|
||||
visible to all cluster users.
|
||||
- Use a hosted private registry
|
||||
- Manual configuration may be required on the nodes that need to access to private registry
|
||||
- Manual configuration may be required on the nodes that need to access the private registry.
|
||||
- Or, run an internal private registry behind your firewall with open read access.
|
||||
- No Kubernetes configuration is required.
|
||||
- Use a hosted container image registry service that controls image access
|
||||
|
@ -511,33 +536,34 @@ common use cases and suggested solutions.
|
|||
1. Cluster with proprietary images, a few of which require stricter access control.
|
||||
- Ensure [AlwaysPullImages admission controller](/docs/reference/access-authn-authz/admission-controllers/#alwayspullimages)
|
||||
is active. Otherwise, all Pods potentially have access to all images.
|
||||
- Move sensitive data into a "Secret" resource, instead of packaging it in an image.
|
||||
- Move sensitive data into a Secret resource, instead of packaging it in an image.
|
||||
1. A multi-tenant cluster where each tenant needs own private registry.
|
||||
- Ensure [AlwaysPullImages admission controller](/docs/reference/access-authn-authz/admission-controllers/#alwayspullimages)
|
||||
is active. Otherwise, all Pods of all tenants potentially have access to all images.
|
||||
- Run a private registry with authorization required.
|
||||
- Generate registry credential for each tenant, put into secret, and populate secret to each
|
||||
tenant namespace.
|
||||
- The tenant adds that secret to imagePullSecrets of each namespace.
|
||||
- Generate registry credentials for each tenant, store into a Secret, and propagate
|
||||
the Secret to every tenant namespace.
|
||||
- The tenant then adds that Secret to `imagePullSecrets` of each namespace.
|
||||
|
||||
If you need access to multiple registries, you can create one secret for each registry.
|
||||
If you need access to multiple registries, you can create one Secret per registry.
|
||||
|
||||
## Legacy built-in kubelet credential provider
|
||||
|
||||
In older versions of Kubernetes, the kubelet had a direct integration with cloud provider credentials.
|
||||
This gave it the ability to dynamically fetch credentials for image registries.
|
||||
In older versions of Kubernetes, the kubelet had a direct integration with cloud
|
||||
provider credentials. This provided the ability to dynamically fetch credentials
|
||||
for image registries.
|
||||
|
||||
There were three built-in implementations of the kubelet credential provider integration:
|
||||
ACR (Azure Container Registry), ECR (Elastic Container Registry), and GCR (Google Container Registry).
|
||||
There were three built-in implementations of the kubelet credential provider
|
||||
integration: ACR (Azure Container Registry), ECR (Elastic Container Registry),
|
||||
and GCR (Google Container Registry).
|
||||
|
||||
For more information on the legacy mechanism, read the documentation for the version of Kubernetes that you
|
||||
are using. Kubernetes v1.26 through to v{{< skew latestVersion >}} do not include the legacy mechanism, so
|
||||
you would need to either:
|
||||
- configure a kubelet image credential provider on each node
|
||||
- specify image pull credentials using `imagePullSecrets` and at least one Secret
|
||||
Starting with version 1.26 of Kubernetes, the legacy mechanism has been removed,
|
||||
so you would need to either:
|
||||
- configure a kubelet image credential provider on each node; or
|
||||
- specify image pull credentials using `imagePullSecrets` and at least one Secret.
|
||||
|
||||
## {{% heading "whatsnext" %}}
|
||||
|
||||
* Read the [OCI Image Manifest Specification](https://github.com/opencontainers/image-spec/blob/master/manifest.md).
|
||||
* Read the [OCI Image Manifest Specification](https://github.com/opencontainers/image-spec/blob/main/manifest.md).
|
||||
* Learn about [container image garbage collection](/docs/concepts/architecture/garbage-collection/#container-image-garbage-collection).
|
||||
* Learn more about [pulling an Image from a Private Registry](/docs/tasks/configure-pod-container/pull-image-private-registry).
|
||||
|
|
|
@ -30,10 +30,10 @@ Here's a brief overview of the main components:
|
|||
Manage the overall state of the cluster:
|
||||
|
||||
[kube-apiserver](/docs/concepts/architecture/#kube-apiserver)
|
||||
: The core component server that exposes the Kubernetes HTTP API
|
||||
: The core component server that exposes the Kubernetes HTTP API.
|
||||
|
||||
[etcd](/docs/concepts/architecture/#etcd)
|
||||
: Consistent and highly-available key value store for all API server data
|
||||
: Consistent and highly-available key value store for all API server data.
|
||||
|
||||
[kube-scheduler](/docs/concepts/architecture/#kube-scheduler)
|
||||
: Looks for Pods not yet bound to a node, and assigns each Pod to a suitable node.
|
||||
|
@ -68,16 +68,16 @@ run [systemd](https://systemd.io/) on a Linux node to supervise local components
|
|||
Addons extend the functionality of Kubernetes. A few important examples include:
|
||||
|
||||
[DNS](/docs/concepts/architecture/#dns)
|
||||
: For cluster-wide DNS resolution
|
||||
: For cluster-wide DNS resolution.
|
||||
|
||||
[Web UI](/docs/concepts/architecture/#web-ui-dashboard) (Dashboard)
|
||||
: For cluster management via a web interface
|
||||
: For cluster management via a web interface.
|
||||
|
||||
[Container Resource Monitoring](/docs/concepts/architecture/#container-resource-monitoring)
|
||||
: For collecting and storing container metrics
|
||||
: For collecting and storing container metrics.
|
||||
|
||||
[Cluster-level Logging](/docs/concepts/architecture/#cluster-level-logging)
|
||||
: For saving container logs to a central log store
|
||||
: For saving container logs to a central log store.
|
||||
|
||||
## Flexibility in Architecture
|
||||
|
||||
|
|
|
@ -55,6 +55,11 @@ object once it is set.
|
|||
* After the deletion is requested, you can not resurrect this object. The only way is to delete it and make a new similar object.
|
||||
{{</note>}}
|
||||
|
||||
{{<note>}}
|
||||
Custom finalizer names **must** be publicly qualified finalizer names, such as `example.com/finalizer-name`.
|
||||
Kubernetes enforces this format; the API server rejects writes to objects where the change does not use qualified finalizer names for any custom finalizer.
|
||||
{{</note>}}
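For illustration, a minimal sketch of an object that carries such a custom finalizer
(the finalizer name is the same placeholder used above):

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: example-config
  finalizers:
    # A publicly qualified custom finalizer name, as required by the API server.
    - example.com/finalizer-name
```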
|
||||
|
||||
## Owner references, labels, and finalizers {#owners-labels-finalizers}
|
||||
|
||||
Like {{<glossary_tooltip text="labels" term_id="label">}},
|
||||
|
|
|
@ -14,36 +14,57 @@ weight: 20
|
|||
When several users or teams share a cluster with a fixed number of nodes,
|
||||
there is a concern that one team could use more than its fair share of resources.
|
||||
|
||||
Resource quotas are a tool for administrators to address this concern.
|
||||
_Resource quotas_ are a tool for administrators to address this concern.
|
||||
|
||||
A resource quota, defined by a ResourceQuota object, provides constraints that limit
|
||||
aggregate resource consumption per {{< glossary_tooltip text="namespace" term_id="namespace" >}}. A ResourceQuota can also
|
||||
limit the [quantity of objects that can be created in a namespace](#object-count-quota) by API kind, as well as the total
|
||||
amount of {{< glossary_tooltip text="infrastructure resources" term_id="infrastructure-resource" >}} that may be consumed by
|
||||
API objects found in that namespace.
|
||||
|
||||
{{< caution >}}
|
||||
Neither contention nor changes to quota will affect already created resources.
|
||||
{{< /caution >}}
|
||||
|
||||
<!-- body -->
|
||||
|
||||
A resource quota, defined by a `ResourceQuota` object, provides constraints that limit
|
||||
aggregate resource consumption per namespace. It can limit the quantity of objects that can
|
||||
be created in a namespace by type, as well as the total amount of compute resources that may
|
||||
be consumed by resources in that namespace.
|
||||
## How Kubernetes ResourceQuotas work
|
||||
|
||||
Resource quotas work like this:
|
||||
ResourceQuotas work like this:
|
||||
|
||||
- Different teams work in different namespaces. This can be enforced with
|
||||
[RBAC](/docs/reference/access-authn-authz/rbac/).
|
||||
- Different teams work in different namespaces. This separation can be enforced with
|
||||
[RBAC](/docs/reference/access-authn-authz/rbac/) or any other [authorization](/docs/reference/access-authn-authz/authorization/)
|
||||
mechanism.
|
||||
|
||||
- The administrator creates one ResourceQuota for each namespace.
|
||||
- A cluster administrator creates at least one ResourceQuota for each namespace.
|
||||
- To make sure the enforcement stays in place, the cluster administrator should also restrict access to delete or update
|
||||
that ResourceQuota; for example, by defining a [ValidatingAdmissionPolicy](/docs/reference/access-authn-authz/validating-admission-policy/).
|
||||
|
||||
- Users create resources (pods, services, etc.) in the namespace, and the quota system
|
||||
tracks usage to ensure it does not exceed hard resource limits defined in a ResourceQuota.
|
||||
|
||||
- If creating or updating a resource violates a quota constraint, the request will fail with HTTP
|
||||
status code `403 FORBIDDEN` with a message explaining the constraint that would have been violated.
|
||||
You can apply a [scope](#quota-scopes) to a ResourceQuota to limit where it applies,
|
||||
|
||||
- If quotas are enabled in a namespace for compute resources like `cpu` and `memory`, users must specify
|
||||
requests or limits for those values; otherwise, the quota system may reject pod creation. Hint: Use
|
||||
the `LimitRanger` admission controller to force defaults for pods that make no compute resource requirements.
|
||||
- If creating or updating a resource violates a quota constraint, the control plane rejects that request with HTTP
|
||||
status code `403 Forbidden`. The error includes a message explaining the constraint that would have been violated.
|
||||
|
||||
See the [walkthrough](/docs/tasks/administer-cluster/manage-resources/quota-memory-cpu-namespace/)
|
||||
for an example of how to avoid this problem.
|
||||
- If quotas are enabled in a namespace for {{< glossary_tooltip text="resource" term_id="infrastructure-resource" >}}
|
||||
such as `cpu` and `memory`, users must specify requests or limits for those values when they define a Pod; otherwise,
|
||||
the quota system may reject pod creation.
|
||||
|
||||
The resource quota [walkthrough](/docs/tasks/administer-cluster/manage-resources/quota-memory-cpu-namespace/)
|
||||
shows an example of how to avoid this problem.
|
||||
|
||||
{{< note >}}
|
||||
* You can define a [LimitRange](/docs/concepts/policy/limit-range/)
|
||||
to force defaults on pods that make no compute resource requirements (so that users don't have to remember to do that).
|
||||
{{< /note >}}
|
||||
|
||||
You often do not create Pods directly; for example, you more usually create a [workload management](/docs/concepts/workloads/controllers/)
|
||||
object such as a {{< glossary_tooltip term_id="deployment" >}}. If you create a Deployment that tries to use more
|
||||
resources than are available, the creation of the Deployment (or other workload management object) **succeeds**, but
|
||||
the Deployment may not be able to get all of the Pods it manages to exist. In that case you can check the status of
|
||||
the Deployment, for example with `kubectl describe`, to see what has happened.
|
||||
|
||||
- For `cpu` and `memory` resources, ResourceQuotas enforce that **every**
|
||||
(new) pod in that namespace sets a limit for that resource.
|
||||
|
@ -59,8 +80,6 @@ Resource quotas work like this:
|
|||
You can use a [LimitRange](/docs/concepts/policy/limit-range/) to automatically set
|
||||
a default request for these resources.
|
||||
|
||||
{{< /note >}}
|
||||
|
||||
The name of a ResourceQuota object must be a valid
|
||||
[DNS subdomain name](/docs/concepts/overview/working-with-objects/names#dns-subdomain-names).
|
||||
|
||||
|
@ -74,7 +93,6 @@ Examples of policies that could be created using namespaces and quotas are:
|
|||
In the case where the total capacity of the cluster is less than the sum of the quotas of the namespaces,
|
||||
there may be contention for resources. This is handled on a first-come-first-served basis.
|
||||
|
||||
Neither contention nor changes to quota will affect already created resources.
|
||||
|
||||
## Enabling Resource Quota
|
||||
|
||||
|
@ -916,8 +934,9 @@ and it is to be created in a namespace other than `kube-system`.
|
|||
|
||||
## {{% heading "whatsnext" %}}
|
||||
|
||||
- See [ResourceQuota design document](https://git.k8s.io/design-proposals-archive/resource-management/admission_control_resource_quota.md)
|
||||
for more information.
|
||||
- See a [detailed example for how to use resource quota](/docs/tasks/administer-cluster/quota-api-object/).
|
||||
- Read [Quota support for priority class design document](https://git.k8s.io/design-proposals-archive/scheduling/pod-priority-resourcequota.md).
|
||||
- See [LimitedResources](https://github.com/kubernetes/kubernetes/pull/36765).
|
||||
- Read the ResourceQuota [API reference](/docs/reference/kubernetes-api/policy-resources/resource-quota-v1/)
|
||||
- Learn about [LimitRanges](/docs/concepts/policy/limit-range/)
|
||||
- You can read the historical [ResourceQuota design document](https://git.k8s.io/design-proposals-archive/resource-management/admission_control_resource_quota.md)
|
||||
for more information.
|
||||
- You can also read the [Quota support for priority class design document](https://git.k8s.io/design-proposals-archive/scheduling/pod-priority-resourcequota.md).
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
---
|
||||
reviewers:
|
||||
- davidopp
|
||||
- dom4ha
|
||||
- kevin-wangzefeng
|
||||
- alculquicondor
|
||||
- macsko
|
||||
- sanposhiho
|
||||
title: Assigning Pods to Nodes
|
||||
content_type: concept
|
||||
weight: 20
|
||||
|
@ -81,7 +83,7 @@ information.
|
|||
## Affinity and anti-affinity
|
||||
|
||||
`nodeSelector` is the simplest way to constrain Pods to nodes with specific
|
||||
labels. Affinity and anti-affinity expands the types of constraints you can
|
||||
labels. Affinity and anti-affinity expand the types of constraints you can
|
||||
define. Some of the benefits of affinity and anti-affinity include:
|
||||
|
||||
- The affinity/anti-affinity language is more expressive. `nodeSelector` only
|
||||
|
@ -234,10 +236,12 @@ Pods, the default Kubernetes scheduler places those Pods and honors any
|
|||
### Inter-pod affinity and anti-affinity
|
||||
|
||||
Inter-pod affinity and anti-affinity allow you to constrain which nodes your
|
||||
Pods can be scheduled on based on the labels of **Pods** already running on that
|
||||
Pods can be scheduled on based on the labels of Pods already running on that
|
||||
node, instead of the node labels.
|
||||
|
||||
Inter-pod affinity and anti-affinity rules take the form "this
|
||||
#### Types of Inter-pod Affinity and Anti-affinity
|
||||
|
||||
Inter-pod affinity and anti-affinity take the form "this
|
||||
Pod should (or, in the case of anti-affinity, should not) run in an X if that X
|
||||
is already running one or more Pods that meet rule Y", where X is a topology
|
||||
domain like node, rack, cloud provider zone or region, or similar and Y is the
|
||||
|
@ -257,16 +261,14 @@ the node label that the system uses to denote the domain. For examples, see
|
|||
Inter-pod affinity and anti-affinity require substantial amounts of
|
||||
processing which can slow down scheduling in large clusters significantly. We do
|
||||
not recommend using them in clusters larger than several hundred nodes.
|
||||
{{< /note >}}
|
||||
{{</note>}}
|
||||
|
||||
{{< note >}}
|
||||
Pod anti-affinity requires nodes to be consistently labeled, in other words,
|
||||
every node in the cluster must have an appropriate label matching `topologyKey`.
|
||||
If some or all nodes are missing the specified `topologyKey` label, it can lead
|
||||
to unintended behavior.
|
||||
{{< /note >}}
|
||||
|
||||
#### Types of inter-pod affinity and anti-affinity
|
||||
{{</note>}}
|
||||
|
||||
Similar to [node affinity](#node-affinity) are two types of Pod affinity and
|
||||
anti-affinity as follows:
|
||||
|
@ -285,16 +287,34 @@ To use inter-pod affinity, use the `affinity.podAffinity` field in the Pod spec.
|
|||
For inter-pod anti-affinity, use the `affinity.podAntiAffinity` field in the Pod
|
||||
spec.
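As a non-authoritative sketch, the following Pod uses both fields: it must be co-located (same zone) with Pods labeled `app: cache`, and it prefers nodes that are not already running Pods labeled `app: web`. The label keys and values are assumptions made up for this example:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: affinity-sketch                  # illustrative name
spec:
  affinity:
    podAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchLabels:
            app: cache                   # co-locate with Pods carrying this label
        topologyKey: topology.kubernetes.io/zone
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          labelSelector:
            matchLabels:
              app: web                   # prefer nodes that do not run these Pods
          topologyKey: kubernetes.io/hostname
  containers:
  - name: main
    image: registry.k8s.io/pause:3.9
```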
|
||||
|
||||
#### Scheduling a group of pods with inter-pod affinity to themselves
|
||||
#### Scheduling Behavior
|
||||
|
||||
When scheduling a new Pod, the Kubernetes scheduler evaluates the Pod's affinity/anti-affinity rules in the context of the current cluster state:
|
||||
|
||||
1. Hard Constraints (Node Filtering):
|
||||
- `podAffinity.requiredDuringSchedulingIgnoredDuringExecution` and `podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecution`:
|
||||
- The scheduler ensures the new Pod is assigned to nodes that satisfy these required affinity and anti-affinity rules based on existing Pods.
|
||||
|
||||
2. Soft Constraints (Scoring):
|
||||
- `podAffinity.preferredDuringSchedulingIgnoredDuringExecution` and `podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecution`:
|
||||
- The scheduler scores nodes based on how well they meet these preferred affinity and anti-affinity rules to optimize Pod placement.
|
||||
|
||||
3. Ignored Fields:
|
||||
- Existing Pods' `podAffinity.preferredDuringSchedulingIgnoredDuringExecution`:
|
||||
- These preferred affinity rules are not considered during the scheduling decision for new Pods.
|
||||
- Existing Pods' `podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecution`:
|
||||
- Similarly, preferred anti-affinity rules of existing Pods are ignored during scheduling.
|
||||
|
||||
#### Scheduling a Group of Pods with Inter-pod Affinity to Themselves
|
||||
|
||||
If the current Pod being scheduled is the first in a series that have affinity to themselves,
|
||||
it is allowed to be scheduled if it passes all other affinity checks. This is determined by
|
||||
verifying that no other pod in the cluster matches the namespace and selector of this pod,
|
||||
that the pod matches its own terms, and the chosen node matches all requested topologies.
|
||||
This ensures that there will not be a deadlock even if all the pods have inter-pod affinity
|
||||
verifying that no other Pod in the cluster matches the namespace and selector of this Pod,
|
||||
that the Pod matches its own terms, and the chosen node matches all requested topologies.
|
||||
This ensures that there will not be a deadlock even if all the Pods have inter-pod affinity
|
||||
specified.
|
||||
|
||||
#### Pod affinity example {#an-example-of-a-pod-that-uses-pod-affinity}
|
||||
#### Pod Affinity Example {#an-example-of-a-pod-that-uses-pod-affinity}
|
||||
|
||||
Consider the following Pod spec:
|
||||
|
||||
|
@ -349,7 +369,7 @@ of namespaces which the `labelSelector` should match against using the
|
|||
If omitted or empty, `namespaces` defaults to the namespace of the Pod where the
|
||||
affinity/anti-affinity definition appears.
|
||||
|
||||
#### Namespace selector
|
||||
#### Namespace Selector
|
||||
|
||||
{{< feature-state for_k8s_version="v1.24" state="stable" >}}
|
||||
|
||||
|
@ -371,12 +391,12 @@ When you want to disable it, you have to disable it explicitly via the
|
|||
{{< /note >}}
|
||||
|
||||
Kubernetes includes an optional `matchLabelKeys` field for Pod affinity
|
||||
or anti-affinity. The field specifies keys for the labels that should match with the incoming Pod's labels,
|
||||
or anti-affinity. The field specifies keys for the labels that should match with the incoming Pod's labels,
|
||||
when satisfying the Pod (anti)affinity.
|
||||
|
||||
The keys are used to look up values from the pod labels; those key-value labels are combined
|
||||
The keys are used to look up values from the Pod labels; those key-value labels are combined
|
||||
(using `AND`) with the match restrictions defined using the `labelSelector` field. The combined
|
||||
filtering selects the set of existing pods that will be taken into Pod (anti)affinity calculation.
|
||||
filtering selects the set of existing Pods that will be taken into Pod (anti)affinity calculation.
|
||||
|
||||
{{< caution >}}
|
||||
It's not recommended to use `matchLabelKeys` with labels that might be updated directly on pods.
|
||||
|
@ -428,7 +448,7 @@ When you want to disable it, you have to disable it explicitly via the
|
|||
{{< /note >}}
|
||||
|
||||
Kubernetes includes an optional `mismatchLabelKeys` field for Pod affinity
|
||||
or anti-affinity. The field specifies keys for the labels that should **not** match with the incoming Pod's labels,
|
||||
or anti-affinity. The field specifies keys for the labels that should not match with the incoming Pod's labels,
|
||||
when satisfying the Pod (anti)affinity.
|
||||
|
||||
{{< caution >}}
|
||||
|
@ -452,20 +472,20 @@ spec:
|
|||
affinity:
|
||||
podAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
# ensure that pods associated with this tenant land on the correct node pool
|
||||
# ensure that Pods associated with this tenant land on the correct node pool
|
||||
- matchLabelKeys:
|
||||
- tenant
|
||||
topologyKey: node-pool
|
||||
podAntiAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
# ensure that pods associated with this tenant can't schedule to nodes used for another tenant
|
||||
# ensure that Pods associated with this tenant can't schedule to nodes used for another tenant
|
||||
- mismatchLabelKeys:
|
||||
- tenant # whatever the value of the "tenant" label for this Pod, prevent
|
||||
# scheduling to nodes in any pool where any Pod from a different
|
||||
# tenant is running.
|
||||
labelSelector:
|
||||
# We have to have the labelSelector which selects only Pods with the tenant label,
|
||||
# otherwise this Pod would have Pods from daemonsets as well, for example,
|
||||
# otherwise this Pod would have anti-affinity against Pods from daemonsets as well, for example,
|
||||
# which aren't supposed to have the tenant label.
|
||||
matchExpressions:
|
||||
- key: tenant
|
||||
|
@ -649,10 +669,10 @@ The following operators can only be used with `nodeAffinity`.
|
|||
| `Gt` | The field value will be parsed as an integer, and that integer is less than the integer that results from parsing the value of a label named by this selector |
|
||||
| `Lt` | The field value will be parsed as an integer, and that integer is greater than the integer that results from parsing the value of a label named by this selector |
|
||||
|
||||
|
||||
{{<note>}}
|
||||
|
||||
`Gt` and `Lt` operators will not work with non-integer values. If the given value
|
||||
doesn't parse as an integer, the pod will fail to get scheduled. Also, `Gt` and `Lt`
|
||||
doesn't parse as an integer, the Pod will fail to get scheduled. Also, `Gt` and `Lt`
|
||||
are not available for `podAffinity`.
|
||||
{{</note>}}
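To make the `Gt` semantics concrete, here is a sketch of a node affinity term that only matches nodes whose label value parses to an integer greater than 6; the label key `example.com/cpu-generation` is an assumed custom node label:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gt-operator-sketch               # illustrative name
spec:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: example.com/cpu-generation   # assumed node label
            operator: Gt
            values:
            - "6"                        # node label must parse to an integer greater than 6
  containers:
  - name: main
    image: registry.k8s.io/pause:3.9
```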
|
||||
|
||||
|
@ -664,4 +684,4 @@ are not available for `podAffinity`.
|
|||
- Learn about how the [topology manager](/docs/tasks/administer-cluster/topology-manager/) takes part in node-level
|
||||
resource allocation decisions.
|
||||
- Learn how to use [nodeSelector](/docs/tasks/configure-pod-container/assign-pods-nodes/).
|
||||
- Learn how to use [affinity and anti-affinity](/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/).
|
||||
- Learn how to use [affinity and anti-affinity](/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/).
|
|
@ -30,165 +30,319 @@ api_metadata:
|
|||
|
||||
{{< feature-state feature_gate_name="DynamicResourceAllocation" >}}
|
||||
|
||||
Dynamic resource allocation is an API for requesting and sharing resources
|
||||
between pods and containers inside a pod. It is a generalization of the
|
||||
persistent volumes API for generic resources. Typically those resources
|
||||
are devices like GPUs.
|
||||
|
||||
Third-party resource drivers are
|
||||
responsible for tracking and preparing resources, with allocation of
|
||||
resources handled by Kubernetes via _structured parameters_ (introduced in Kubernetes 1.30).
|
||||
Different kinds of resources support arbitrary parameters for defining requirements and
|
||||
initialization.
|
||||
|
||||
Kubernetes v1.26 through to 1.31 included an (alpha) implementation of _classic DRA_,
|
||||
which is no longer supported. This documentation, which is for Kubernetes
|
||||
v{{< skew currentVersion >}}, explains the current approach to dynamic resource
|
||||
allocation within Kubernetes.
|
||||
|
||||
## {{% heading "prerequisites" %}}
|
||||
|
||||
Kubernetes v{{< skew currentVersion >}} includes cluster-level API support for
|
||||
dynamic resource allocation, but it [needs to be enabled](#enabling-dynamic-resource-allocation)
|
||||
explicitly. You also must install a resource driver for specific resources that
|
||||
are meant to be managed using this API. If you are not running Kubernetes
|
||||
v{{< skew currentVersion>}}, check the documentation for that version of Kubernetes.
|
||||
This page describes _dynamic resource allocation (DRA)_ in Kubernetes.
|
||||
|
||||
<!-- body -->
|
||||
|
||||
## API
|
||||
## About DRA {#about-dra}
|
||||
|
||||
The `resource.k8s.io/v1beta1` and `resource.k8s.io/v1beta2`
|
||||
{{< glossary_tooltip text="API groups" term_id="api-group" >}} provide these types:
|
||||
{{< glossary_definition prepend="DRA is" term_id="dra" length="all" >}}
|
||||
|
||||
ResourceClaim
|
||||
: Describes a request for access to resources in the cluster,
|
||||
for use by workloads. For example, if a workload needs an accelerator device
|
||||
with specific properties, this is how that request is expressed. The status
|
||||
stanza tracks whether this claim has been satisfied and what specific
|
||||
resources have been allocated.
|
||||
Allocating resources with DRA is a similar experience to
|
||||
[dynamic volume provisioning](/docs/concepts/storage/dynamic-provisioning/), in
|
||||
which you use PersistentVolumeClaims to claim storage capacity from storage
|
||||
classes and request the claimed capacity in your Pods.
|
||||
|
||||
ResourceClaimTemplate
|
||||
: Defines the spec and some metadata for creating
|
||||
ResourceClaims. Created by a user when deploying a workload.
|
||||
The per-Pod ResourceClaims are then created and removed by Kubernetes
|
||||
automatically.
|
||||
### Benefits of DRA {#dra-benefits}
|
||||
|
||||
DRA provides a flexible way to categorize, request, and use devices in your
|
||||
cluster. Using DRA provides benefits like the following:
|
||||
|
||||
* **Flexible device filtering**: use common expression language (CEL) to perform
|
||||
fine-grained filtering for specific device attributes.
|
||||
* **Device sharing**: share the same resource with multiple containers or Pods
|
||||
by referencing the corresponding resource claim.
|
||||
* **Centralized device categorization**: device drivers and cluster admins can
|
||||
use device classes to provide app operators with hardware categories that are
|
||||
optimized for various use cases. For example, you can create a cost-optimized
|
||||
device class for general-purpose workloads, and a high-performance device
|
||||
class for critical jobs.
|
||||
* **Simplified Pod requests**: with DRA, app operators don't need to specify
|
||||
device quantities in Pod resource requests. Instead, the Pod references a
|
||||
resource claim, and the device configuration in that claim applies to the Pod.
|
||||
|
||||
These benefits provide significant improvements in the device allocation
|
||||
workflow when compared to
|
||||
[device plugins](/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/),
|
||||
which require per-container device requests, don't support device sharing, and
|
||||
don't support expression-based device filtering.
|
||||
|
||||
### Types of DRA users {#dra-user-types}
|
||||
|
||||
The workflow of using DRA to allocate devices involves the following types of
|
||||
users:
|
||||
|
||||
* **Device owner**: responsible for devices. Device owners might be commercial
|
||||
vendors, the cluster operator, or another entity. To use DRA, devices must
|
||||
have DRA-compatible drivers that do the following:
|
||||
|
||||
* Create ResourceSlices that provide Kubernetes with information about
|
||||
nodes and resources.
|
||||
* Update ResourceSlices when resource capacity in the cluster changes.
|
||||
* Optionally, create DeviceClasses that workload operators can use to
|
||||
claim devices.
|
||||
|
||||
* **Cluster admin**: responsible for configuring clusters and nodes,
|
||||
attaching devices, installing drivers, and similar tasks. To use DRA,
|
||||
cluster admins do the following:
|
||||
|
||||
* Attach devices to nodes.
|
||||
* Install device drivers that support DRA.
|
||||
* Optionally, create DeviceClasses that workload operators can use to claim
|
||||
devices.
|
||||
|
||||
* **Workload operator**: responsible for deploying and managing workloads in the
|
||||
cluster. To use DRA to allocate devices to Pods, workload operators do the
|
||||
following:
|
||||
|
||||
* Create ResourceClaims or ResourceClaimTemplates to request specific
|
||||
configurations within DeviceClasses.
|
||||
* Deploy workloads that use specific ResourceClaims or ResourceClaimTemplates.
|
||||
|
||||
## DRA terminology {#terminology}
|
||||
|
||||
DRA uses the following Kubernetes API kinds to provide the core allocation
|
||||
functionality. All of these API kinds are included in the
|
||||
`resource.k8s.io/v1beta1`
|
||||
{{< glossary_tooltip text="API group" term_id="api-group" >}}.
|
||||
|
||||
DeviceClass
|
||||
: Contains pre-defined selection criteria for certain devices and
|
||||
configuration for them. DeviceClasses are created by a cluster administrator
|
||||
when installing a resource driver. Each request to allocate a device
|
||||
in a ResourceClaim must reference exactly one DeviceClass.
|
||||
: Defines a category of devices that can be claimed and how to select specific
|
||||
device attributes in claims. The DeviceClass parameters can match zero or
|
||||
more devices in ResourceSlices. To claim devices from a DeviceClass,
|
||||
ResourceClaims select specific device attributes.
|
||||
|
||||
ResourceClaim
|
||||
: Describes a request for access to attached resources, such as
|
||||
devices, in the cluster. ResourceClaims provide Pods with access to
|
||||
a specific resource. ResourceClaims can be created by workload operators
|
||||
or generated by Kubernetes based on a ResourceClaimTemplate.
|
||||
|
||||
ResourceClaimTemplate
|
||||
: Defines a template that Kubernetes uses to create per-Pod
|
||||
ResourceClaims for a workload. ResourceClaimTemplates provide Pods with
|
||||
access to separate, similar resources. Each ResourceClaim that Kubernetes
|
||||
generates from the template is bound to a specific Pod. When the Pod
|
||||
terminates, Kubernetes deletes the corresponding ResourceClaim.
|
||||
|
||||
ResourceSlice
|
||||
: Used by DRA drivers to publish information about resources (typically devices)
|
||||
that are available in the cluster.
|
||||
: Represents one or more resources that are attached to nodes, such as devices.
|
||||
Drivers create and manage ResourceSlices in the cluster. When a ResourceClaim
|
||||
is created and used in a Pod, Kubernetes uses ResourceSlices to find nodes
|
||||
that have access to the claimed resources. Kubernetes allocates resources to
|
||||
the ResourceClaim and schedules the Pod onto a node that can access the
|
||||
resources.
|
||||
|
||||
DeviceTaintRule
|
||||
: Used by admins or control plane components to add device taints
|
||||
to the devices described in ResourceSlices.
|
||||
### DeviceClass {#deviceclass}
|
||||
|
||||
All parameters that select devices are defined in the ResourceClaim and
|
||||
DeviceClass with in-tree types. Configuration parameters can be embedded there.
|
||||
Which configuration parameters are valid depends on the DRA driver -- Kubernetes
|
||||
only passes them through without interpreting them.
|
||||
A DeviceClass lets cluster admins or device drivers define categories of devices
|
||||
in the cluster. DeviceClasses tell operators what devices they can request and
|
||||
how they can request those devices. You can use
|
||||
[common expression language (CEL)](https://cel.dev) to select devices based on
|
||||
specific attributes. A ResourceClaim that references the DeviceClass can then
|
||||
request specific configurations within the DeviceClass.
|
||||
|
||||
The `core/v1` `PodSpec` defines ResourceClaims that are needed for a Pod in a
|
||||
`resourceClaims` field. Entries in that list reference either a ResourceClaim
|
||||
or a ResourceClaimTemplate. When referencing a ResourceClaim, all Pods using
|
||||
this PodSpec (for example, inside a Deployment or StatefulSet) share the same
|
||||
ResourceClaim instance. When referencing a ResourceClaimTemplate, each Pod gets
|
||||
its own instance.
|
||||
To create a DeviceClass, see
|
||||
[Set Up DRA in a Cluster](/docs/tasks/configure-pod-container/assign-resources/set-up-dra-cluster).
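As a rough sketch only (the class name, driver name, and attribute are assumptions, not a real driver's API), a DeviceClass that selects every device published by one driver could look like this:

```yaml
apiVersion: resource.k8s.io/v1beta1
kind: DeviceClass
metadata:
  name: example-gpu.example.com          # illustrative class name
spec:
  selectors:
  - cel:
      expression: device.driver == "gpu.example.com"   # assumed driver name
```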
|
||||
|
||||
The `resources.claims` list for container resources defines whether a container gets
|
||||
access to these resource instances, which makes it possible to share resources
|
||||
between one or more containers.
|
||||
### ResourceClaims and ResourceClaimTemplates {#resourceclaims-templates}
|
||||
|
||||
Here is an example for a fictional resource driver. Two ResourceClaim objects
|
||||
will get created for this Pod and each container gets access to one of them.
|
||||
A ResourceClaim defines the resources that a workload needs. Every ResourceClaim
|
||||
has _requests_ that reference a DeviceClass and select devices from that
|
||||
DeviceClass. ResourceClaims can also use _selectors_ to filter for devices that
|
||||
meet specific requirements, and can use _constraints_ to limit the devices that
|
||||
can satisfy a request. ResourceClaims can be created by workload operators or
|
||||
can be generated by Kubernetes based on a ResourceClaimTemplate. A
|
||||
ResourceClaimTemplate defines a template that Kubernetes can use to
|
||||
auto-generate ResourceClaims for Pods.
|
||||
|
||||
#### Use cases for ResourceClaims and ResourceClaimTemplates {#when-to-use-rc-rct}
|
||||
|
||||
The method that you use depends on your requirements, as follows:
|
||||
|
||||
* **ResourceClaim**: you want multiple Pods to share access to specific
|
||||
devices. You manually manage the lifecycle of ResourceClaims that you create.
|
||||
* **ResourceClaimTemplate**: you want Pods to have independent access to
|
||||
separate, similarly-configured devices. Kubernetes generates ResourceClaims
|
||||
from the specification in the ResourceClaimTemplate. The lifetime of each
|
||||
generated ResourceClaim is bound to the lifetime of the corresponding Pod.
|
||||
|
||||
When you define a workload, you can use
|
||||
{{< glossary_tooltip term_id="cel" text="Common Expression Language (CEL)" >}}
|
||||
to filter for specific device attributes or capacity. The available parameters
|
||||
for filtering depend on the device and the drivers.
|
||||
|
||||
If you directly reference a specific ResourceClaim in a Pod, that ResourceClaim
|
||||
must already exist in the same namespace as the Pod. If the ResourceClaim
|
||||
doesn't exist in the namespace, the Pod won't schedule. This behavior is similar
|
||||
to how a PersistentVolumeClaim must exist in the same namespace as a Pod that
|
||||
references it.
|
||||
|
||||
You can reference an auto-generated ResourceClaim in a Pod, but this isn't
|
||||
recommended because auto-generated ResourceClaims are bound to the lifetime of
|
||||
the Pod that triggered the generation.
|
||||
|
||||
To learn how to claim resources using one of these methods, see
|
||||
[Allocate Devices to Workloads with DRA](/docs/tasks/configure-pod-container/assign-resources/allocate-devices-dra/).
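For orientation, here is a hedged sketch of a ResourceClaim, assuming the `resource.k8s.io/v1beta1` API and the hypothetical DeviceClass and attribute names used in the earlier sketch; consult your driver's documentation for the attributes it actually publishes:

```yaml
apiVersion: resource.k8s.io/v1beta1
kind: ResourceClaim
metadata:
  name: single-gpu-claim                 # illustrative name
spec:
  devices:
    requests:
    - name: gpu-request
      deviceClassName: example-gpu.example.com    # assumed DeviceClass
      selectors:
      - cel:
          # assumed driver-published attribute; not part of any real driver's API
          expression: device.attributes["gpu.example.com"].family == "high-memory"
```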
|
||||
|
||||
### ResourceSlice {#resourceslice}
|
||||
|
||||
Each ResourceSlice represents one or more
|
||||
{{< glossary_tooltip term_id="device" text="devices" >}} in a pool. The pool is
|
||||
managed by a device driver, which creates and manages ResourceSlices. The
|
||||
resources in a pool might be represented by a single ResourceSlice or span
|
||||
multiple ResourceSlices.
|
||||
|
||||
ResourceSlices provide useful information to device users and to the scheduler,
|
||||
and are crucial for dynamic resource allocation. Every ResourceSlice must include
|
||||
the following information:
|
||||
|
||||
* **Resource pool**: a group of one or more resources that the driver manages.
|
||||
The pool can span more than one ResourceSlice. Changes to the resources in a
|
||||
pool must be propagated across all of the ResourceSlices in that pool. The
|
||||
device driver that manages the pool is responsible for ensuring that this
|
||||
propagation happens.
|
||||
* **Devices**: devices in the managed pool. A ResourceSlice can list every
|
||||
device in a pool or a subset of the devices in a pool. The ResourceSlice
|
||||
defines device information like attributes, versions, and capacity. Device
|
||||
users can select devices for allocation by filtering for device information
|
||||
in ResourceClaims or in DeviceClasses.
|
||||
* **Nodes**: the nodes that can access the resources. Drivers can choose which
|
||||
nodes can access the resources, whether that's all of the nodes in the
|
||||
cluster, a single named node, or nodes that have specific node labels.
|
||||
|
||||
Drivers use a {{< glossary_tooltip text="controller" term_id="controller" >}} to
|
||||
reconcile ResourceSlices in the cluster with the information that the driver has
|
||||
to publish. This controller overwrites any manual changes, such as cluster users
|
||||
creating or modifying ResourceSlices.
|
||||
|
||||
Consider the following example ResourceSlice:
|
||||
|
||||
```yaml
|
||||
apiVersion: resource.k8s.io/v1beta1
kind: ResourceSlice
metadata:
  name: cat-slice
spec:
  driver: "resource-driver.example.com"
  pool:
    generation: 1
    name: "black-cat-pool"
    resourceSliceCount: 1
  # The allNodes field defines whether any node in the cluster can access the device.
  allNodes: true
  devices:
  - name: "large-black-cat"
    basic:
      attributes:
        color:
          string: "black"
        size:
          string: "large"
        cat:
          boolean: true
|
||||
```
|
||||
This ResourceSlice is managed by the `resource-driver.example.com` driver in the
|
||||
`black-cat-pool` pool. The `allNodes: true` field indicates that any node in the
|
||||
cluster can access the devices. There's one device in the ResourceSlice, named
|
||||
`large-black-cat`, with the following attributes:
|
||||
|
||||
## Scheduling
|
||||
* `color`: `black`
|
||||
* `size`: `large`
|
||||
* `cat`: `true`
|
||||
|
||||
The scheduler is responsible for allocating resources to a ResourceClaim whenever a pod needs
|
||||
them. It does so by retrieving the full list of available resources from
|
||||
ResourceSlice objects, tracking which of those resources have already been
|
||||
allocated to existing ResourceClaims, and then selecting from those resources
|
||||
that remain.
|
||||
A DeviceClass could select this ResourceSlice by using these attributes, and a
|
||||
ResourceClaim could filter for specific devices in that DeviceClass.
|
||||
|
||||
The only kind of supported resources at the moment are devices. A device
|
||||
instance has a name and several attributes and capacities. Devices get selected
|
||||
through CEL expressions which check those attributes and capacities. In
|
||||
addition, the set of selected devices also can be restricted to sets which meet
|
||||
certain constraints.
|
||||
## How resource allocation with DRA works {#how-it-works}
|
||||
|
||||
The chosen resource is recorded in the ResourceClaim status together with any
|
||||
vendor-specific configuration, so when a pod is about to start on a node, the
|
||||
resource driver on the node has all the information it needs to prepare the
|
||||
resource.
|
||||
The following sections describe the workflow for the various
|
||||
[types of DRA users](#dra-user-types) and for the Kubernetes system during
|
||||
dynamic resource allocation.
|
||||
|
||||
By using structured parameters, the scheduler is able to reach a decision
|
||||
without communicating with any DRA resource drivers. It is also able to
|
||||
schedule multiple pods quickly by keeping information about ResourceClaim
|
||||
allocations in memory and writing this information to the ResourceClaim objects
|
||||
in the background while concurrently binding the pod to a node.
|
||||
### Workflow for users {#user-workflow}
|
||||
|
||||
## Monitoring resources
|
||||
1. **Driver creation**: device owners or third-party entities create drivers
|
||||
that can create and manage ResourceSlices in the cluster. These drivers
|
||||
optionally also create DeviceClasses that define a category of devices and
|
||||
how to request them.
|
||||
1. **Cluster configuration**: cluster admins create clusters, attach devices to
|
||||
nodes, and install the DRA device drivers. Cluster admins optionally create
|
||||
DeviceClasses that define categories of devices and how to request them.
|
||||
1. **Resource claims**: workload operators create ResourceClaimTemplates or
|
||||
ResourceClaims that request specific device configurations within a
|
||||
DeviceClass. In the same step, workload operators modify their Kubernetes
|
||||
manifests to request those ResourceClaimTemplates or ResourceClaims.
|
||||
|
||||
The kubelet provides a gRPC service to enable discovery of dynamic resources of
|
||||
running Pods. For more information on the gRPC endpoints, see the
|
||||
[resource allocation reporting](/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/#monitoring-device-plugin-resources).
|
||||
### Workflow for Kubernetes {#kubernetes-workflow}
|
||||
|
||||
1. **ResourceSlice creation**: drivers in the cluster create ResourceSlices that
|
||||
represent one or more devices in a managed pool of similar devices.
|
||||
1. **Workload creation**: the cluster control plane checks new workloads for
|
||||
references to ResourceClaimTemplates or to specific ResourceClaims.
|
||||
|
||||
* If the workload uses a ResourceClaimTemplate, a controller named the
|
||||
`resourceclaim-controller` generates ResourceClaims for every Pod in the
|
||||
workload.
|
||||
* If the workload uses a specific ResourceClaim, Kubernetes checks whether
|
||||
that ResourceClaim exists in the cluster. If the ResourceClaim doesn't
|
||||
exist, the Pods won't deploy.
|
||||
|
||||
1. **ResourceSlice filtering**: for every Pod, Kubernetes checks the
|
||||
ResourceSlices in the cluster to find a device that satisfies all of the
|
||||
following criteria:
|
||||
|
||||
* The nodes that can access the resources are eligible to run the Pod.
|
||||
* The ResourceSlice has unallocated resources that match the requirements of
|
||||
the Pod's ResourceClaim.
|
||||
|
||||
1. **Resource allocation**: after finding an eligible ResourceSlice for a
|
||||
Pod's ResourceClaim, the Kubernetes scheduler updates the ResourceClaim
|
||||
with the allocation details.
|
||||
1. **Pod scheduling**: when resource allocation is complete, the scheduler
|
||||
places the Pod on a node that can access the allocated resource. The device
|
||||
driver and the kubelet on that node configure the device and the Pod's access
|
||||
to the device.
|
||||
|
||||
## Observability of dynamic resources {#observability-dynamic-resources}
|
||||
|
||||
You can check the status of dynamically allocated resources by using any of the
|
||||
following methods:
|
||||
|
||||
* [kubelet device metrics](#monitoring-resources)
|
||||
* [ResourceClaim status](#resourceclaim-device-status)
|
||||
|
||||
### kubelet device metrics {#monitoring-resources}
|
||||
|
||||
The `PodResourcesLister` kubelet gRPC service lets you monitor in-use devices.
|
||||
The `DynamicResource` message provides information that's specific to dynamic
|
||||
resource allocation, such as the device name and the claim name. For details,
|
||||
see
|
||||
[Monitoring device plugin resources](/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/#monitoring-device-plugin-resources).
|
||||
|
||||
### ResourceClaim device status {#resourceclaim-device-status}
|
||||
|
||||
{{< feature-state feature_gate_name="DRAResourceClaimDeviceStatus" >}}
|
||||
|
||||
DRA drivers can report driver-specific
|
||||
[device status](/docs/concepts/overview/working-with-objects/#object-spec-and-status)
|
||||
data for each allocated device in the `status.devices` field of a ResourceClaim.
|
||||
For example, the driver might list the IP addresses that are assigned to a
|
||||
network interface device.
|
||||
|
||||
The accuracy of the information that a driver adds to a ResourceClaim
|
||||
`status.devices` field depends on the driver. Evaluate drivers to decide whether
|
||||
you can rely on this field as the only source of device information.
|
||||
|
||||
If you disable the `DRAResourceClaimDeviceStatus`
|
||||
[feature gate](/docs/reference/command-line-tools-reference/feature-gates/), the
|
||||
`status.devices` field automatically gets cleared when storing the ResourceClaim.
|
||||
ResourceClaim device status reporting is supported when the DRA driver is able to update an existing ResourceClaim and set its `status.devices` field.
|
||||
|
||||
For details about the `status.devices` field, see the
|
||||
{{< api-reference page="workload-resources/resource-claim-v1beta1" anchor="ResourceClaimStatus" text="ResourceClaim" >}} API reference.
|
||||
|
||||
## Pre-scheduled Pods
|
||||
|
||||
|
@ -225,7 +379,17 @@ spec:
|
|||
You may also be able to mutate the incoming Pod, at admission time, to unset
|
||||
the `.spec.nodeName` field and to use a node selector instead.
|
||||
|
||||
## Admin access
|
||||
## DRA alpha features {#alpha-features}
|
||||
|
||||
The following sections describe DRA features that are available in the Alpha
|
||||
[feature stage](/docs/reference/command-line-tools-reference/feature-gates/#feature-stages).
|
||||
To use any of these features, you must also set up DRA in your clusters by
|
||||
enabling the DynamicResourceAllocation feature gate and the DRA
|
||||
{{< glossary_tooltip text="API groups" term_id="api-group" >}}. For more
|
||||
information, see
|
||||
[Set up DRA in the cluster](/docs/tasks/configure-pod-container/assign-resources/set-up-dra-cluster/).
|
||||
|
||||
### Admin access {#admin-access}
|
||||
|
||||
{{< feature-state feature_gate_name="DRAAdminAccess" >}}
|
||||
|
||||
|
@ -258,26 +422,9 @@ multi-tenant clusters. Starting with Kubernetes v1.33, only users authorized to
|
|||
create ResourceClaim or ResourceClaimTemplate objects in namespaces labeled with
|
||||
`resource.k8s.io/admin-access: "true"` (case-sensitive) can use the
|
||||
`adminAccess` field. This ensures that non-admin users cannot misuse the
|
||||
feature.
|
||||
feature.
|
||||
|
||||
## ResourceClaim Device Status
|
||||
|
||||
{{< feature-state feature_gate_name="DRAResourceClaimDeviceStatus" >}}
|
||||
|
||||
The drivers can report driver-specific device status data for each allocated device
|
||||
in a resource claim. For example, IPs assigned to a network interface device can be
|
||||
reported in the ResourceClaim status.
|
||||
|
||||
The drivers setting the status, the accuracy of the information depends on the implementation
|
||||
of those DRA Drivers. Therefore, the reported status of the device may not always reflect the
|
||||
real time changes of the state of the device.
|
||||
|
||||
When the feature is disabled, that field automatically gets cleared when storing the ResourceClaim.
|
||||
|
||||
A ResourceClaim device status is supported when it is possible, from a DRA driver, to update an
|
||||
existing ResourceClaim where the `status.devices` field is set.
|
||||
|
||||
## Prioritized List
|
||||
### Prioritized list {#prioritized-list}
|
||||
|
||||
{{< feature-state feature_gate_name="DRAPrioritizedList" >}}
|
||||
|
||||
|
@ -321,7 +468,11 @@ spec:
|
|||
count: 2
|
||||
```
|
||||
|
||||
## Partitionable Devices
|
||||
Prioritized lists is an *alpha feature* and only enabled when the
|
||||
`DRAPrioritizedList` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
|
||||
is enabled in the kube-apiserver and kube-scheduler.
|
||||
|
||||
### Partitionable devices {#partitionable-devices}
|
||||
|
||||
{{< feature-state feature_gate_name="DRAPartitionableDevices" >}}
|
||||
|
||||
|
@ -374,7 +525,12 @@ spec:
|
|||
value: 6Gi
|
||||
```
|
||||
|
||||
## Device taints and tolerations
|
||||
Partitionable devices is an *alpha feature* and only enabled when the
|
||||
`DRAPartitionableDevices`
|
||||
[feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
|
||||
is enabled in the kube-apiserver and kube-scheduler.
|
||||
|
||||
### Device taints and tolerations {#device-taints-and-tolerations}
|
||||
|
||||
{{< feature-state feature_gate_name="DRADeviceTaints" >}}
|
||||
|
||||
|
@ -408,15 +564,22 @@ Allocating a device with admin access (described [above](#admin-access))
|
|||
is not exempt either. An admin using that mode must explicitly tolerate all taints
|
||||
to access tainted devices.
|
||||
|
||||
Taints can be added to devices in two different ways:
|
||||
Device taints and tolerations is an *alpha feature* and only enabled when the
|
||||
`DRADeviceTaints` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
|
||||
is enabled in the kube-apiserver, kube-controller-manager and kube-scheduler.
|
||||
To use DeviceTaintRules, the `resource.k8s.io/v1alpha3` API version must be
|
||||
enabled.
|
||||
|
||||
### Taints set by the driver
|
||||
You can add taints to devices in the following two ways:
|
||||
|
||||
#### Taints set by the driver
|
||||
|
||||
A DRA driver can add taints to the device information that it publishes in ResourceSlices.
|
||||
Consult the documentation of a DRA driver to learn whether the driver uses taints and what
|
||||
their keys and values are.
|
||||
|
||||
### Taints set by an admin
|
||||
#### Taints set by an admin
|
||||
|
||||
An admin or a control plane component can taint devices without having to tell
|
||||
the DRA driver to include taints in its device information in ResourceSlices. They do that by
|
||||
|
@ -463,84 +626,10 @@ spec:
|
|||
effect: NoExecute
|
||||
```
|
||||
|
||||
## Enabling dynamic resource allocation
|
||||
|
||||
Dynamic resource allocation is a *beta feature* which is off by default and only enabled when the
|
||||
`DynamicResourceAllocation` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
|
||||
and the `resource.k8s.io/v1beta1` and `resource.k8s.io/v1beta2` {{< glossary_tooltip text="API groups" term_id="api-group" >}}
|
||||
are enabled. For details on that, see the `--feature-gates` and `--runtime-config`
|
||||
[kube-apiserver parameters](/docs/reference/command-line-tools-reference/kube-apiserver/).
|
||||
kube-scheduler, kube-controller-manager and kubelet also need the feature gate.
|
||||
|
||||
When a resource driver reports the status of the devices, then the
|
||||
`DRAResourceClaimDeviceStatus` feature gate has to be enabled in addition to
|
||||
`DynamicResourceAllocation`.
|
||||
|
||||
A quick check whether a Kubernetes cluster supports the feature is to list
|
||||
DeviceClass objects with:
|
||||
|
||||
```shell
|
||||
kubectl get deviceclasses
|
||||
```
|
||||
|
||||
If your cluster supports dynamic resource allocation, the response is either a
|
||||
list of DeviceClass objects or:
|
||||
|
||||
```
|
||||
No resources found
|
||||
```
|
||||
|
||||
If not supported, this error is printed instead:
|
||||
|
||||
```
|
||||
error: the server doesn't have a resource type "deviceclasses"
|
||||
```
|
||||
|
||||
The default configuration of kube-scheduler enables the "DynamicResources"
|
||||
plugin if and only if the feature gate is enabled and when using
|
||||
the v1 configuration API. Custom configurations may have to be modified to
|
||||
include it.
|
||||
|
||||
In addition to enabling the feature in the cluster, a resource driver also has to
|
||||
be installed. Please refer to the driver's documentation for details.
|
||||
|
||||
### Enabling admin access
|
||||
|
||||
[Admin access](#admin-access) is an *alpha feature* and only enabled when the
|
||||
`DRAAdminAccess` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
|
||||
is enabled in the kube-apiserver and kube-scheduler.
|
||||
|
||||
### Enabling Device Status
|
||||
|
||||
[ResourceClaim Device Status](#resourceclaim-device-status) is an *alpha feature*
|
||||
and only enabled when the `DRAResourceClaimDeviceStatus`
|
||||
[feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
|
||||
is enabled in the kube-apiserver.
|
||||
|
||||
### Enabling Prioritized List
|
||||
|
||||
[Prioritized List](#prioritized-list)) is an *alpha feature* and only enabled when the
|
||||
`DRAPrioritizedList` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
|
||||
is enabled in the kube-apiserver and kube-scheduler. It also requires that the
|
||||
`DynamicResourceAllocation` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
|
||||
is enabled.
|
||||
|
||||
### Enabling Partitionable Devices
|
||||
|
||||
[Partitionable Devices](#partitionable-devices) is an *alpha feature*
|
||||
and only enabled when the `DRAPartitionableDevices`
|
||||
[feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
|
||||
is enabled in the kube-apiserver and kube-scheduler.
|
||||
|
||||
### Enabling device taints and tolerations
|
||||
|
||||
[Device taints and tolerations](#device-taints-and-tolerations) is an *alpha feature* and only enabled when the
|
||||
`DRADeviceTaints` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
|
||||
is enabled in the kube-apiserver, kube-controller-manager and kube-scheduler. To use DeviceTaintRules, the
|
||||
`resource.k8s.io/v1alpha3` API version must be enabled.
|
||||
|
||||
## {{% heading "whatsnext" %}}
|
||||
|
||||
- [Set Up DRA in a Cluster](/docs/tasks/configure-pod-container/assign-resources/set-up-dra-cluster/)
|
||||
- [Allocate devices to workloads using DRA](/docs/tasks/configure-pod-container/assign-resources/allocate-devices-dra/)
|
||||
- For more information on the design, see the
|
||||
[Dynamic Resource Allocation with Structured Parameters](https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/4381-dra-structured-parameters)
|
||||
KEP.
|
||||
KEP.
|
|
@ -6,9 +6,9 @@ weight: 100
|
|||
|
||||
{{<glossary_definition term_id="node-pressure-eviction" length="short">}}</br>
|
||||
|
||||
{{< feature-state feature_gate_name="KubeletSeparateDiskGC" >}}
|
||||
|
||||
{{<note>}}
|
||||
{{< feature-state feature_gate_name="KubeletSeparateDiskGC" >}}
|
||||
The _split image filesystem_ feature, which enables support for the `containerfs`
|
||||
filesystem, adds several new eviction signals, thresholds and metrics. To use
|
||||
`containerfs`, the Kubernetes release v{{< skew currentVersion >}} requires the
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
---
|
||||
title: "Hardening Guide - Scheduler Configuration"
|
||||
description: >
|
||||
Information about how to make the Kubernetes scheduler more secure.
|
||||
content_type: concept
|
||||
weight: 90
|
||||
---
|
||||
|
||||
<!-- overview -->
|
||||
The Kubernetes {{< glossary_tooltip text="scheduler" term_id="kube-scheduler" >}} is
|
||||
one of the critical components of the
|
||||
{{< glossary_tooltip text="control plane" term_id="control-plane" >}}.
|
||||
|
||||
This document covers how to improve the security posture of the Scheduler.
|
||||
|
||||
A misconfigured scheduler can have security implications.
Such a scheduler can be made to target specific nodes and evict the workloads or applications that share those nodes and their resources.
This can aid an attacker in a [Yo-Yo attack](https://arxiv.org/abs/2105.00542): an attack on a vulnerable autoscaler.
|
||||
|
||||
<!-- body -->
|
||||
## kube-scheduler configuration
|
||||
|
||||
### Scheduler authentication & authorization command line options
|
||||
|
||||
When setting up authentication, make sure that the kube-scheduler's authentication configuration remains consistent with the kube-apiserver's authentication.
|
||||
If any request has missing authentication headers,
|
||||
the [authentication should happen through the kube-apiserver so that authentication remains consistent across the cluster](/docs/tasks/extend-kubernetes/configure-aggregation-layer/#original-request-username-and-group).
|
||||
|
||||
- `authentication-kubeconfig`: Make sure to provide a proper kubeconfig so that the scheduler can retrieve authentication configuration options from the API Server. This kubeconfig file should be protected with strict file permissions.
|
||||
- `authentication-tolerate-lookup-failure`: Set this to `false` to make sure the scheduler _always_ looks up its authentication configuration from the API server.
|
||||
- `authentication-skip-lookup`: Set this to `false` to make sure the scheduler _always_ looks up its authentication configuration from the API server.
|
||||
- `authorization-always-allow-paths`: These paths should respond with data that is appropriate for anonymous authorization. Defaults to `/healthz,/readyz,/livez`.
|
||||
- `profiling`: Set to `false` to disable the profiling endpoints. They provide debugging information but should not be enabled on production clusters because they present a risk of denial of service or information leakage. The `--profiling` argument is deprecated; the setting can now be provided through the [KubeScheduler DebuggingConfiguration](https://kubernetes.io/docs/reference/config-api/kube-scheduler-config.v1/#DebuggingConfiguration) by setting `enableProfiling` to `false` in the kube-scheduler config.
|
||||
- `requestheader-client-ca-file`: Avoid passing this argument.
|
||||
|
||||
|
||||
### Scheduler networking command line options
|
||||
|
||||
- `bind-address`: In most cases, the kube-scheduler does not need to be externally accessible. Setting the bind address to `localhost` is a secure practice.
|
||||
- `permit-address-sharing`: Set this to `false` to disable connection sharing through `SO_REUSEADDR`. `SO_REUSEADDR` can lead to reuse of terminated connections that are in `TIME_WAIT` state.
|
||||
- `permit-port-sharing`: Default `false`. Use the default unless you are confident you understand the security implications.
|
||||
|
||||
|
||||
### Scheduler TLS command line options
|
||||
|
||||
- `tls-cipher-suites`: Always provide a list of preferred cipher suites. This ensures that connections never use insecure cipher suites. A combined sketch of these command line options is shown below.
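The following fragment is a sketch of how these options might appear in a kube-scheduler static Pod manifest; the file paths, image tag, and cipher suite list are assumptions for illustration, not recommended values:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: kube-scheduler
  namespace: kube-system
spec:
  containers:
  - name: kube-scheduler
    image: registry.k8s.io/kube-scheduler:v1.33.0   # assumed version tag
    command:
    - kube-scheduler
    - --authentication-kubeconfig=/etc/kubernetes/scheduler.conf   # assumed path
    - --authorization-kubeconfig=/etc/kubernetes/scheduler.conf    # assumed path
    - --authentication-tolerate-lookup-failure=false
    - --bind-address=127.0.0.1
    - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384
    - --config=/etc/kubernetes/scheduler-config.yaml               # assumed path
```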
|
||||
|
||||
|
||||
## Scheduling configurations for custom schedulers
|
||||
|
||||
When using custom schedulers based on the Kubernetes scheduling code, cluster administrators need to be careful with
|
||||
plugins that use the `queueSort`, `prefilter`, `filter`, or `permit` [extension points](/docs/reference/scheduling/config/#extension-points).
|
||||
These extension points control various stages of a scheduling process, and the wrong configuration can impact the kube-scheduler's behavior in your cluster.
|
||||
|
||||
### Key considerations
|
||||
|
||||
- Exactly one plugin that uses the `queueSort` extension point can be enabled at a time. Any plugins that use `queueSort` should be scrutinized.
|
||||
- Plugins that implement the `prefilter` or `filter` extension point can potentially mark all nodes as unschedulable. This can bring scheduling of new pods to a halt.
|
||||
- Plugins that implement the `permit` extension point can prevent or delay the binding of a Pod. Such plugins should be thoroughly reviewed by the cluster administrator.
|
||||
|
||||
When using a plugin that is not one of the [default plugins](/docs/reference/scheduling/config/#scheduling-plugins), consider disabling the `queueSort`, `filter` and `permit` extension points as follows:
|
||||
|
||||
```yaml
|
||||
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
profiles:
  - schedulerName: my-custom-scheduler
    plugins:
      # Disable specific plugins for different extension points
      # You can disable all plugins for an extension point using "*"
      queueSort:
        disabled:
          - name: "*" # Disable all queueSort plugins
          # - name: "PrioritySort" # Disable specific queueSort plugin
      filter:
        disabled:
          - name: "*" # Disable all filter plugins
          # - name: "NodeResourcesFit" # Disable specific filter plugin
      permit:
        disabled:
          - name: "*" # Disables all permit plugins
          # - name: "TaintToleration" # Disable specific permit plugin
|
||||
```
|
||||
This creates a scheduler profile `my-custom-scheduler`.
|
||||
Whenever the `.spec` of a Pod does not have a value for `.spec.schedulerName`, the kube-scheduler schedules that Pod using its main configuration and default plugins.
|
||||
If you define a Pod with `.spec.schedulerName` set to `my-custom-scheduler`, the kube-scheduler runs using that custom profile; in that profile,
|
||||
the `queueSort`, `filter` and `permit` extension points are disabled.
|
||||
If you use this KubeSchedulerConfiguration, and don't run any custom scheduler,
|
||||
and you then define a Pod with `.spec.schedulerName` set to `nonexistent-scheduler`
|
||||
(or any other scheduler name that doesn't exist in your cluster), no events are generated for that Pod and it remains unscheduled.
|
||||
|
||||
## Disallow labeling nodes
|
||||
|
||||
A cluster administrator should ensure that cluster users cannot label the nodes.
|
||||
A malicious actor can use `nodeSelector` to schedule workloads on nodes where those workloads should not be present.
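One way to enforce this, sketched below with an illustrative role name, is to grant regular users only read access to `nodes` through RBAC so that they cannot add or change node labels:

```yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: node-reader                      # illustrative name
rules:
- apiGroups: [""]
  resources: ["nodes"]
  verbs: ["get", "list", "watch"]        # deliberately no "update" or "patch"
```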
|
|
@ -0,0 +1,29 @@
|
|||
---
|
||||
reviewers:
|
||||
- lmktfy
|
||||
title: Security For Linux Nodes
|
||||
content_type: concept
|
||||
weight: 40
|
||||
---
|
||||
|
||||
<!-- overview -->
|
||||
|
||||
This page describes security considerations and best practices specific to the Linux operating system.
|
||||
|
||||
<!-- body -->
|
||||
|
||||
## Protection for Secret data on nodes
|
||||
|
||||
On Linux nodes, memory-backed volumes (such as [`secret`](/docs/concepts/configuration/secret/)
|
||||
volume mounts, or [`emptyDir`](/docs/concepts/storage/volumes/#emptydir) with `medium: Memory`)
|
||||
are implemented with a `tmpfs` filesystem.
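For reference, a memory-backed volume of the kind this section describes can be declared as in the following sketch (names and the size limit are illustrative):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: tmpfs-example                    # illustrative name
spec:
  containers:
  - name: app
    image: registry.k8s.io/pause:3.9
    volumeMounts:
    - name: scratch
      mountPath: /scratch
  volumes:
  - name: scratch
    emptyDir:
      medium: Memory                     # backed by tmpfs on Linux nodes
      sizeLimit: 64Mi
```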
|
||||
|
||||
If you have swap configured and use an older Linux kernel (or a current kernel with an unsupported Kubernetes configuration),
**memory**-backed volumes can have their data written to persistent storage.
|
||||
|
||||
The Linux kernel officially supports the `noswap` option from version 6.3. If swap is enabled on a node, it is therefore recommended to use kernel version 6.3 or later, or a kernel that supports the `noswap` option via a backport.
|
||||
|
||||
Read [swap memory management](/docs/concepts/cluster-administration/swap-memory-management/#memory-backed-volumes)
|
||||
for more info.
|
|
@ -262,8 +262,6 @@ to ensure that a PersistentVolume cannot be reused across different namespaces.
|
|||
|
||||
### Sandboxing containers
|
||||
|
||||
{{% thirdparty-content %}}
|
||||
|
||||
Kubernetes pods are composed of one or more containers that execute on worker nodes.
|
||||
Containers utilize OS-level virtualization and hence offer a weaker isolation boundary than
|
||||
virtual machines that utilize hardware-based virtualization.
|
||||
|
@ -286,14 +284,7 @@ workloads running in a shared cluster. Running workloads in a sandbox environmen
|
|||
insulate the host from container escapes, where an attacker exploits a vulnerability to gain
|
||||
access to the host system and all the processes/files running on that host.
|
||||
|
||||
Virtual machines and userspace kernels are two popular approaches to sandboxing. The following
|
||||
sandboxing implementations are available:
|
||||
|
||||
* [gVisor](https://gvisor.dev/) intercepts syscalls from containers and runs them through a
|
||||
userspace kernel, written in Go, with limited access to the underlying host.
|
||||
* [Kata Containers](https://katacontainers.io/) provide a secure container runtime that allows you to run
|
||||
containers in a VM. The hardware virtualization available in Kata offers an added layer of
|
||||
security for containers running untrusted code.
|
||||
Virtual machines and userspace kernels are two popular approaches to sandboxing.
|
||||
|
||||
### Node Isolation
|
||||
|
||||
|
@ -320,8 +311,7 @@ corresponding toleration can run on them. A mutating webhook could then be used
|
|||
add tolerations and node affinities to pods deployed into tenant namespaces so that they run on a
|
||||
specific set of nodes designated for that tenant.
|
||||
|
||||
Node isolation can be implemented using an [pod node selectors](/docs/concepts/scheduling-eviction/assign-pod-node/)
|
||||
or a [Virtual Kubelet](https://github.com/virtual-kubelet).
|
||||
Node isolation can be implemented using [pod node selectors](/docs/concepts/scheduling-eviction/assign-pod-node/).
|
||||
|
||||
## Additional Considerations
|
||||
|
||||
|
@ -411,8 +401,6 @@ Specifically, the Operator should:
|
|||
|
||||
## Implementations
|
||||
|
||||
{{% thirdparty-content %}}
|
||||
|
||||
There are two primary ways to share a Kubernetes cluster for multi-tenancy: using Namespaces
|
||||
(that is, a Namespace per tenant) or by virtualizing the control plane (that is, virtual control
|
||||
plane per tenant).
|
||||
|
@ -456,27 +444,6 @@ resources between them. This could include managing namespace labels, namespace
|
|||
delegated access, and shared resource quotas across related namespaces. These capabilities can
|
||||
be useful in both multi-team and multi-customer scenarios.
|
||||
|
||||
Some third-party projects that provide capabilities like this and aid in managing namespaced resources are
|
||||
listed below.
|
||||
|
||||
{{% thirdparty-content %}}
|
||||
|
||||
#### Multi-team tenancy
|
||||
|
||||
* [Capsule](https://github.com/clastix/capsule)
|
||||
* [Multi Tenant Operator](https://docs.stakater.com/mto/)
|
||||
|
||||
#### Multi-customer tenancy
|
||||
|
||||
* [Kubeplus](https://github.com/cloud-ark/kubeplus)
|
||||
|
||||
#### Policy engines
|
||||
|
||||
Policy engines provide features to validate and generate tenant configurations:
|
||||
|
||||
* [Kyverno](https://kyverno.io/)
|
||||
* [OPA/Gatekeeper](https://github.com/open-policy-agent/gatekeeper)
|
||||
|
||||
### Virtual control plane per tenant
|
||||
|
||||
Another form of control-plane isolation is to use Kubernetes extensions to provide each tenant a
|
||||
|
@ -508,11 +475,3 @@ The improved isolation comes at the cost of running and maintaining an individu
|
|||
plane per tenant. In addition, per-tenant control planes do not solve isolation problems in the
|
||||
data plane, such as node-level noisy neighbors or security threats. These must still be addressed
|
||||
separately.
|
||||
|
||||
The Kubernetes [Cluster API - Nested (CAPN)](https://github.com/kubernetes-sigs/cluster-api-provider-nested/tree/main/virtualcluster)
|
||||
project provides an implementation of virtual control planes.
|
||||
|
||||
#### Other implementations
|
||||
|
||||
* [Kamaji](https://github.com/clastix/kamaji)
|
||||
* [vcluster](https://github.com/loft-sh/vcluster)
|
||||
|
|
|
@ -87,6 +87,11 @@ the data.
|
|||
For a list of supported providers, refer to
|
||||
[Providers for the Secret Store CSI Driver](https://secrets-store-csi-driver.sigs.k8s.io/concepts.html#provider-for-the-secrets-store-csi-driver).
|
||||
|
||||
## Good practices for using swap memory
|
||||
|
||||
For best practices for setting swap memory for Linux nodes, please refer to
|
||||
[swap memory management](/docs/concepts/cluster-administration/swap-memory-management/#good-practice-for-using-swap-in-a-kubernetes-cluster).
|
||||
|
||||
## Developers
|
||||
|
||||
This section provides good practices for developers to use to improve the
|
||||
|
|
|
@ -975,7 +975,7 @@ spec:
|
|||
|
||||
## Resources
|
||||
|
||||
The storage media (such as Disk or SSD) of an `emptyDir` volume is determined by the
|
||||
The storage medium (such as Disk or SSD) of an `emptyDir` volume is determined by the
|
||||
medium of the filesystem holding the kubelet root dir (typically
|
||||
`/var/lib/kubelet`). There is no limit on how much space an `emptyDir` or
|
||||
`hostPath` volume can consume, and no isolation between containers or
|
||||
|
|
|
@@ -324,10 +324,10 @@ kernel patch.

 ### Mirantis Container Runtime {#mcr}

-[Mirantis Container Runtime](https://docs.mirantis.com/mcr/20.10/overview.html) (MCR)
+[Mirantis Container Runtime](https://docs.mirantis.com/mcr/25.0/overview.html) (MCR)
 is available as a container runtime for all Windows Server 2019 and later versions.

-See [Install MCR on Windows Servers](https://docs.mirantis.com/mcr/20.10/install/mcr-windows.html) for more information.
+See [Install MCR on Windows Servers](https://docs.mirantis.com/mcr/25.0/install/mcr-windows.html) for more information.

 ## Windows OS version compatibility {#windows-os-version-support}

@@ -1183,7 +1183,7 @@ replacement will be created immediately (even if the old Pod is still in a Termi
 #### Rolling Update Deployment

 The Deployment updates Pods in a rolling update
-fashion when `.spec.strategy.type==RollingUpdate`. You can specify `maxUnavailable` and `maxSurge` to control
+fashion (gradually scale down the old ReplicaSets and scale up the new one) when `.spec.strategy.type==RollingUpdate`. You can specify `maxUnavailable` and `maxSurge` to control
 the rolling update process.

 ##### Max Unavailable
@@ -1202,7 +1202,7 @@ at all times during the update is at least 70% of the desired Pods.

 `.spec.strategy.rollingUpdate.maxSurge` is an optional field that specifies the maximum number of Pods
 that can be created over the desired number of Pods. The value can be an absolute number (for example, 5) or a
-percentage of desired Pods (for example, 10%). The value cannot be 0 if `MaxUnavailable` is 0. The absolute number
+percentage of desired Pods (for example, 10%). The value cannot be 0 if `maxUnavailable` is 0. The absolute number
 is calculated from the percentage by rounding up. The default value is 25%.

 For example, when this value is set to 30%, the new ReplicaSet can be scaled up immediately when the
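A minimal sketch of how `maxUnavailable` and `maxSurge` sit together in a Deployment spec (the Deployment name, labels, and image are illustrative, and the 30% values mirror the example in the surrounding text):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-deployment       # hypothetical name
spec:
  replicas: 10
  selector:
    matchLabels:
      app: nginx
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 30%      # at least 70% of the desired Pods stay available
      maxSurge: 30%            # at most 130% of the desired Pods exist at any time
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx:1.27
```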
@@ -1331,7 +1331,7 @@ a Pod is considered ready, see [Container Probes](/docs/concepts/workloads/pods/

 {{< feature-state feature_gate_name="DeploymentReplicaSetTerminatingReplicas" >}}

-You can enable this feature it by setting the `DeploymentReplicaSetTerminatingReplicas`
+You can enable this feature by setting the `DeploymentReplicaSetTerminatingReplicas`
 [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
 on the [API server](/docs/reference/command-line-tools-reference/kube-apiserver/)
 and on the [kube-controller-manager](/docs/reference/command-line-tools-reference/kube-controller-manager/)
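For illustration only, one way to turn such a gate on for both components in a kubeadm-managed control plane is via `extraArgs`. This is a sketch using the `v1beta3` kubeadm API and is not the only method; other setups pass the `--feature-gates` flag to the components directly.

```yaml
# kubeadm ClusterConfiguration excerpt (v1beta3 syntax); adjust to your own setup
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
apiServer:
  extraArgs:
    feature-gates: "DeploymentReplicaSetTerminatingReplicas=true"
controllerManager:
  extraArgs:
    feature-gates: "DeploymentReplicaSetTerminatingReplicas=true"
```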
@@ -1,7 +1,7 @@
---
reviewers:
- alculquicondor
- erictune
- mimowo
- soltysh
title: Jobs
api_metadata:
@@ -1,4 +1,10 @@
 ---
+# NOTE TO LOCALIZATION TEAMS
+#
+# If updating front matter for your localization because there is still
+# a "feature" key in this page, then you also need to update
+# content/??/docs/concepts/architecture/self-healing.md (which is where
+# it moved to)
 reviewers:
 - Kashomon
 - bprashanth
@@ -7,13 +13,6 @@ title: ReplicaSet
 api_metadata:
 - apiVersion: "apps/v1"
   kind: "ReplicaSet"
-feature:
-  title: Self-healing
-  anchor: How a ReplicaSet works
-  description: >
-    Restarts containers that fail, replaces and reschedules containers when nodes die,
-    kills containers that don't respond to your user-defined health check,
-    and doesn't advertise them to clients until they are ready to serve.
 content_type: concept
 description: >-
   A ReplicaSet's purpose is to maintain a stable set of replica Pods running at any given time.
@@ -324,7 +323,7 @@ ReplicaSets do not support a rolling update directly.

 {{< feature-state feature_gate_name="DeploymentReplicaSetTerminatingReplicas" >}}

-You can enable this feature it by setting the `DeploymentReplicaSetTerminatingReplicas`
+You can enable this feature by setting the `DeploymentReplicaSetTerminatingReplicas`
 [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
 on the [API server](/docs/reference/command-line-tools-reference/kube-apiserver/)
 and on the [kube-controller-manager](/docs/reference/command-line-tools-reference/kube-controller-manager/)