Compare commits


No commits in common. "main" and "snapshot-initial-v1.33" have entirely different histories.

1194 changed files with 18591 additions and 113142 deletions

View File

@ -21,6 +21,3 @@ insert_final_newline = true
[Makefile]
indent_style = tab
[OWNERS]
indent_size = 2

View File

@ -1,14 +1,10 @@
aliases:
sig-docs-blog-owners: # Approvers for blog content
- lmktfy
- graz-dev
- mrbobbytables
- natalisucks
- nate-double-u
sig-docs-blog-reviewers: # Reviewers for blog content
- Gauravpadam
- graz-dev
- lmktfy
- mrbobbytables
- natalisucks
- nate-double-u
@ -16,7 +12,6 @@ aliases:
- dipesh-rawat
- divya-mohan0209
- katcosgrove
- lmktfy
- natalisucks
- nate-double-u
- reylejano
@ -59,9 +54,9 @@ aliases:
- dipesh-rawat
- divya-mohan0209
- katcosgrove
- lmktfy
- natalisucks
- nate-double-u
- rayandas # RT 1.33 Docs Lead
- reylejano
- salaxander
- tengqm
@ -69,7 +64,7 @@ aliases:
- dipesh-rawat
- divya-mohan0209
- katcosgrove
- lmktfy
- kbhawkey
- mengjiao-liu
- natalisucks
- nate-double-u
@ -78,6 +73,8 @@ aliases:
- shannonxtreme
- tengqm
- windsonsea
- Princesso
- drewhagen
sig-docs-es-owners: # Admins for Spanish content
- electrocucaracha
- krol3
@ -104,18 +101,15 @@ aliases:
- bishal7679
- dipesh-rawat
- divya-mohan0209
- jayeshmahajan
- niranjandarshann
sig-docs-id-owners: # Admins for Indonesian content
- ariscahyadi
- girikuncoro
- habibrosyad
- za
sig-docs-id-reviews: # PR reviews for Indonesian content
- ariscahyadi
- girikuncoro
- habibrosyad
- za
sig-docs-it-owners: # Admins for Italian content
- fabriziopandini
- Fale
@ -215,11 +209,9 @@ aliases:
- mfilocha
- nvtkaszpir
sig-docs-uk-owners: # Admins for Ukrainian content
- Andygol
- Arhell
- MaxymVlasov
sig-docs-uk-reviews: # PR reviews for Ukrainian content
- Andygol
- Arhell
- idvoretskyi
- MaxymVlasov

View File

@ -4,12 +4,13 @@
# reviewers to review and approve.
# Teams and members are visible at https://github.com/orgs/kubernetes/teams.
reviewers:
- sig-docs-en-reviews
approvers:
- sig-docs-en-owners
filters:
".*":
reviewers:
- sig-docs-en-reviews
approvers:
- sig-docs-en-owners
"\\.svg":
labels:
- area/web-development

View File

@ -1,25 +0,0 @@
let splitInstance = null;
function enableSplitter(mediaQuery) {
if (mediaQuery.matches) {
if (!splitInstance) {
splitInstance = Split(["#sidebarnav", "#maindoc"], {
sizes: [20, 80],
minSize: 100,
});
}
} else {
if (splitInstance) {
splitInstance.destroy();
splitInstance = null;
}
}
}
const screenWidthMediaQuery = window.matchMedia("(min-width: 768px)");
const eleNav = document.getElementById("sidebarnav");
if (eleNav !== null) {
enableSplitter(screenWidthMediaQuery);
screenWidthMediaQuery.addListener(enableSplitter);
}

View File

@ -11,6 +11,9 @@ $quickstart-button-padding: 0 50px;
$vendor-strip-height: 88px;
$vendor-strip-font-size: 16px;
// video
$video-section-height: 200px;
@import "size";
@import "documentation";
@ -253,6 +256,9 @@ $ocean-nodes-padding-Y: 60px;
$ocean-nodes-main-margin-bottom: 60px;
$ocean-nodes-h3-margin-bottom: 30px;
// video
$video-section-height: 200px;
// Home-specific
.header-hero {
@ -311,10 +317,13 @@ $ocean-nodes-h3-margin-bottom: 30px;
}
// Video thingy
#video {
height: $video-section-height;
}
#video {
width: 100%;
position: relative;
overflow: hidden;
background-position: center center;
background-size: cover;
@ -417,10 +426,6 @@ $ocean-nodes-h3-margin-bottom: 30px;
}
}
#video:has(#desktopKCButton) {
height: 580px;
}
#videoPlayer {
@include fullScreen;
background-color: rgba(0, 0, 0, 0.9);

View File

@ -50,6 +50,35 @@ body {
}
}
/* Gutter for sidebar splitter */
.gutter {
background-color: #eee;
background-repeat: no-repeat;
background-position: 50%;
}
.gutter.gutter-horizontal {
background-image: url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAeCAYAAADkftS9AAAAIklEQVQoU2M4c+bMfxAGAgYYmwGrIIiDjrELjpo5aiZeMwF+yNnOs5KSvgAAAABJRU5ErkJggg==');
cursor: col-resize;
}
#sidebarnav,
#maindoc {
max-width: 100%;
}
#maindoc {
overflow-wrap: break-word;
}
@media (max-width: 768px) {
#sidebarnav {
padding-left: 15px;
padding-right: 15px;
}
}
/* Complex table layout support */
.td-content, body.td-content {
@ -245,7 +274,7 @@ footer {
}
// Custom footer sizing
@media (min-width: 800px) and (max-width: 1285px) {
@media (min-width: 800px) and (max-width: 1279px) {
footer {
ul.footer-icons {
min-width: 17.5vw;
@ -253,11 +282,6 @@ footer {
flex-wrap: nowrap;
flex-direction: row;
justify-content: space-evenly;
li.mx-2 {
margin-left: 0.3rem !important;
margin-right: 0.3rem !important;
}
}
.col-sm-2 {
flex: 0 0 22.5%;
@ -1352,6 +1376,42 @@ body.cid-code-of-conduct main {
}
}
// search & sidebar
.td-sidebar {
@media only screen and (min-width: 768px) {
padding-top: 1.5rem !important;
.td-sidebar__inner {
top: 8.5rem;
@media only screen and (min-width: 1075px) {
top: 6.5rem;
}
}
}
}
.td-sidebar-nav {
& > .td-sidebar-nav__section {
padding-top: .5rem;
padding-left: 1.5rem;
}
}
.td-sidebar__inner {
form.td-sidebar__search {
.td-sidebar__toggle {
&:hover {
color: #000000;
}
color: $primary;
margin: 1rem;
}
}
}
.no-underline {
text-decoration: none !important;
}
@ -1360,6 +1420,16 @@ body.cid-code-of-conduct main {
display: none !important;
}
.td-sidebar-link__page {
&#m-docs-search {
display: none;
}
&#m-docs-test {
display: none;
}
}
//Tutorials
main.content {
position: inherit;
@ -1387,13 +1457,6 @@ main.content {
margin-bottom: 20px;
}
/* CAREERS */
// Set 14px font size for GitJobs attribution text
.gitjobs-legend {
font-size: 14px;
}
/* CASE-STUDIES */
// Many of the case studies have small variations in markup and styles;
@ -1860,58 +1923,6 @@ body.td-search {
color: #ffffff !important;
}
body.td-home section.case-studies {
h2, h3 {
text-align: center;
}
.case-study-list {
display: flex;
flex-direction: row;
max-width: 80vw;
margin-left: auto;
margin-right: auto;
align-items: stretch;
gap: clamp(1rem, 4em, 10vw);
> .case-study-item {
display: flex;
flex-direction: column;
justify-content: space-between;
text-align: center;
width: clamp(6rem, 20%, 50vw);
picture, picture img {
height: 4.8rem;
text-align: center;
}
> a {
display: block;
text-align: right;
}
}
padding-bottom: 2em;
}
padding-top: 4rem;
}
@media screen and (max-width: 768px) {
.case-study-list {
justify-content: center;
flex-wrap: wrap;
> .case-study-item {
min-width: 34vw;
}
}
}
@media screen and (max-width: 650px) {
.case-study-list {
> .case-study-item {
min-width: 51vw;
}
}
}
// handle main page features on narrow viewports
@media screen and (max-width: 768px) {
.features-container div.feature-box {
@ -1942,4 +1953,4 @@ section.k8s-birthday-override:has(div.k8s-birthday-override.revert-to-previous i
@extend .table;
}
}
}
}

View File

@ -1,5 +1,6 @@
$main-max-width: 1200px;
$vendor-strip-height: 44px;
$video-section-height: 580px;
@media screen and (min-width: 1024px) {
@ -49,12 +50,13 @@ $vendor-strip-height: 44px;
}
#video {
height: $video-section-height;
position: relative;
background-position: top center;
background-position: center center;
background-size: cover;
&>.light-text {
margin: 0 10% 15px 0;
margin-right: 10%;
}
}

View File

@ -1,75 +0,0 @@
.td-sidebar-nav {
.td-sidebar-link.tree-root {
display: none;
}
#navbarDropdownMenuLink {
display: none;
}
}
/* Gutter for sidebar splitter */
.gutter {
background-color: #eee;
background-repeat: no-repeat;
background-position: 50%;
&.gutter-horizontal {
background-image: url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAeCAYAAADkftS9AAAAIklEQVQoU2M4c+bMfxAGAgYYmwGrIIiDjrELjpo5aiZeMwF+yNnOs5KSvgAAAABJRU5ErkJggg==');
cursor: col-resize;
}
}
#sidebarnav,
#maindoc {
max-width: 100%;
}
#maindoc {
overflow-wrap: break-word;
}
@include media-breakpoint-down(sm) {
#sidebarnav {
padding-left: 15px;
padding-right: 15px;
}
}
// search & sidebar
.td-sidebar {
@include media-breakpoint-up(sm){
padding-top: 1.5rem !important;
.td-sidebar__inner {
top: 8.5rem;
@media only screen and (min-width: 1075px) {
top: 6.5rem;
}
}
}
}
.td-sidebar-nav {
& > .td-sidebar-nav__section {
padding-top: .5rem;
padding-left: 1.5rem;
}
}
.td-sidebar__inner form.td-sidebar__search {
.td-sidebar__toggle {
&:hover {
color: #000000;
}
color: $primary;
}
}
.td-sidebar-link__page {
&#m-docs-test {
display: none;
}
}

View File

@ -9,7 +9,6 @@ Add styles or import other files. */
// Base styles
@import "k8s_community";
@import "k8s_nav";
@import "k8s_sidebar-tree";
//Media queries
@import "base";

View File

@ -7,6 +7,9 @@ $headline-wrapper-margin-bottom: 40px;
$quickstart-button-padding: 0 50px;
$vendor-strip-font-size: 16px;
//video
$video-section-height: 400px;
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -110,13 +113,15 @@ $vendor-strip-font-size: 16px;
}
#video {
height: $video-section-height;
display: block;
height: 550px;
& > .light-text {
display: block;
float: right;
text-align: left;
margin: 0 5% 15px 0;
margin-right: 5%;
}
}

View File

@ -43,11 +43,17 @@ Google সপ্তাহে বিলিয়ন কন্টেইনার
<h2>150+ মাইক্রোসার্ভিস কুবারনেটিসে স্থানান্তরিত করার চ্যালেঞ্জ</h2>
<p>সারাহ ওয়েলস দ্বারা, অপারেশনস এবং নির্ভরযোগ্যতার জন্য প্রযুক্তিগত পরিচালক, ফিনান্সিয়াল টাইমস</p>
<button id="desktopShowVideoButton" onclick="kub.showVideo()">ভিডিও দেখুন</button>
<h3>আসন্ন KubeCon + CloudNativeCon ইভেন্টগুলিতে যোগ দিন</h3>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-india/" class="desktopKCButton"><strong>India</strong> (Hyderabad, Aug 6-7)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america/" class="desktopKCButton"><strong>North America</strong> (Atlanta, Nov 10-13)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe-2026/" class="desktopKCButton"><strong>Europe</strong> (Amsterdam, Mar 23-26, 2026)</a>
<br>
<br>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america/" button id="desktopKCButton">12-15 নভেম্বর KubeCon + CloudNativeCon North America তে যোগ দিন</a>
<br>
<br>
<br>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-india/" button id="desktopKCButton">11-12 ডিসেম্বর KubeCon + CloudNativeCon India তে যোগ দিন</a>
<br>
<br>
<br>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe/" button id="desktopKCButton">1-4 এপ্রিল, 2025-এ KubeCon + CloudNativeCon Europe তে যোগ দিন</a>
</div>
<div id="videoPlayer">
<iframe data-url="https://www.youtube.com/embed/H06qrNmGqyE?autoplay=1" frameborder="0" allowfullscreen></iframe>
@ -58,5 +64,3 @@ Google সপ্তাহে বিলিয়ন কন্টেইনার
{{< blocks/kubernetes-features >}}
{{< blocks/case-studies >}}
{{< kubeweekly id="kubeweekly" >}}

View File

@ -27,7 +27,7 @@ case_study_details:
<p>"Every single product, every decision we make at Ancestry, focuses on delighting our customers with intimate, sometimes life-changing discoveries about themselves and their families," says MacKay. "As the company continues to grow, the increased productivity gains from using Kubernetes has helped Ancestry make customer discoveries faster. With the move to Dockerization for example, instead of taking between 20 to 50 minutes to deploy a new piece of code, we can now deploy in under a minute for much of our code. We've truly experienced significant time savings in addition to the various features and benefits from cloud native and Kubernetes-type technologies."</p>
{{< case-studies/quote author="PAUL MACKAY, SOFTWARE ENGINEER AND ARCHITECT AT ANCESTRY" >}}
"At a certain point, you have to step back if you're going to push a new technology and get key thought leaders with engineers within the organization to become your champions for new technology adoption. At training sessions, the development teams were always the ones that were saying, 'Kubernetes saved our time tremendously; it's an enabler. It really is incredible.'"
"At a certain point, you have to step back if you're going to push a new technology and get key thought leaders with engineers within the organization to become your champions for new technology adoption. At training sessions, the development teams were always the ones that were saying, 'Kubernetes saved our time tremendously; it's an enabler. It really is incredible.'"
{{< /case-studies/quote >}}
{{< case-studies/lead >}}
@ -48,7 +48,7 @@ It started with a Shaky Leaf.
<p>That need led them in 2015 to explore containerization. Ancestry engineers had already been using technology like <a href="https://www.java.com/en/">Java</a> and <a href="https://www.python.org">Python</a> on Linux, so part of the decision was about making the infrastructure more Linux-friendly. They quickly decided that they wanted to go with Docker for containerization, "but it always comes down to the orchestration part of it to make it really work," says MacKay.</p>
<p>His team looked at orchestration platforms offered by <a href="https://docs.docker.com/compose/">Docker Compose</a>, <a href="https://mesos.apache.org">Mesos</a> and <a href="https://www.openstack.org/software/">OpenStack</a>, and even started to prototype some homegrown solutions. And then they started hearing rumblings of the imminent release of Kubernetes v1.0. "At the forefront, we were looking at the secret store, so we didn't have to manage that all ourselves, the config maps, the methodology of seamless deployment strategy," he says. "We found that how Kubernetes had done their resources, their types, their labels and just their interface was so much further advanced than the other things we had seen. It was a feature fit."</p>
<p>His team looked at orchestration platforms offered by <a href="https://docs.docker.com/compose/">Docker Compose</a>, <a href="http://mesos.apache.org">Mesos</a> and <a href="https://www.openstack.org/software/">OpenStack</a>, and even started to prototype some homegrown solutions. And then they started hearing rumblings of the imminent release of Kubernetes v1.0. "At the forefront, we were looking at the secret store, so we didn't have to manage that all ourselves, the config maps, the methodology of seamless deployment strategy," he says. "We found that how Kubernetes had done their resources, their types, their labels and just their interface was so much further advanced than the other things we had seen. It was a feature fit."</p>
{{< case-studies/lead >}}
Plus, MacKay says, "I just believed in the confidence that comes with the history that Google has with containerization. So we started out right on the leading edge of it. And we haven't looked back since."

View File

@ -42,9 +42,9 @@ With its end-to-end commerce platform for cloud-based products and services, <a
<p>When Director of Software Development Pierre-Alexandre Lacerte started working there in 2014, the company had a monolith application deployed on a "tomcat infrastructure, and the whole release process was complex for what it should be," he says. "There were a lot of manual steps involved, with one engineer building a feature then creating a pull request, and a QA or another engineer validating the feature. Then it gets merged and someone else will take care of the deployment. So we had bottlenecks in the pipeline to ship a feature to production."</p>
<p>At the same time, the engineering team of 40 was growing, and the company wanted to add an increasing number of features to its products. As a member of the platform team, Lacerte began hearing from multiple teams that wanted to deploy applications using different frameworks and languages, from <a href="https://nodejs.org/">Node.js</a> to <a href="https://spring.io/projects/spring-boot">Spring Boot Java</a>. He soon realized that in order to both support growth and increase velocity, the company needed a better infrastructure, and a system in which teams are autonomous, can do their own deploys, and be responsible for their services in production.</p>
<p>At the same time, the engineering team of 40 was growing, and the company wanted to add an increasing number of features to its products. As a member of the platform team, Lacerte began hearing from multiple teams that wanted to deploy applications using different frameworks and languages, from <a href="https://nodejs.org/">Node.js</a> to <a href="http://spring.io/projects/spring-boot">Spring Boot Java</a>. He soon realized that in order to both support growth and increase velocity, the company needed a better infrastructure, and a system in which teams are autonomous, can do their own deploys, and be responsible for their services in production.</p>
{{< case-studies/quote
{{< case-studies/quote
image="/images/case-studies/appdirect/banner3.jpg"
author="Alexandre Gervais, Staff Software Developer, AppDirect"
>}}
@ -61,7 +61,7 @@ With its end-to-end commerce platform for cloud-based products and services, <a
<p>Lacerte's strategy ultimately worked because of the very real impact the Kubernetes platform has had to deployment time. Due to less dependency on custom-made, brittle shell scripts with SCP commands, time to deploy a new version has shrunk from 4 hours to a few minutes. Additionally, the company invested a lot of effort to make things self-service for developers. "Onboarding a new service doesn't require <a href="https://www.atlassian.com/software/jira">Jira</a> tickets or meeting with three different teams," says Lacerte. Today, the company sees 1,600 deployments per week, compared to 1-30 before.</p>
{{< case-studies/quote
{{< case-studies/quote
image="/images/case-studies/appdirect/banner4.jpg"
author="Pierre-Alexandre Lacerte, Director of Software Development, AppDirect"
>}}

View File

@ -20,7 +20,7 @@ case_study_details:
<h2>Solution</h2>
<p>Opting not to shift to cloud virtualization or use a private cloud on their own servers, the BlaBlaCar team became early adopters of containerization, using the CoreOs runtime <a href="https://coreos.com/rkt">rkt</a>, initially deployed using <a href="https://coreos.com/fleet/docs/latest/launching-containers-fleet.html">fleet</a> cluster manager. Last year, the company switched to <a href="https://kubernetes.io/">Kubernetes</a> orchestration, and now also uses <a href="https://prometheus.io/">Prometheus</a> for monitoring.</p>
<p>Opting not to shift to cloud virtualization or use a private cloud on their own servers, the BlaBlaCar team became early adopters of containerization, using the CoreOs runtime <a href="https://coreos.com/rkt">rkt</a>, initially deployed using <a href="https://coreos.com/fleet/docs/latest/launching-containers-fleet.html">fleet</a> cluster manager. Last year, the company switched to <a href="http://kubernetes.io/">Kubernetes</a> orchestration, and now also uses <a href="https://prometheus.io/">Prometheus</a> for monitoring.</p>
<h2>Impact</h2>

View File

@ -20,7 +20,7 @@ case_study_details:
<h2>Solution</h2>
<p>Turning to microservices and containerization, GolfNow began moving its applications and databases from third-party services to its own clusters running on <a href="https://www.docker.com/">Docker</a> and <a href="https://kubernetes.io/">Kubernetes.</a></p>
<p>Turning to microservices and containerization, GolfNow began moving its applications and databases from third-party services to its own clusters running on <a href="https://www.docker.com/">Docker</a> and <a href="http://kubernetes.io/">Kubernetes.</a></p>
<h2>Impact</h2>
@ -50,7 +50,7 @@ It's not every day that you can say you've slashed an operating expense by half.
<p>GolfNow's dev team ran an "internal, low-key" proof of concept and were won over. "We really liked how easy it was to be able to pass containers around to each other and have them up and running in no time, exactly the way it was running on my machine," says Sheriff. "Because that is always the biggest gripe that Ops has with developers, right? 'It worked on my machine!' But then we started getting to the point of, 'How do we make sure that these things stay up and running?'"</p>
<p>That led the team on a quest to find the right orchestration system for the company's needs. Sheriff says the first few options they tried were either too heavy or "didn't feel quite right." In late summer 2015, they discovered the just-released <a href="https://kubernetes.io/">Kubernetes</a>, which Sheriff immediately liked for its ease of use. "We did another proof of concept," he says, "and Kubernetes won because of the fact that the community backing was there, built on top of what Google had already done."</p>
<p>That led the team on a quest to find the right orchestration system for the company's needs. Sheriff says the first few options they tried were either too heavy or "didn't feel quite right." In late summer 2015, they discovered the just-released <a href="http://kubernetes.io/">Kubernetes</a>, which Sheriff immediately liked for its ease of use. "We did another proof of concept," he says, "and Kubernetes won because of the fact that the community backing was there, built on top of what Google had already done."</p>
<p>But before they could go with Kubernetes, <a href="http://www.nbc.com/">NBC</a>, GolfNow's parent company, also asked them to comparison shop with another company. Sheriff and his team liked the competing company's platform user interface, but didn't like that its platform would not allow containers to run natively on Docker. With no clear decision in sight, Sheriff's VP at GolfNow, Steve McElwee, set up a three-month trial during which a GolfNow team (consisting of Sheriff and Josh, who's now Lead Architect, Open Platforms) would build out a Kubernetes environment, and a large NBC team would build out one with the other company's platform.</p>

View File

@ -355,7 +355,7 @@ kubelet স্বয়ংক্রিয়ভাবে প্রতিটি
আপনার কাছে [সাইডকার কন্টেইনার](/bn/docs/concepts/workloads/pods/sidecar-containers/) থাকতে পারে
যেগুলি প্রধান অ্যাপ্লিকেশন পডকে সহায়ক পরিষেবা প্রদান করে (উদাহরণস্বরূপ: একটি পরিষেবা মেশ)।
{{< feature-state feature_gate_name="SidecarContainers" >}}
{{< feature-state for_k8s_version="v1.29" state="beta" >}}
ডিফল্টরূপে সক্রিয় করা হয়েছে, `SidecarContainers` [ফিচার গেট](/bn/docs/reference/command-line-tools-reference/feature-gates/)
init কন্টেইনারগুলির জন্য আপনাকে `restartPolicy: Always` নির্দিষ্ট করতে দেয়।
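
The hunk above is from the Bengali page on init containers: with the `SidecarContainers` feature gate, an init container that sets `restartPolicy: Always` keeps running for the Pod's lifetime and behaves as a sidecar. A minimal sketch of such a Pod (the names, images, and log path below are illustrative and not taken from the page) could look like this:

```yaml
# Illustrative only: an init container acting as a sidecar via restartPolicy: Always
apiVersion: v1
kind: Pod
metadata:
  name: sidecar-demo
spec:
  volumes:
  - name: logs
    emptyDir: {}
  initContainers:
  - name: log-shipper
    image: busybox:1.36
    command: ["sh", "-c", "touch /logs/app.log && tail -F /logs/app.log"]
    restartPolicy: Always      # kept running alongside the main container (SidecarContainers gate)
    volumeMounts:
    - name: logs
      mountPath: /logs
  containers:
  - name: app
    image: busybox:1.36
    command: ["sh", "-c", "while true; do date >> /logs/app.log; sleep 5; done"]
    volumeMounts:
    - name: logs
      mountPath: /logs
```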

View File

@ -15,7 +15,7 @@ Bash-এর জন্য kubectl কমপ্লিশন স্ক্রিপ
কিন্তু, kubectl কমপ্লিসন স্ক্রিপ্ট নির্ভর করে [**bash-completion**](https://github.com/scop/bash-completion) যা আপনাকে আগে ইনস্টল করতে হবে।
{{< warning>}}
bash-completion এর দুটি সংস্করণ আছে, v1 এবং v2। V1 Bash 3.2 এর জন্য (যা macOS-এ ডিফল্ট), এবং v2 হল Bash 4.1+ এর জন্য। kubectl পূর্ণতা স্ক্রিপ্ট ** কাজ করে না** সঠিকভাবে bash-completion v1 এবং Bash 3.2 এর সাথে। এর জন্য **ব্যাশ-সম্পূর্ণ v2** এবং **ব্যাশ 4.1+** প্রয়োজন। সুতরাং, macOS-এ kubectl সমাপ্তি সঠিকভাবে ব্যবহার করতে সক্ষম হতে, আপনাকে Bash 4.1+ ([*instructions*](https://apple.stackexchange.com/a/292760)) ইনস্টল এবং ব্যবহার করতে হবে। নিম্নলিখিত নির্দেশাবলী অনুমান করে যে আপনি Bash 4.1+ ব্যবহার করেন (অর্থাৎ, 4.1 বা তার পরবর্তী যেকোনো Bash সংস্করণ)।
bash-completion এর দুটি সংস্করণ আছে, v1 এবং v2। V1 Bash 3.2 এর জন্য (যা macOS-এ ডিফল্ট), এবং v2 হল Bash 4.1+ এর জন্য। kubectl পূর্ণতা স্ক্রিপ্ট ** কাজ করে না** সঠিকভাবে bash-completion v1 এবং Bash 3.2 এর সাথে। এর জন্য **ব্যাশ-সম্পূর্ণ v2** এবং **ব্যাশ 4.1+** প্রয়োজন। সুতরাং, macOS-এ kubectl সমাপ্তি সঠিকভাবে ব্যবহার করতে সক্ষম হতে, আপনাকে Bash 4.1+ ([*instructions*](https://itnext.io/upgrading-bash-on-macos-7138bd1066ba)) ইনস্টল এবং ব্যবহার করতে হবে। নিম্নলিখিত নির্দেশাবলী অনুমান করে যে আপনি Bash 4.1+ ব্যবহার করেন (অর্থাৎ, 4.1 বা তার পরবর্তী যেকোনো Bash সংস্করণ)।
{{< /warning >}}
### Bash আপগ্রেড করুন

View File

@ -51,7 +51,7 @@ metadata:
namespace: kube-system
data:
my-scheduler-config.yaml: |
apiVersion: kubescheduler.config.k8s.io/v1
apiVersion: kubescheduler.config.k8s.io/v1beta2
kind: KubeSchedulerConfiguration
profiles:
- schedulerName: my-scheduler

View File

@ -25,7 +25,7 @@ spec:
app: mysql
spec:
containers:
- image: mysql:9
- image: mysql:5.6
name: mysql
env:
# Use secret in real usage

View File

@ -6,6 +6,23 @@ type: kubernetes.io/tls
data:
# values are base64 encoded, which obscures them but does NOT provide
# any useful level of confidentiality
# Note: Replace the following values with your own base64-encoded certificate and key.
tls.crt: "REPLACE_WITH_BASE64_CERT"
tls.key: "REPLACE_WITH_BASE64_KEY"
tls.crt: |
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUNVakNDQWJzQ0FnMytNQTBHQ1NxR1NJYjNE
UUVCQlFVQU1JR2JNUXN3Q1FZRFZRUUdFd0pLVURFT01Bd0cKQTFVRUNCTUZWRzlyZVc4eEVEQU9C
Z05WQkFjVEIwTm9kVzh0YTNVeEVUQVBCZ05WQkFvVENFWnlZVzVyTkVSRQpNUmd3RmdZRFZRUUxF
dzlYWldKRFpYSjBJRk4xY0hCdmNuUXhHREFXQmdOVkJBTVREMFp5WVc1ck5FUkVJRmRsCllpQkRR
VEVqTUNFR0NTcUdTSWIzRFFFSkFSWVVjM1Z3Y0c5eWRFQm1jbUZ1YXpSa1pDNWpiMjB3SGhjTk1U
TXcKTVRFeE1EUTFNVE01V2hjTk1UZ3dNVEV3TURRMU1UTTVXakJMTVFzd0NRWURWUVFHREFKS1VE
RVBNQTBHQTFVRQpDQXdHWEZSdmEzbHZNUkV3RHdZRFZRUUtEQWhHY21GdWF6UkVSREVZTUJZR0Ex
VUVBd3dQZDNkM0xtVjRZVzF3CmJHVXVZMjl0TUlHYU1BMEdDU3FHU0liM0RRRUJBUVVBQTRHSUFE
Q0JoQUo5WThFaUhmeHhNL25PbjJTbkkxWHgKRHdPdEJEVDFKRjBReTliMVlKanV2YjdjaTEwZjVN
Vm1UQllqMUZTVWZNOU1vejJDVVFZdW4yRFljV29IcFA4ZQpqSG1BUFVrNVd5cDJRN1ArMjh1bklI
QkphVGZlQ09PekZSUFY2MEdTWWUzNmFScG04L3dVVm16eGFLOGtCOWVaCmhPN3F1TjdtSWQxL2pW
cTNKODhDQXdFQUFUQU5CZ2txaGtpRzl3MEJBUVVGQUFPQmdRQU1meTQzeE15OHh3QTUKVjF2T2NS
OEtyNWNaSXdtbFhCUU8xeFEzazlxSGtyNFlUY1JxTVQ5WjVKTm1rWHYxK2VSaGcwTi9WMW5NUTRZ
RgpnWXcxbnlESnBnOTduZUV4VzQyeXVlMFlHSDYyV1hYUUhyOVNVREgrRlowVnQvRGZsdklVTWRj
UUFEZjM4aU9zCjlQbG1kb3YrcE0vNCs5a1h5aDhSUEkzZXZ6OS9NQT09Ci0tLS0tRU5EIENFUlRJ
RklDQVRFLS0tLS0K
# In this example, the key data is not a real PEM-encoded private key
tls.key: |
RXhhbXBsZSBkYXRhIGZvciB0aGUgVExTIGNydCBmaWVsZA==

View File

@ -85,7 +85,7 @@ type: docs
GitHub অ্যাক্সেস নিয়ন্ত্রণ: [@kubernetes/release-managers](https://github.com/orgs/kubernetes/teams/release-managers)
GitHub উল্লেখ: @kubernetes/release-engineering
GitHub উল্লেখ: [@kubernetes/release-engineering](https://github.com/orgs/kubernetes/teams/release-engineering)
- Adolfo García Veytia ([@puerco](https://github.com/puerco))
- Cici Huang ([@cici37](https://github.com/cici37))

View File

@ -41,11 +41,11 @@ Kubernetes ist Open Source und bietet Dir die Freiheit, die Infrastruktur vor Or
<button id="desktopShowVideoButton" onclick="kub.showVideo()">Video ansehen</button>
<h3>Nehmen Sie an der kommenden KubeCon + CloudNativeCon teil</h3>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe/" class="desktopKCButton"><strong>Europe</strong> (London, Apr 1-4)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-china/" class="desktopKCButton"><strong>China</strong> (Hongkong, Jun 10-11)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-japan/" class="desktopKCButton"><strong>Japan</strong> (Tokio, Jun 16-17)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-india/" class="desktopKCButton"><strong>India</strong> (Hyderabad, Aug 6-7)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america/" class="desktopKCButton"><strong>North America</strong> (Atlanta, Nov 10-13)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe-2026/" class="desktopKCButton"><strong>Europe</strong> (Amsterdam, Mrz 23-26, 2026)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america-2025/" class="desktopKCButton"><strong>North America</strong> (Atlanta, Nov 10-13)</a>
</div>
<div id="videoPlayer">
<iframe data-url="https://www.youtube.com/embed/H06qrNmGqyE?autoplay=1" frameborder="0" allowfullscreen></iframe>

View File

@ -345,7 +345,7 @@ Beide Ansätze sind gleichwertig. Nach dem erneuten Laden der Shell sollte kubec
{{% tab name="Bash auf macOS" %}}
{{< warning>}}
macOS beinhaltet standardmäßig Bash 3.2. Das kubectl-Vervollständigunsskript erfordert Bash 4.1+ und funktioniert nicht mit Bash 3.2. Um dies zu umgehen, können Sie eine neuere Version von Bash unter macOS installieren (folgen Sie den Anweisungen [hier](https://apple.stackexchange.com/a/292760)). Die folgenden Anweisungen funktionieren nur, wenn Sie Bash 4.1 oder höher verwenden.
macOS beinhaltet standardmäßig Bash 3.2. Das kubectl-Vervollständigunsskript erfordert Bash 4.1+ und funktioniert nicht mit Bash 3.2. Um dies zu umgehen, können Sie eine neuere Version von Bash unter macOS installieren (folgen Sie den Anweisungen [hier](https://itnext.io/upgrading-bash-on-macos-7138bd1066ba)). Die folgenden Anweisungen funktionieren nur, wenn Sie Bash 4.1 oder höher verwenden.
{{< /warning >}}
### Einführung

View File

@ -45,9 +45,11 @@ To download Kubernetes, visit the [download](/releases/download/) section.
<button id="desktopShowVideoButton" onclick="kub.showVideo()">Watch Video</button>
<h3>Attend upcoming KubeCon + CloudNativeCon events</h3>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe/" class="desktopKCButton"><strong>Europe</strong> (London, Apr 1-4)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-china/" class="desktopKCButton"><strong>China</strong> (Hong Kong, Jun 10-11)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-japan/" class="desktopKCButton"><strong>Japan</strong> (Tokyo, Jun 16-17)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-india/" class="desktopKCButton"><strong>India</strong> (Hyderabad, Aug 6-7)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america/" class="desktopKCButton"><strong>North America</strong> (Atlanta, Nov 10-13)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-europe-2026/" class="desktopKCButton"><strong>Europe</strong> (Amsterdam, Mar 23-26, 2026)</a>
<a href="https://events.linuxfoundation.org/kubecon-cloudnativecon-north-america-2025/" class="desktopKCButton"><strong>North America</strong> (Atlanta, Nov 10-13)</a>
</div>
<div id="videoPlayer">
<iframe data-url="https://www.youtube.com/embed/H06qrNmGqyE?autoplay=1" frameborder="0" allowfullscreen></iframe>

View File

@ -17,7 +17,7 @@ The GA milestone indicates that Kubernetes users may depend on the feature and i
Although prior to CSI Kubernetes provided a powerful volume plugin system, it was challenging to add support for new volume plugins to Kubernetes: volume plugins were “in-tree” meaning their code was part of the core Kubernetes code and shipped with the core Kubernetes binaries—vendors wanting to add support for their storage system to Kubernetes (or even fix a bug in an existing volume plugin) were forced to align with the Kubernetes release process. In addition, third-party storage code caused reliability and security issues in core Kubernetes binaries and the code was often difficult (and in some cases impossible) for Kubernetes maintainers to test and maintain.
CSI was developed as a standard for exposing arbitrary block and file storage systems to containerized workloads on Container Orchestration Systems (COs) like Kubernetes. With the adoption of the Container Storage Interface, the Kubernetes volume layer becomes truly extensible. Using CSI, third-party storage providers can write and deploy plugins exposing new storage systems in Kubernetes without ever having to touch the core Kubernetes code. This gives Kubernetes users more options for storage and makes the system more secure and reliable.
CSI was developed as a standard for exposing arbitrary block and file storage storage systems to containerized workloads on Container Orchestration Systems (COs) like Kubernetes. With the adoption of the Container Storage Interface, the Kubernetes volume layer becomes truly extensible. Using CSI, third-party storage providers can write and deploy plugins exposing new storage systems in Kubernetes without ever having to touch the core Kubernetes code. This gives Kubernetes users more options for storage and makes the system more secure and reliable.
## What's new?

View File

@ -90,12 +90,3 @@ In the test, we created 400 Secrets, each containing 1 MB of data, and used info
The results were alarming: only 16 informers were needed to cause the test server to run out of memory and crash, demonstrating how quickly memory consumption can grow under such conditions.
Special shout out to [@deads2k](https://github.com/deads2k) for his help in shaping this feature.
## Kubernetes 1.33 update
Since this feature was started, [Marek Siarkowicz](https://github.com/serathius) integrated a new technology into the
Kubernetes API server: _streaming collection encoding_.
Kubernetes v1.33 introduced two related feature gates, `StreamingCollectionEncodingToJSON` and `StreamingCollectionEncodingToProtobuf`.
These features encode via a stream and avoid allocating all the memory at once.
This functionality is bit-for-bit compatible with existing **list** encodings, produces even greater server-side memory savings, and doesn't require any changes to client code.
In 1.33, the `WatchList` feature gate is disabled by default.
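
For context, `WatchList` is toggled on the API server like any other feature gate. One hedged way to do that on a kubeadm-managed cluster (illustrative, not part of the original post; adjust to however your API server is deployed) is via the `ClusterConfiguration` extra args:

```yaml
# Sketch for a kubeadm-managed control plane
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
apiServer:
  extraArgs:
    feature-gates: "WatchList=true"   # WatchList is off by default in v1.33 (see the note above)
```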

View File

@ -7,7 +7,7 @@ author: >
Tabitha Sable (Kubernetes Security Response Committee)
---
Today, the ingress-nginx maintainers have released patches for a batch of critical vulnerabilities that could make it easy for attackers to take over your Kubernetes cluster: [ingress-nginx v1.12.1](https://github.com/kubernetes/ingress-nginx/releases/tag/controller-v1.12.1) and [ingress-nginx v1.11.5](https://github.com/kubernetes/ingress-nginx/releases/tag/controller-v1.11.5). If you are among the over 40% of Kubernetes administrators using [ingress-nginx](https://github.com/kubernetes/ingress-nginx/), you should take action immediately to protect your users and data.
Today, the ingress-nginx maintainers have [released patches for a batch of critical vulnerabilities](https://github.com/kubernetes/ingress-nginx/releases) that could make it easy for attackers to take over your Kubernetes cluster. If you are among the over 40% of Kubernetes administrators using [ingress-nginx](https://github.com/kubernetes/ingress-nginx/), you should take action immediately to protect your users and data.
## Background
@ -23,7 +23,7 @@ Four of today's ingress-nginx vulnerabilities are improvements to how ingress-
The most serious of today's vulnerabilities, [CVE-2025-1974](https://github.com/kubernetes/kubernetes/issues/131009), rated [9.8 CVSS](https://www.first.org/cvss/calculator/3-1#CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H), allows anything on the Pod network to exploit configuration injection vulnerabilities via the Validating Admission Controller feature of ingress-nginx. This makes such vulnerabilities far more dangerous: ordinarily one would need to be able to create an Ingress object in the cluster, which is a fairly privileged action. When combined with today's other vulnerabilities, **CVE-2025-1974 means that anything on the Pod network has a good chance of taking over your Kubernetes cluster, with no credentials or administrative access required**. In many common scenarios, the Pod network is accessible to all workloads in your cloud VPC, or even anyone connected to your corporate network! This is a very serious situation.
Today, we have released [ingress-nginx v1.12.1](https://github.com/kubernetes/ingress-nginx/releases/tag/controller-v1.12.1) and [ingress-nginx v1.11.5](https://github.com/kubernetes/ingress-nginx/releases/tag/controller-v1.11.5), which have fixes for all five of these vulnerabilities.
Today, we have [released ingress-nginx v1.12.1 and v1.11.5](https://github.com/kubernetes/ingress-nginx/releases), which have fixes for all five of these vulnerabilities.
## Your next steps
@ -52,5 +52,3 @@ Thanks go out to Nir Ohfeld, Sagi Tzadik, Ronen Shustin, and Hillai Ben-Sasson f
For further information about the maintenance and future of ingress-nginx, please see this [GitHub issue](https://github.com/kubernetes/ingress-nginx/issues/13002) and/or attend [James and Marcos KubeCon/CloudNativeCon EU 2025 presentation](https://kccnceu2025.sched.com/event/1tcyc/).
For further information about the specific vulnerabilities discussed in this article, please see the appropriate GitHub issue: [CVE-2025-24513](https://github.com/kubernetes/kubernetes/issues/131005), [CVE-2025-24514](https://github.com/kubernetes/kubernetes/issues/131006), [CVE-2025-1097](https://github.com/kubernetes/kubernetes/issues/131007), [CVE-2025-1098](https://github.com/kubernetes/kubernetes/issues/131008), or [CVE-2025-1974](https://github.com/kubernetes/kubernetes/issues/131009)
*This blog post was revised in May 2025 to update the hyperlinks.*

View File

@ -25,7 +25,7 @@ release; make sure to read about those if you already run an older version of Ku
{{< figure src="k8s-1.33.svg" alt="Kubernetes v1.33 logo: Octarine" class="release-logo" >}}
The theme for Kubernetes v1.33 is **Octarine: The Color of Magic**<sup>1</sup>, inspired by Terry
Pratchett's _Discworld_ series. This release highlights the open source magic<sup>2</sup> that
Pratchett's _Discworld_ series. This release highlights the open-source magic<sup>2</sup> that
Kubernetes enables across the ecosystem.
If you're familiar with the world of Discworld, you might recognize a small swamp dragon perched
@ -38,7 +38,7 @@ release is a reminder that, as Pratchett wrote, _“It's still magic even if y
Even if you know the ins and outs of the Kubernetes code base, stepping back at the end of the
release cycle, you'll realize that Kubernetes remains magical.
Kubernetes v1.33 is a testament to the enduring power of open source innovation, where hundreds of
Kubernetes v1.33 is a testament to the enduring power of open-source innovation, where hundreds of
contributors<sup>4</sup> from around the world work together to create something truly
extraordinary. Behind every new feature, the Kubernetes community works to maintain and improve the
project, ensuring it remains secure, reliable, and released on time. Each release builds upon the

View File

@ -1,7 +1,8 @@
---
layout: blog
title: "Kubernetes v1.33: User Namespaces enabled by default!"
date: 2025-04-25T10:30:00-08:00
date: 2025-04-23
draft: true
slug: userns-enabled-by-default
author: >
Rodrigo Campos Catelin (Microsoft),

View File

@ -1,106 +0,0 @@
---
layout: blog
title: "Kubernetes v1.33: Image Volumes graduate to beta!"
date: 2025-04-29T10:30:00-08:00
slug: kubernetes-v1-33-image-volume-beta
author: Sascha Grunert (Red Hat)
---
[Image Volumes](/blog/2024/08/16/kubernetes-1-31-image-volume-source) were
introduced as an Alpha feature with the Kubernetes v1.31 release as part of
[KEP-4639](https://github.com/kubernetes/enhancements/issues/4639). In Kubernetes v1.33, this feature graduates to **beta**.
Please note that the feature is still _disabled_ by default, because not all
[container runtimes](/docs/setup/production-environment/container-runtimes) have
full support for it. [CRI-O](https://cri-o.io) supports the initial feature since version v1.31 and
will add support for Image Volumes as beta in v1.33.
[containerd merged](https://github.com/containerd/containerd/pull/10579) support
for the alpha feature which will be part of the v2.1.0 release and is working on
beta support as part of [PR #11578](https://github.com/containerd/containerd/pull/11578).
### What's new
The major change for the beta graduation of Image Volumes is the support for
[`subPath`](/docs/concepts/storage/volumes/#using-subpath) and
[`subPathExpr`](/docs/concepts/storage/volumes/#using-subpath-expanded-environment) mounts
for containers via `spec.containers[*].volumeMounts.[subPath,subPathExpr]`. This
allows end-users to mount a certain subdirectory of an image volume, which is
still mounted as readonly (`noexec`). This means that non-existing
subdirectories cannot be mounted by default. As for other `subPath` and
`subPathExpr` values, Kubernetes will ensure that there are no absolute path or
relative path components as part of the specified sub path. Container runtimes are
also required to double check those requirements for safety reasons. If a
specified subdirectory does not exist within a volume, then runtimes should fail
on container creation and provide user feedback by using existing kubelet
events.
Besides that, there are also three new kubelet metrics available for image volumes:
- `kubelet_image_volume_requested_total`: Outlines the number of requested image volumes.
- `kubelet_image_volume_mounted_succeed_total`: Counts the number of successful image volume mounts.
- `kubelet_image_volume_mounted_errors_total`: Accounts the number of failed image volume mounts.
To use an existing subdirectory for a specific image volume, just use it as
[`subPath`](/docs/concepts/storage/volumes/#using-subpath) (or
[`subPathExpr`](/docs/concepts/storage/volumes/#using-subpath-expanded-environment))
value of the container's `volumeMounts`:
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: image-volume
spec:
  containers:
  - name: shell
    command: ["sleep", "infinity"]
    image: debian
    volumeMounts:
    - name: volume
      mountPath: /volume
      subPath: dir
  volumes:
  - name: volume
    image:
      reference: quay.io/crio/artifact:v2
      pullPolicy: IfNotPresent
```
Then, create the pod on your cluster:
```shell
kubectl apply -f image-volumes-subpath.yaml
```
Now you can attach to the container:
```shell
kubectl attach -it image-volume bash
```
And check the content of the file from the `dir` sub path in the volume:
```shell
cat /volume/file
```
The output will be similar to:
```none
1
```
Thank you for reading through the end of this blog post! SIG Node is proud and
happy to deliver this feature graduation as part of Kubernetes v1.33.
As writer of this blog post, I would like to emphasize my special thanks to
**all** involved individuals out there!
If you would like to provide feedback or suggestions feel free to reach out
to SIG Node using the [Kubernetes Slack (#sig-node)](https://kubernetes.slack.com/messages/sig-node)
channel or the [SIG Node mailing list](https://groups.google.com/g/kubernetes-sig-node).
## Further reading
- [Use an Image Volume With a Pod](/docs/tasks/configure-pod-container/image-volumes)
- [`image` volume overview](/docs/concepts/storage/volumes/#image)

View File

@ -1,68 +0,0 @@
---
layout: blog
title: "Kubernetes v1.33: Storage Capacity Scoring of Nodes for Dynamic Provisioning (alpha)"
date: 2025-04-30T10:30:00-08:00
slug: kubernetes-v1-33-storage-capacity-scoring-feature
author: >
Yuma Ogami (Cybozu)
---
Kubernetes v1.33 introduces a new alpha feature called `StorageCapacityScoring`. This feature adds a scoring method for pod scheduling
with [the topology-aware volume provisioning](/blog/2018/10/11/topology-aware-volume-provisioning-in-kubernetes/).
This feature makes it easier to schedule pods on nodes with either the most or the least available storage capacity.
## About this feature
This feature extends the kube-scheduler's VolumeBinding plugin to perform scoring using node storage capacity information
obtained from [Storage Capacity](/docs/concepts/storage/storage-capacity/). Currently, you can only filter out nodes with insufficient storage capacity.
So, you have to use a scheduler extender to achieve storage-capacity-based pod scheduling.
This feature is useful for provisioning node-local PVs, which have size limits based on the node's storage capacity. By using this feature,
you can assign the PVs to the nodes with the most available storage space so that you can expand the PVs later as much as possible.
In another use case, you might want to reduce the number of nodes as much as possible to lower operating costs in cloud environments by choosing
the node with the least available storage capacity. This feature helps maximize resource utilization by filling up nodes more sequentially, starting with the most
utilized nodes that still have enough storage capacity for the requested volume size.
## How to use
### Enabling the feature
In the alpha phase, `StorageCapacityScoring` is disabled by default. To use this feature, add `StorageCapacityScoring=true`
to the kube-scheduler command line option `--feature-gates`.
### Configuration changes
You can configure node priorities based on storage utilization using the `shape` parameter in the VolumeBinding plugin configuration.
This allows you to prioritize nodes with higher available storage capacity (default) or, conversely, nodes with lower available storage capacity.
For example, to prioritize lower available storage capacity, configure `KubeSchedulerConfiguration` as follows:
```yaml
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
profiles:
...
  pluginConfig:
  - name: VolumeBinding
    args:
      ...
      shape:
      - utilization: 0
        score: 0
      - utilization: 100
        score: 10
```
For more details, please refer to the [documentation](/docs/reference/config-api/kube-scheduler-config.v1/#kubescheduler-config-k8s-io-v1-VolumeBindingArgs).
## Further reading
- [KEP-4049: Storage Capacity Scoring of Nodes for Dynamic Provisioning](https://github.com/kubernetes/enhancements/blob/master/keps/sig-storage/4049-storage-capacity-scoring-of-nodes-for-dynamic-provisioning/README.md)
## Additional note: Relationship with VolumeCapacityPriority
The alpha feature gate `VolumeCapacityPriority`, which performs node scoring based on available storage capacity during static provisioning,
will be deprecated and replaced by `StorageCapacityScoring`.
Please note that while `VolumeCapacityPriority` prioritizes nodes with lower available storage capacity by default,
`StorageCapacityScoring` prioritizes nodes with higher available storage capacity by default.
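
To make that default concrete, a configuration equivalent to the out-of-the-box behavior (a sketch that mirrors the earlier example, not taken from the KEP) would score nodes with more available capacity higher:

```yaml
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
profiles:
- schedulerName: default-scheduler
  pluginConfig:
  - name: VolumeBinding
    args:
      shape:
      - utilization: 0     # empty node: highest score
        score: 10
      - utilization: 100   # full node: lowest score
        score: 0
```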

View File

@ -1,57 +0,0 @@
---
layout: blog
title: "Kubernetes v1.33: New features in DRA"
slug: kubernetes-v1-33-dra-updates
date: 2025-05-01T10:30:00-08:00
author: >
[Morten Torkildsen](https://github.com/mortent) (Google)
[Patrick Ohly](https://github.com/pohly) (Intel)
---
Kubernetes [Dynamic Resource Allocation](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/) (DRA) was originally introduced as an alpha feature in the v1.26 release, and then went through a significant redesign for Kubernetes v1.31. The main DRA feature went to beta in v1.32, and the project hopes it will be generally available in Kubernetes v1.34.
The basic feature set of DRA provides a far more powerful and flexible API for requesting devices than Device Plugin. And while DRA remains a beta feature for v1.33, the DRA team has been hard at work implementing a number of new features and UX improvements. One feature has been promoted to beta, while a number of new features have been added in alpha. The team has also made progress towards getting DRA ready for GA.
### Features promoted to beta
[Driver-owned Resource Claim Status](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#resourceclaim-device-status) was promoted to beta. This allows the driver to report driver-specific device status data for each allocated device in a resource claim, which is particularly useful for supporting network devices.
### New alpha features
[Partitionable Devices](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#partitionable-devices) lets a driver advertise several overlapping logical devices (“partitions”), and the driver can reconfigure the physical device dynamically based on the actual devices allocated. This makes it possible to partition devices on-demand to meet the needs of the workloads and therefore increase the utilization.
[Device Taints and Tolerations](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#device-taints-and-tolerations) allow devices to be tainted and for workloads to tolerate those taints. This makes it possible for drivers or cluster administrators to mark devices as unavailable. Depending on the effect of the taint, this can prevent devices from being allocated or cause eviction of pods that are using the device.
[Prioritized List](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#prioritized-list) lets users specify a list of acceptable devices for their workloads, rather than just a single type of device. So while the workload might run best on a single high-performance GPU, it might also be able to run on 2 mid-level GPUs. The scheduler will attempt to satisfy the alternatives in the list in order, so the workload will be allocated the best set of devices available in the cluster.
[Admin Access](/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#admin-access) has been updated so that only users with access to a namespace with the `resource.k8s.io/admin-access: "true"` label are authorized to create ResourceClaim or ResourceClaimTemplates objects with the `adminAccess` field within the namespace. This grants administrators access to in-use devices and may enable additional permissions when making the device available in a container. This ensures that non-admin users cannot misuse the feature.
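
As a small illustration of that namespace gating (the namespace name below is hypothetical), an administrator could label a dedicated namespace like this, so that only ResourceClaims or ResourceClaimTemplates created there may set the `adminAccess` field:

```yaml
# Hypothetical admin namespace opted in for DRA admin access
apiVersion: v1
kind: Namespace
metadata:
  name: dra-admins
  labels:
    resource.k8s.io/admin-access: "true"
```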
### Preparing for general availability
A new v1beta2 API has been added to simplify the user experience and to prepare for additional features being added in the future. The RBAC rules for DRA have been improved and support has been added for seamless upgrades of DRA drivers.
### What's next?
The plan for v1.34 is even more ambitious than for v1.33. Most importantly, we (the Kubernetes device management working group) hope to bring DRA to general availability, which will make it available by default on all v1.34 Kubernetes clusters. This also means that many, perhaps all, of the DRA features that are still beta in v1.34 will become enabled by default, making it much easier to use them.
The alpha features that were added in v1.33 will be brought to beta in v1.34.
### Getting involved
A good starting point is joining the WG Device Management [Slack channel](https://kubernetes.slack.com/archives/C0409NGC1TK) and [meetings](https://docs.google.com/document/d/1qxI87VqGtgN7EAJlqVfxx86HGKEAc2A3SKru8nJHNkQ/edit?tab=t.0#heading=h.tgg8gganowxq), which happen at US/EU and EU/APAC friendly time slots.
Not all enhancement ideas are tracked as issues yet, so come talk to us if you want to help or have some ideas yourself! We have work to do at all levels, from difficult core changes to usability enhancements in kubectl, which could be picked up by newcomers.
### Acknowledgments
A huge thanks to everyone who has contributed:
* Cici Huang ([cici37](https://github.com/cici37))
* Ed Bartosh ([bart0sh](https://github.com/bart0sh))
* John Belamaric ([johnbelamaric](https://github.com/johnbelamaric))
* Jon Huhn ([nojnhuh](https://github.com/nojnhuh))
* Kevin Klues ([klueska](https://github.com/klueska))
* Morten Torkildsen ([mortent](https://github.com/mortent))
* Patrick Ohly ([pohly](https://github.com/pohly))
* Rita Zhang ([ritazh](https://github.com/ritazh))
* Shingo Omura ([everpeace](https://github.com/everpeace))

View File

@ -1,74 +0,0 @@
---
layout: blog
title: "Kubernetes v1.33: Mutable CSI Node Allocatable Count"
date: 2025-05-02T10:30:00-08:00
slug: kubernetes-1-33-mutable-csi-node-allocatable-count
author: Eddie Torres (Amazon Web Services)
---
Scheduling stateful applications reliably depends heavily on accurate information about resource availability on nodes.
Kubernetes v1.33 introduces an alpha feature called *mutable CSI node allocatable count*, allowing Container Storage Interface (CSI) drivers to dynamically update the reported maximum number of volumes that a node can handle.
This capability significantly enhances the accuracy of pod scheduling decisions and reduces scheduling failures caused by outdated volume capacity information.
## Background
Traditionally, Kubernetes CSI drivers report a static maximum volume attachment limit when initializing. However, actual attachment capacities can change during a node's lifecycle for various reasons, such as:
- Manual or external operations attaching/detaching volumes outside of Kubernetes control.
- Dynamically attached network interfaces or specialized hardware (GPUs, NICs, etc.) consuming available slots.
- Multi-driver scenarios, where one CSI driver's operations affect available capacity reported by another.
Static reporting can cause Kubernetes to schedule pods onto nodes that appear to have capacity but don't, leading to pods stuck in a `ContainerCreating` state.
## Dynamically adapting CSI volume limits
With the new feature gate `MutableCSINodeAllocatableCount`, Kubernetes enables CSI drivers to dynamically adjust and report node attachment capacities at runtime. This ensures that the scheduler has the most accurate, up-to-date view of node capacity.
### How it works
When this feature is enabled, Kubernetes supports two mechanisms for updating the reported node volume limits:
- **Periodic Updates:** CSI drivers specify an interval to periodically refresh the node's allocatable capacity.
- **Reactive Updates:** An immediate update triggered when a volume attachment fails due to exhausted resources (`ResourceExhausted` error).
### Enabling the feature
To use this alpha feature, you must enable the `MutableCSINodeAllocatableCount` feature gate in these components:
- `kube-apiserver`
- `kubelet`
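
As a hedged sketch of the kubelet side (the file path is the common kubeadm default and may differ in your setup), the gate can be set in the kubelet configuration file; the API server takes the equivalent `--feature-gates=MutableCSINodeAllocatableCount=true` flag:

```yaml
# e.g. /var/lib/kubelet/config.yaml -- excerpt, adjust to your deployment
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
featureGates:
  MutableCSINodeAllocatableCount: true
```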
### Example CSI driver configuration
Below is an example of configuring a CSI driver to enable periodic updates every 60 seconds:
```yaml
apiVersion: storage.k8s.io/v1
kind: CSIDriver
metadata:
  name: example.csi.k8s.io
spec:
  nodeAllocatableUpdatePeriodSeconds: 60
```
This configuration directs Kubelet to periodically call the CSI driver's `NodeGetInfo` method every 60 seconds, updating the node's allocatable volume count. Kubernetes enforces a minimum update interval of 10 seconds to balance accuracy and resource usage.
### Immediate updates on attachment failures
In addition to periodic updates, Kubernetes now reacts to attachment failures. Specifically, if a volume attachment fails with a `ResourceExhausted` error (gRPC code `8`), an immediate update is triggered to correct the allocatable count promptly.
This proactive correction prevents repeated scheduling errors and helps maintain cluster health.
## Getting started
To experiment with mutable CSI node allocatable count in your Kubernetes v1.33 cluster:
1. Enable the feature gate `MutableCSINodeAllocatableCount` on the `kube-apiserver` and `kubelet` components.
2. Update your CSI driver configuration by setting `nodeAllocatableUpdatePeriodSeconds`.
3. Monitor and observe improvements in scheduling accuracy and pod placement reliability.
## Next steps
This feature is currently in alpha and the Kubernetes community welcomes your feedback. Test it, share your experiences, and help guide its evolution toward beta and GA stability.
Join discussions in the [Kubernetes Storage Special Interest Group (SIG-Storage)](https://github.com/kubernetes/community/tree/master/sig-storage) to shape the future of Kubernetes storage capabilities.

View File

@ -1,127 +0,0 @@
---
layout: blog
title: 'Kubernetes v1.33: Prevent PersistentVolume Leaks When Deleting out of Order graduates to GA'
date: 2025-05-05T10:30:00-08:00
slug: kubernetes-v1-33-prevent-persistentvolume-leaks-when-deleting-out-of-order-graduate-to-ga
author: >
Deepak Kinni (Broadcom)
---
I am thrilled to announce that the feature to prevent
[PersistentVolume](/docs/concepts/storage/persistent-volumes/) (or PVs for short)
leaks when deleting out of order has graduated to General Availability (GA) in
Kubernetes v1.33! This improvement, initially introduced as a beta
feature in Kubernetes v1.31, ensures that your storage resources are properly
reclaimed, preventing unwanted leaks.
## How did reclaim work in previous Kubernetes releases?
[PersistentVolumeClaim](/docs/concepts/storage/persistent-volumes/#Introduction) (or PVC for short) is
a user's request for storage. A PV and PVC are considered [Bound](/docs/concepts/storage/persistent-volumes/#Binding)
if a newly created PV or a matching PV is found. The PVs themselves are
backed by volumes allocated by the storage backend.
Normally, if the volume is to be deleted, then the expectation is to delete the
PVC for a bound PV-PVC pair. However, there are no restrictions on deleting a PV
before deleting a PVC.
For a `Bound` PV-PVC pair, the ordering of PV-PVC deletion determines whether
the PV reclaim policy is honored. The reclaim policy is honored if the PVC is
deleted first; however, if the PV is deleted prior to deleting the PVC, then the
reclaim policy is not exercised. As a result of this behavior, the associated
storage asset in the external infrastructure is not removed.
## PV reclaim policy with Kubernetes v1.33
With the graduation to GA in Kubernetes v1.33, this issue is now resolved. Kubernetes
now reliably honors the configured `Delete` reclaim policy, even when PVs are deleted
before their bound PVCs. This is achieved through the use of finalizers,
ensuring that the storage backend releases the allocated storage resource as intended.
### How does it work?
For CSI volumes, the new behavior is achieved by adding a [finalizer](/docs/concepts/overview/working-with-objects/finalizers/) `external-provisioner.volume.kubernetes.io/finalizer`
on new and existing PVs. The finalizer is only removed after the storage from the backend is deleted. Addition and removal of the finalizer are handled by the `external-provisioner`.
Here is an example of a PV with the finalizer; notice the new finalizer in the finalizers list:
```
kubectl get pv pvc-a7b7e3ba-f837-45ba-b243-dec7d8aaed53 -o yaml
```
```yaml
apiVersion: v1
kind: PersistentVolume
metadata:
annotations:
pv.kubernetes.io/provisioned-by: csi.example.driver.com
creationTimestamp: "2021-11-17T19:28:56Z"
finalizers:
- kubernetes.io/pv-protection
- external-provisioner.volume.kubernetes.io/finalizer
name: pvc-a7b7e3ba-f837-45ba-b243-dec7d8aaed53
resourceVersion: "194711"
uid: 087f14f2-4157-4e95-8a70-8294b039d30e
spec:
accessModes:
- ReadWriteOnce
capacity:
storage: 1Gi
claimRef:
apiVersion: v1
kind: PersistentVolumeClaim
name: example-vanilla-block-pvc
namespace: default
resourceVersion: "194677"
uid: a7b7e3ba-f837-45ba-b243-dec7d8aaed53
csi:
driver: csi.example.driver.com
fsType: ext4
volumeAttributes:
storage.kubernetes.io/csiProvisionerIdentity: 1637110610497-8081-csi.example.driver.com
type: CNS Block Volume
volumeHandle: 2dacf297-803f-4ccc-afc7-3d3c3f02051e
persistentVolumeReclaimPolicy: Delete
storageClassName: example-vanilla-block-sc
volumeMode: Filesystem
status:
phase: Bound
```
The [finalizer](/docs/concepts/overview/working-with-objects/finalizers/) prevents this
PersistentVolume from being removed from the
cluster. As stated previously, the finalizer is only removed from the PV object
after it is successfully deleted from the storage backend. To learn more about
finalizers, please refer to [Using Finalizers to Control Deletion](/blog/2021/05/14/using-finalizers-to-control-deletion/).
Similarly, the finalizer `kubernetes.io/pv-controller` is added to dynamically provisioned in-tree plugin volumes.
### Important note
The fix does not apply to statically provisioned in-tree plugin volumes.
## How to enable new behavior?
To take advantage of the new behavior, you must have upgraded your cluster to the v1.33 release of Kubernetes
and run the CSI [`external-provisioner`](https://github.com/kubernetes-csi/external-provisioner) version `5.0.1` or later.
The feature was released as beta in the v1.31 release of Kubernetes, where it was enabled by default.
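As a quick sanity check (the PV name below is a placeholder), you can confirm that the finalizer has been added to a provisioned PV; for CSI volumes the output should include `external-provisioner.volume.kubernetes.io/finalizer`:
```
kubectl get pv <pv-name> -o jsonpath='{.metadata.finalizers}'
```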
## References
* [KEP-2644](https://github.com/kubernetes/enhancements/tree/master/keps/sig-storage/2644-honor-pv-reclaim-policy)
* [Volume leak issue](https://github.com/kubernetes-csi/external-provisioner/issues/546)
* [Beta Release Blog](/blog/2024/08/16/kubernetes-1-31-prevent-persistentvolume-leaks-when-deleting-out-of-order/)
## How do I get involved?
The [SIG Storage communication channels](https://github.com/kubernetes/community/blob/master/sig-storage/README.md#contact), including the Kubernetes Slack channel, are great mediums to reach out to the SIG Storage and migration working group teams.
Special thanks to the following people for the insightful reviews, thorough consideration and valuable contribution:
* Fan Baofa (carlory)
* Jan Šafránek (jsafrane)
* Xing Yang (xing-yang)
* Matthew Wong (wongma7)
Join the [Kubernetes Storage Special Interest Group (SIG)](https://github.com/kubernetes/community/tree/master/sig-storage) if you're interested in getting involved with the design and development of CSI or any part of the Kubernetes Storage system. We're rapidly growing and always welcome new contributors.

View File

@ -1,201 +0,0 @@
---
layout: blog
title: 'Kubernetes v1.33: Fine-grained SupplementalGroups Control Graduates to Beta'
date: 2025-05-06T10:30:00-08:00
slug: kubernetes-v1-33-fine-grained-supplementalgroups-control-beta
author: >
Shingo Omura (LY Corporation)
---
The new field, `supplementalGroupsPolicy`, was introduced as an opt-in alpha feature for Kubernetes v1.31 and has graduated to beta in v1.33; the corresponding feature gate (`SupplementalGroupsPolicy`) is now enabled by default. This feature enables more precise control over supplemental groups in containers, which can strengthen the security posture, particularly when accessing volumes. Moreover, it also enhances the transparency of UID/GID details in containers, offering improved security oversight.
Please be aware that this beta release contains a breaking behavioral change. See the [The Behavioral Changes Introduced In Beta](#the-behavioral-changes-introduced-in-beta) and [Upgrade Considerations](#upgrade-consideration) sections for details.
## Motivation: Implicit group memberships defined in `/etc/group` in the container image
Although the majority of Kubernetes cluster admins/users may not be aware, Kubernetes, by default, _merges_ group information from the Pod with information defined in `/etc/group` in the container image.
Let's look at an example. The Pod manifest below specifies `runAsUser=1000`, `runAsGroup=3000` and `supplementalGroups=4000` in the Pod's security context.
```yaml
apiVersion: v1
kind: Pod
metadata:
name: implicit-groups
spec:
securityContext:
runAsUser: 1000
runAsGroup: 3000
supplementalGroups: [4000]
containers:
- name: ctr
image: registry.k8s.io/e2e-test-images/agnhost:2.45
command: [ "sh", "-c", "sleep 1h" ]
securityContext:
allowPrivilegeEscalation: false
```
What is the result of the `id` command in the `ctr` container? The output should be similar to this:
```none
uid=1000 gid=3000 groups=3000,4000,50000
```
Where does the group ID `50000` in the supplementary groups (`groups` field) come from, even though `50000` is not defined in the Pod's manifest at all? The answer is the `/etc/group` file in the container image.
The contents of `/etc/group` in the container image should look like this:
```none
user-defined-in-image:x:1000:
group-defined-in-image:x:50000:user-defined-in-image
```
The last entry shows that the container's primary user `1000` belongs to the group `50000`.
Thus, the group membership defined in `/etc/group` in the container image for the container's primary user is _implicitly_ merged to the information from the Pod. Please note that this was a design decision the current CRI implementations inherited from Docker, and the community never really reconsidered it until now.
### What's wrong with it?
The _implicitly_ merged group information from `/etc/group` in the container image poses a security risk. These implicit GIDs can't be detected or validated by policy engines because there's no record of them in the Pod manifest. This can lead to unexpected access control issues, particularly when accessing volumes (see [kubernetes/kubernetes#112879](https://issue.k8s.io/112879) for details) because file permission is controlled by UID/GIDs in Linux.
## Fine-grained supplemental groups control in a Pod: `supplementalGroupsPolicy`
To tackle the above problem, the Pod's `.spec.securityContext` now includes the `supplementalGroupsPolicy` field.
This field lets you control how Kubernetes calculates the supplementary groups for container processes within a Pod. The available policies are:
* _Merge_: The group membership defined in `/etc/group` for the container's primary user will be merged. If not specified, this policy will be applied (i.e. as-is behavior for backward compatibility).
* _Strict_: Only the group IDs specified in `fsGroup`, `supplementalGroups`, or `runAsGroup` are attached as supplementary groups to the container processes. Group memberships defined in `/etc/group` for the container's primary user are ignored.
Let's see how the `Strict` policy works. The Pod manifest below specifies `supplementalGroupsPolicy: Strict`:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: strict-supplementalgroups-policy
spec:
securityContext:
runAsUser: 1000
runAsGroup: 3000
supplementalGroups: [4000]
supplementalGroupsPolicy: Strict
containers:
- name: ctr
image: registry.k8s.io/e2e-test-images/agnhost:2.45
command: [ "sh", "-c", "sleep 1h" ]
securityContext:
allowPrivilegeEscalation: false
```
The result of the `id` command in the `ctr` container should be similar to this:
```none
uid=1000 gid=3000 groups=3000,4000
```
You can see that the `Strict` policy excludes group `50000` from `groups`!
Thus, ensuring `supplementalGroupsPolicy: Strict` (enforced by some policy mechanism) helps prevent the implicit supplementary groups in a Pod.
{{<note>}}
A container with sufficient privileges can change its process identity. The `supplementalGroupsPolicy` only affects the initial process identity. See the following section for details.
{{</note>}}
## Attached process identity in Pod status
This feature also exposes the process identity attached to the first container process of the container
via the `.status.containerStatuses[].user.linux` field. This is helpful for checking whether implicit group IDs are attached.
```yaml
...
status:
containerStatuses:
- name: ctr
user:
linux:
gid: 3000
supplementalGroups:
- 3000
- 4000
uid: 1000
...
```
{{<note>}}
Please note that the value in the `status.containerStatuses[].user.linux` field is the _initially attached_
process identity of the first container process in the container. If the container has sufficient privilege
to call system calls related to process identity (e.g. [`setuid(2)`](https://man7.org/linux/man-pages/man2/setuid.2.html), [`setgid(2)`](https://man7.org/linux/man-pages/man2/setgid.2.html) or [`setgroups(2)`](https://man7.org/linux/man-pages/man2/setgroups.2.html), etc.), the container process can change its identity. Thus, the _actual_ process identity will be dynamic.
{{</note>}}
## `Strict` Policy requires newer CRI versions
The CRI runtime (e.g. containerd, CRI-O) plays a core role in calculating the supplementary group IDs attached to containers. Thus, `supplementalGroupsPolicy: Strict` requires a CRI runtime that supports this feature (`supplementalGroupsPolicy: Merge` works even with CRI runtimes that do not support it, because that policy is fully backward compatible).
Here are some CRI runtimes that support this feature, and the versions you need
to be running:
- containerd: v2.0 or later
- CRI-O: v1.31 or later
You can check whether the feature is supported in the Node's `.status.features.supplementalGroupsPolicy` field.
```yaml
apiVersion: v1
kind: Node
...
status:
features:
supplementalGroupsPolicy: true
```
## The behavioral changes introduced in beta
In the alpha release, when a Pod with `supplementalGroupsPolicy: Strict` was scheduled to a node that did not support the feature (i.e., `.status.features.supplementalGroupsPolicy=false`), the Pod's supplemental groups policy silently fell back to `Merge`.
In v1.33, the feature has entered beta and the policy is now enforced more strictly: the kubelet rejects pods whose nodes cannot ensure the specified policy. If your pod is rejected, you will see warning events with `reason=SupplementalGroupsPolicyNotSupported` like below:
```yaml
apiVersion: v1
kind: Event
...
type: Warning
reason: SupplementalGroupsPolicyNotSupported
message: "SupplementalGroupsPolicy=Strict is not supported in this node"
involvedObject:
apiVersion: v1
kind: Pod
...
```
## Upgrade consideration
If you're already using this feature, especially the `supplementalGroupsPolicy: Strict` policy, we assume that your cluster's CRI runtimes already support this feature. In that case, you don't need to worry about the pod rejections described above.
However, if your cluster:
- uses the `supplementalGroupsPolicy: Strict` policy, but
- its CRI runtimes do NOT yet support the feature (i.e., `.status.features.supplementalGroupsPolicy=false`),
you need to prepare for the behavioral change (pod rejection) when upgrading your cluster.
We recommend several ways to avoid unexpected pod rejections:
- Upgrading your cluster's CRI runtimes together with Kubernetes, or before upgrading Kubernetes
- Labeling your nodes to indicate whether their CRI runtime supports this feature, and adding a node selector to pods that use the `Strict` policy so they are scheduled only onto supporting nodes (see the sketch below; note that in this case you will need to monitor the number of `Pending` pods instead of pod rejections).
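Here is a minimal sketch of the second approach. The label key `example.com/supports-strict-supplemental-groups` is a hypothetical one that you would apply to nodes whose CRI runtime supports the feature:
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: strict-policy-pod
spec:
  nodeSelector:
    example.com/supports-strict-supplemental-groups: "true"   # hypothetical node label
  securityContext:
    runAsUser: 1000
    runAsGroup: 3000
    supplementalGroupsPolicy: Strict
  containers:
  - name: ctr
    image: registry.k8s.io/e2e-test-images/agnhost:2.45
    command: [ "sh", "-c", "sleep 1h" ]
```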
## Getting involved
This feature is driven by the [SIG Node](https://github.com/kubernetes/community/tree/master/sig-node) community.
Please join us to connect with the community and share your ideas and feedback around the above feature and
beyond. We look forward to hearing from you!
## How can I learn more?
<!-- https://github.com/kubernetes/website/pull/46920 -->
- [Configure a Security Context for a Pod or Container](/docs/tasks/configure-pod-container/security-context/)
for the further details of `supplementalGroupsPolicy`
- [KEP-3619: Fine-grained SupplementalGroups control](https://github.com/kubernetes/enhancements/issues/3619)

View File

@ -1,162 +0,0 @@
---
layout: blog
title: "Kubernetes v1.33: From Secrets to Service Accounts: Kubernetes Image Pulls Evolved"
date: 2025-05-07T10:30:00-08:00
slug: kubernetes-v1-33-wi-for-image-pulls
author: >
[Anish Ramasekar](https://github.com/aramase) (Microsoft)
---
Kubernetes has steadily evolved to reduce reliance on long-lived credentials
stored in the API.
A prime example of this shift is the transition of Kubernetes Service Account (KSA) tokens
from long-lived, static tokens to ephemeral, automatically rotated tokens
with OpenID Connect (OIDC)-compliant semantics.
This advancement enables workloads to securely authenticate with external services
without needing persistent secrets.
However, one major gap remains: **image pull authentication**.
Today, Kubernetes clusters rely on image pull secrets stored in the API,
which are long-lived and difficult to rotate,
or on node-level kubelet credential providers,
which allow any pod running on a node to access the same credentials.
This presents security and operational challenges.
To address this, Kubernetes is introducing **Service Account Token Integration
for Kubelet Credential Providers**, now available in **alpha**.
This enhancement allows credential providers to use pod-specific service account tokens
to obtain registry credentials, which kubelet can then use for image pulls —
eliminating the need for long-lived image pull secrets.
## The problem with image pull secrets
Currently, Kubernetes administrators have two primary options
for handling private container image pulls:
1. **Image pull secrets stored in the Kubernetes API**
- These secrets are often long-lived because they are hard to rotate.
- They must be explicitly attached to a service account or pod.
- Compromise of a pull secret can lead to unauthorized image access.
2. **Kubelet credential providers**
- These providers fetch credentials dynamically at the node level.
- Any pod running on the node can access the same credentials.
- There's no per-workload isolation, increasing security risks.
Neither approach aligns with the principles of **least privilege**
or **ephemeral authentication**, leaving Kubernetes with a security gap.
## The solution: Service Account token integration for Kubelet credential providers
This new enhancement enables kubelet credential providers
to use **workload identity** when fetching image registry credentials.
Instead of relying on long-lived secrets, credential providers can use
service account tokens to request short-lived credentials
tied to a specific pod's identity.
This approach provides:
- **Workload-specific authentication**:
Image pull credentials are scoped to a particular workload.
- **Ephemeral credentials**:
Tokens are automatically rotated, eliminating the risks of long-lived secrets.
- **Seamless integration**:
Works with existing Kubernetes authentication mechanisms,
aligning with cloud-native security best practices.
## How it works
### 1. Service Account tokens for credential providers
Kubelet generates **short-lived, automatically rotated** tokens for service accounts
if the credential provider it communicates with has opted into receiving
a service account token for image pulls.
These tokens conform to OIDC ID token semantics
and are provided to the credential provider
as part of the `CredentialProviderRequest`.
The credential provider can then use this token
to authenticate with an external service.
### 2. Image registry authentication flow
- When a pod starts, the kubelet requests credentials from a **credential provider**.
- If the credential provider has opted in,
the kubelet generates a **service account token** for the pod.
- The **service account token is included in the `CredentialProviderRequest`**,
allowing the credential provider to authenticate
and exchange it for **temporary image pull credentials**
from a registry (e.g. AWS ECR, GCP Artifact Registry, Azure ACR).
- The kubelet then uses these credentials
to pull images on behalf of the pod.
## Benefits of this approach
- **Security**:
Eliminates long-lived image pull secrets, reducing attack surfaces.
- **Granular Access Control**:
Credentials are tied to individual workloads rather than entire nodes or clusters.
- **Operational Simplicity**:
No need for administrators to manage and rotate image pull secrets manually.
- **Improved Compliance**:
Helps organizations meet security policies
that prohibit persistent credentials in the cluster.
## What's next?
For Kubernetes **v1.34**, we expect to ship this feature in **beta**
while continuing to gather feedback from users.
In the coming releases, we will focus on:
- Implementing **caching mechanisms**
to improve performance for token generation.
- Giving more **flexibility to credential providers**
to decide how the registry credentials returned to the kubelet are cached.
- Making the feature work with
[Ensure Secret Pulled Images](https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2535-ensure-secret-pulled-images)
to ensure pods that use an image
are authorized to access that image
when service account tokens are used for authentication.
You can learn more about this feature
on the [service account token for image pulls](/docs/tasks/administer-cluster/kubelet-credential-provider/#service-account-token-for-image-pulls)
page in the Kubernetes documentation.
You can also follow along on the
[KEP-4412](https://kep.k8s.io/4412)
to track progress across the coming Kubernetes releases.
## Try it out
To try out this feature:
1. **Ensure you are running Kubernetes v1.33 or later**.
2. **Enable the `ServiceAccountTokenForKubeletCredentialProviders` feature gate**
on the kubelet.
3. **Ensure credential provider support**:
Modify or update your credential provider
to use service account tokens for authentication.
4. **Update the credential provider configuration**
to opt into receiving service account tokens
for the credential provider by configuring the `tokenAttributes` field (a sketch of such a configuration follows this list).
5. **Deploy a pod**
that uses the credential provider to pull images from a private registry.
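As a rough illustration of step 4, the sketch below shows what a kubelet credential provider configuration with `tokenAttributes` might look like. The provider name, image match pattern, audience, and exact sub-field names are assumptions for illustration only; consult the linked documentation for the authoritative schema:
```yaml
apiVersion: kubelet.config.k8s.io/v1
kind: CredentialProviderConfig
providers:
- name: example-credential-provider        # hypothetical provider binary name
  matchImages:
  - "*.registry.example.com"               # hypothetical registry pattern
  defaultCacheDuration: "10m"
  apiVersion: credentialprovider.kubelet.k8s.io/v1
  tokenAttributes:
    serviceAccountTokenAudience: registry.example.com   # audience the provider expects (assumed)
    requireServiceAccount: true                          # only serve pods running as a service account (assumed)
```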
We would love to hear your feedback on this feature.
Please reach out to us on the
[#sig-auth-authenticators-dev](https://kubernetes.slack.com/archives/C04UMAUC4UA)
channel on Kubernetes Slack
(for an invitation, visit [https://slack.k8s.io/](https://slack.k8s.io/)).
## How to get involved
If you are interested in getting involved
in the development of this feature,
sharing feedback, or participating in any other ongoing **SIG Auth** projects,
please reach out on the
[#sig-auth](https://kubernetes.slack.com/archives/C0EN96KUY)
channel on Kubernetes Slack.
You are also welcome to join the bi-weekly
[SIG Auth meetings](https://github.com/kubernetes/community/blob/master/sig-auth/README.md#meetings),
held every other Wednesday.

View File

@ -1,79 +0,0 @@
---
layout: blog
title: "Kubernetes 1.33: Volume Populators Graduate to GA"
date: 2025-05-08T10:30:00-08:00
slug: kubernetes-v1-33-volume-populators-ga
author: >
Danna Wang (Google)
Sunny Song (Google)
---
Kubernetes _volume populators_ are now generally available (GA)! The `AnyVolumeDataSource` feature
gate is treated as always enabled for Kubernetes v1.33, which means that users can specify any appropriate
[custom resource](/docs/concepts/extend-kubernetes/api-extension/custom-resources/#custom-resources)
as the data source of a PersistentVolumeClaim (PVC).
An example of how to use `dataSourceRef` in a PVC:
```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: pvc1
spec:
...
dataSourceRef:
apiGroup: provider.example.com
kind: Provider
name: provider1
```
## What is new
There are four major enhancements from beta.
### Populator Pod is optional
During the beta phase, contributors to Kubernetes identified potential resource leaks with PersistentVolumeClaim (PVC) deletion while volume population was in progress; these leaks happened due to limitations in finalizer handling.
Ahead of the graduation to general availability, the Kubernetes project added support to delete temporary resources (PVC prime, etc.) if the original PVC is deleted.
To accommodate this, we've introduced three new plugin-based functions:
* `PopulateFn()`: Executes the provider-specific data population logic.
* `PopulateCompleteFn()`: Checks if the data population operation has finished successfully.
* `PopulateCleanupFn()`: Cleans up temporary resources created by the provider-specific functions after data population is completed
A provider example is added in [lib-volume-populator/example](https://github.com/kubernetes-csi/lib-volume-populator/tree/master/example).
### Mutator functions to modify the Kubernetes resources
For GA, the CSI volume populator controller code gained a `MutatorConfig`, allowing the specification of mutator functions to modify Kubernetes resources.
For example, if the PVC prime is not an exact copy of the PVC and you need provider-specific information for the driver, you can include this information in the optional `MutatorConfig`.
This allows you to customize the Kubernetes objects in the volume populator.
### Flexible metric handling for providers
Our beta phase highlighted a new requirement: the need to aggregate metrics not just from lib-volume-populator, but also from other components within the provider's codebase.
To address this, SIG Storage introduced a [provider metric manager](https://github.com/kubernetes-csi/lib-volume-populator/blob/8a922a5302fdba13a6c27328ee50e5396940214b/populator-machinery/controller.go#L122).
This enhancement delegates the implementation of metrics logic to the provider itself, rather than relying solely on lib-volume-populator.
This shift provides greater flexibility and control over metrics collection and aggregation, enabling a more comprehensive view of provider performance.
### Clean up for temporary resources
During the beta phase, we identified potential resource leaks with PersistentVolumeClaim (PVC) deletion while volume population was in progress, due to limitations in finalizer handling. We have improved the populator to support the deletion of temporary resources (PVC prime, etc.) if the original PVC is deleted in this GA release.
## How to use it
To try it out, please follow the [steps](/blog/2022/05/16/volume-populators-beta/#trying-it-out) in the previous beta blog.
## Future directions and potential feature requests
As next steps, there are several potential feature requests for the volume populator:
* Multi sync: the current implementation is a one-time unidirectional sync from source to destination. This can be extended to support multiple syncs, enabling periodic syncs or allowing users to sync on demand
* Bidirectional sync: an extension of multi sync above, but making it bidirectional between source and destination
* Populate data with priorities: with a list of different dataSourceRef, populate based on priorities
* Populate data from multiple sources of the same provider: populate multiple different sources to one destination
* Populate data from multiple sources of the different providers: populate multiple different sources to one destination, pipelining different resources population
To ensure we're building something truly valuable, Kubernetes SIG Storage would love to hear about any specific use cases you have in mind for this feature.
For any inquiries or specific questions related to volume populator, please reach out to the [SIG Storage community](https://github.com/kubernetes/community/tree/master/sig-storage).

View File

@ -1,53 +0,0 @@
---
layout: blog
title: "Kubernetes v1.33: Streaming List responses"
date: 2025-05-09T10:30:00-08:00
slug: kubernetes-v1-33-streaming-list-responses
author: >
Marek Siarkowicz (Google),
Wei Fu (Microsoft)
---
Managing Kubernetes cluster stability becomes increasingly critical as your infrastructure grows. One of the most challenging aspects of operating large-scale clusters has been handling List requests that fetch substantial datasets - a common operation that could unexpectedly impact your cluster's stability.
Today, the Kubernetes community is excited to announce a significant architectural improvement: streaming encoding for List responses.
## The problem: unnecessary memory consumption with large resources
Current API response encoders just serialize an entire response into a single contiguous memory block and perform one [ResponseWriter.Write](https://pkg.go.dev/net/http#ResponseWriter.Write) call to transmit data to the client. Despite HTTP/2's capability to split responses into smaller frames for transmission, the underlying HTTP server continues to hold the complete response data as a single buffer. Even as individual frames are transmitted to the client, the memory associated with these frames cannot be freed incrementally.
When cluster size grows, the single response body can be substantial - like hundreds of megabytes in size. At large scale, the current approach becomes particularly inefficient, as it prevents incremental memory release during transmission. Imagine that when network congestion occurs, that large response body's memory block stays active for tens of seconds or even minutes. This limitation leads to unnecessarily high and prolonged memory consumption in the kube-apiserver process. If multiple large List requests occur simultaneously, the cumulative memory consumption can escalate rapidly, potentially leading to an Out-of-Memory (OOM) situation that compromises cluster stability.
The encoding/json package uses sync.Pool to reuse memory buffers during serialization. While efficient for consistent workloads, this mechanism creates challenges with sporadic large List responses. When processing these large responses, memory pools expand significantly. But due to sync.Pool's design, these oversized buffers remain reserved after use. Subsequent small List requests continue utilizing these large memory allocations, preventing garbage collection and maintaining persistently high memory consumption in the kube-apiserver even after the initial large responses complete.
Additionally, [Protocol Buffers](https://github.com/protocolbuffers/protocolbuffers.github.io/blob/c14731f55296f8c6367faa4f2e55a3d3594544c6/content/programming-guides/techniques.md?plain=1#L39) are not designed to handle large datasets. But it's great for handling **individual** messages within a large data set. This highlights the need for streaming-based approaches that can process and transmit large collections incrementally rather than as monolithic blocks.
> _As a general rule of thumb, if you are dealing in messages larger than a megabyte each, it may be time to consider an alternate strategy._
>
> _From https://protobuf.dev/programming-guides/techniques/_
## Streaming encoder for List responses
The streaming encoding mechanism is specifically designed for List responses, leveraging their common well-defined collection structures. The core idea focuses exclusively on the **Items** field within collection structures, which represents the bulk of memory consumption in large responses. Rather than encoding the entire **Items** array as one contiguous memory block, the new streaming encoder processes and transmits each item individually, allowing memory to be freed progressively as frame or chunk is transmitted. As a result, encoding items one by one significantly reduces the memory footprint required by the API server.
With Kubernetes objects typically limited to 1.5 MiB (from ETCD), streaming encoding keeps memory consumption predictable and manageable regardless of how many objects are in a List response. The result is significantly improved API server stability, reduced memory spikes, and better overall cluster performance - especially in environments where multiple large List operations might occur simultaneously.
To ensure perfect backward compatibility, the streaming encoder validates Go struct tags rigorously before activation, guaranteeing byte-for-byte consistency with the original encoder. Standard encoding mechanisms process all fields except **Items**, maintaining identical output formatting throughout. This approach seamlessly supports all Kubernetes List types—from built-in **\*List** objects to Custom Resource **UnstructuredList** objects - requiring zero client-side modifications or awareness that the underlying encoding method has changed.
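To make the idea concrete, here is a simplified sketch in Go of item-by-item encoding over an HTTP response. It is not the kube-apiserver implementation, just an illustration of how encoding and flushing each item separately lets buffers be reclaimed progressively:
```go
package main

import (
	"encoding/json"
	"net/http"
)

// Item stands in for a single object in a List response.
type Item struct {
	Name string `json:"name"`
}

// streamList writes a {"items":[...]} collection, encoding and flushing one
// item at a time instead of serializing the whole slice into a single buffer.
func streamList(w http.ResponseWriter, items []Item) error {
	w.Header().Set("Content-Type", "application/json")
	if _, err := w.Write([]byte(`{"items":[`)); err != nil {
		return err
	}
	enc := json.NewEncoder(w)
	for i, item := range items {
		if i > 0 {
			if _, err := w.Write([]byte(",")); err != nil {
				return err
			}
		}
		// Each item is encoded independently; its bytes are handed to the
		// HTTP layer right away rather than accumulating in one big buffer.
		if err := enc.Encode(item); err != nil {
			return err
		}
		// Flushing pushes the frame to the client so memory is freed progressively.
		if f, ok := w.(http.Flusher); ok {
			f.Flush()
		}
	}
	_, err := w.Write([]byte("]}"))
	return err
}

func main() {
	http.HandleFunc("/items", func(w http.ResponseWriter, r *http.Request) {
		_ = streamList(w, []Item{{Name: "a"}, {Name: "b"}, {Name: "c"}})
	})
	_ = http.ListenAndServe("127.0.0.1:8080", nil)
}
```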
## Performance gains you'll notice
* **Reduced Memory Consumption:** Significantly lowers the memory footprint of the API server when handling large **list** requests,
especially when dealing with **large resources**.
* **Improved Scalability:** Enables the API server to handle more concurrent requests and larger datasets without running out of memory.
* **Increased Stability:** Reduces the risk of OOM kills and service disruptions.
* **Efficient Resource Utilization:** Optimizes memory usage and improves overall resource efficiency.
## Benchmark results
To validate the results, Kubernetes has introduced a new **list** benchmark which concurrently executes 10 **list** requests, each returning 1 GB of data.
The benchmark showed a 20x improvement, reducing memory usage from 70-80 GB to 3 GB.
{{< figure src="results.png" alt="Screenshot of a K8s performance dashboard showing memory usage for benchmark list going down from 60GB to 3GB" caption="List benchmark memory usage" >}}

Binary file not shown.


File diff suppressed because one or more lines are too long


View File

@ -1,136 +0,0 @@
---
layout: blog
title: "Kubernetes v1.33: Image Pull Policy the way you always thought it worked!"
date: 2025-05-12T10:30:00-08:00
slug: kubernetes-v1-33-ensure-secret-pulled-images-alpha
author: >
[Ben Petersen](https://github.com/benjaminapetersen) (Microsoft),
[Stanislav Láznička](https://github.com/stlaz) (Microsoft)
---
## Image Pull Policy the way you always thought it worked!
Some things in Kubernetes are surprising, and the way `imagePullPolicy` behaves might
be one of them. Given Kubernetes is all about running pods, it may be peculiar
to learn that there has been a caveat to restricting pod access to authenticated images for
over 10 years in the form of [issue 18787](https://github.com/kubernetes/kubernetes/issues/18787)!
It is an exciting release when you can resolve a ten-year-old issue.
{{< note >}}
Throughout this blog post, the term "pod credentials" will be used often. In this context,
the term generally encapsulates the authentication material that is available to a pod
to authenticate a container image pull.
{{< /note >}}
## IfNotPresent, even if I'm not supposed to have it
The gist of the problem is that the `imagePullPolicy: IfNotPresent` strategy has done
precisely what it says, and nothing more. Let's set up a scenario. To begin, *Pod A* in *Namespace X* is scheduled to *Node 1* and requires *image Foo* from a private repository.
For its image pull authentication material, the pod references *Secret 1* in its `imagePullSecrets`. *Secret 1* contains the necessary credentials to pull from the private repository. The Kubelet will utilize the credentials from *Secret 1* as supplied by *Pod A*
and it will pull *container image Foo* from the registry. This is the intended (and secure)
behavior.
But now things get curious. If *Pod B* in *Namespace Y* happens to also be scheduled to *Node 1*, unexpected (and potentially insecure) things happen. *Pod B* may reference the same private image, specifying the `IfNotPresent` image pull policy. *Pod B* does not reference *Secret 1*
(or in our case, any secret) in its `imagePullSecrets`. When the Kubelet tries to run the pod, it honors the `IfNotPresent` policy. The Kubelet sees that the *image Foo* is already present locally, and will provide *image Foo* to *Pod B*. *Pod B* gets to run the image even though it did not provide credentials authorizing it to pull the image in the first place.
{{< figure
src="ensure_secret_image_pulls.svg"
caption="Using a private image pulled by a different pod"
alt="Illustration of the process of two pods trying to access a private image, the first one with a pull secret, the second one without it"
>}}
While `IfNotPresent` should not pull *image Foo* if it is already present
on the node, it is an incorrect security posture to allow all pods scheduled
to a node to have access to a previously pulled private image. These pods were never
authorized to pull the image in the first place.
## IfNotPresent, but only if I am supposed to have it
In Kubernetes v1.33, we - SIG Auth and SIG Node - have finally started to address this (really old) problem and to get the verification right! The basic expected behavior is not changed. If
an image is not present, the Kubelet will attempt to pull the image. The credentials each pod supplies will be utilized for this task. This matches behavior prior to 1.33.
If the image is present, then the behavior of the Kubelet changes. The Kubelet will now
verify the pod's credentials before allowing the pod to use the image.
Performance and service stability have been a consideration while revising the feature.
Pods utilizing the same credential will not be required to re-authenticate. This is
also true when pods source credentials from the same Kubernetes Secret object, even
when the credentials are rotated.
## Never pull, but use if authorized
The `imagePullPolicy: Never` option does not fetch images. However, if the
container image is already present on the node, any pod attempting to use the private
image will be required to provide credentials, and those credentials require verification.
Pods utilizing the same credential will not be required to re-authenticate.
Pods that do not supply credentials previously used to successfully pull an
image will not be allowed to use the private image.
## Always pull, if authorized
The `imagePullPolicy: Always` has always worked as intended. Each time an image
is requested, the request goes to the registry and the registry will perform an authentication
check.
In the past, forcing the `Always` image pull policy via pod admission was the only way to ensure
that your private container images didn't get reused by other pods on nodes which already pulled the images.
Fortunately, this was somewhat performant. Only the image manifest was pulled, not the image. However, there was still a cost and a risk. During a new rollout, scale up, or pod restart, the image registry that provided the image MUST be available for the auth check, putting the image registry in the critical path for stability of services running inside of the cluster.
## How it all works
The feature is based on persistent, file-based caches that are present on each of
the nodes. The following is a simplified description of how the feature works.
For the complete version, please see [KEP-2535](https://kep.k8s.io/2535).
The process of requesting an image for the first time goes like this:
1. A pod requesting an image from a private registry is scheduled to a node.
1. The image is not present on the node.
1. The Kubelet makes a record of the intention to pull the image.
1. The Kubelet extracts credentials from the Kubernetes Secret referenced by the pod
as an image pull secret, and uses them to pull the image from the private registry.
1. After the image has been successfully pulled, the Kubelet makes a record of
the successful pull. This record includes details about credentials used
(in the form of a hash) as well as the Secret from which they originated.
1. The Kubelet removes the original record of intent.
1. The Kubelet retains the record of successful pull for later use.
When future pods scheduled to the same node request the previously pulled private image:
1. The Kubelet checks the credentials that the new pod provides for the pull.
1. If the hash of these credentials, or the source Secret of the credentials match
the hash or source Secret which were recorded for a previous successful pull,
the pod is allowed to use the previously pulled image.
1. If the credentials or their source Secret are not found in the records of
successful pulls for that image, the Kubelet will attempt to use
these new credentials to request a pull from the remote registry, triggering
the authorization flow.
## Try it out
In Kubernetes v1.33 we shipped the alpha version of this feature. To give it a spin,
enable the `KubeletEnsureSecretPulledImages` feature gate for your 1.33 Kubelets.
You can learn more about the feature and additional optional configuration on the
[concept page for Images](/docs/concepts/containers/images/#ensureimagepullcredentialverification)
in the official Kubernetes documentation.
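For example, if you configure the kubelet through a `KubeletConfiguration` file, enabling the gate could look like the sketch below (other ways of setting kubelet feature gates work just as well):
```yaml
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
featureGates:
  KubeletEnsureSecretPulledImages: true   # alpha in v1.33; off by default
```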
## What's next?
In future releases we are going to:
1. Make this feature work together with [Projected service account tokens for Kubelet image credential providers](https://kep.k8s.io/4412) which adds a new, workload-specific source of image pull credentials.
1. Write a benchmarking suite to measure the performance of this feature and assess the impact of
any future changes.
1. Implement an in-memory caching layer so that we don't need to read files for each image
pull request.
1. Add support for credential expirations, thus forcing previously validated credentials to
be re-authenticated.
## How to get involved
[Reading KEP-2535](https://kep.k8s.io/2535) is a great way to understand these changes in depth.
If you are interested in further involvement, reach out to us on the [#sig-auth-authenticators-dev](https://kubernetes.slack.com/archives/C04UMAUC4UA) channel
on Kubernetes Slack (for an invitation, visit [https://slack.k8s.io/](https://slack.k8s.io/)).
You are also welcome to join the bi-weekly [SIG Auth meetings](https://github.com/kubernetes/community/blob/master/sig-auth/README.md#meetings),
held every other Wednesday.

View File

@ -1,107 +0,0 @@
---
layout: blog
title: "Kubernetes v1.33: Job's Backoff Limit Per Index Goes GA"
date: 2025-05-13T10:30:00-08:00
slug: kubernetes-v1-33-jobs-backoff-limit-per-index-goes-ga
author: >
[Michał Woźniak](https://github.com/mimowo) (Google)
---
In Kubernetes v1.33, the _Backoff Limit Per Index_ feature reaches general
availability (GA). This blog describes the Backoff Limit Per Index feature and
its benefits.
## About backoff limit per index
When you run workloads on Kubernetes, you must consider scenarios where Pod
failures can affect the completion of your workloads. Ideally, your workload
should tolerate transient failures and continue running.
To achieve failure tolerance in a Kubernetes Job, you can set the
`spec.backoffLimit` field. This field specifies the total number of tolerated
failures.
However, for workloads where every index is considered independent, like
[embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel)
workloads - the `spec.backoffLimit` field is often not flexible enough.
For example, you may choose to run multiple suites of integration tests by
representing each suite as an index within an [Indexed Job](/docs/tasks/job/indexed-parallel-processing-static/).
In that setup, a fast-failing index (test suite) is likely to consume your
entire budget for tolerating Pod failures, and you might not be able to run the
other indexes.
In order to address this limitation, Kubernetes introduced _backoff limit per index_,
which allows you to control the number of retries per index.
## How backoff limit per index works
To use Backoff Limit Per Index for Indexed Jobs, specify the number of tolerated
Pod failures per index with the `spec.backoffLimitPerIndex` field. When you set
this field, the Job executes all indexes by default.
Additionally, to fine-tune the error handling:
* Specify the cap on the total number of failed indexes by setting the
`spec.maxFailedIndexes` field. When the limit is exceeded, the entire Job is
terminated.
* Define a short-circuit to detect a failed index by using the `FailIndex` action in the
[Pod Failure Policy](/docs/concepts/workloads/controllers/job/#pod-failure-policy)
mechanism.
When the number of tolerated failures is exceeded, the Job marks that index as
failed and lists it in the Job's `status.failedIndexes` field.
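For illustration, the status of a Job using this feature might look roughly like the snippet below; the index values are hypothetical, and both fields list indexes as comma-separated decimal integers (with consecutive runs compressed into ranges):
```yaml
status:
  completedIndexes: 0-1,3-4,6-9
  failedIndexes: 2,5
```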
### Example
The following Job spec snippet is an example of how to combine backoff limit per
index with the _Pod Failure Policy_ feature:
```yaml
completions: 10
parallelism: 10
completionMode: Indexed
backoffLimitPerIndex: 1
maxFailedIndexes: 5
podFailurePolicy:
rules:
- action: Ignore
onPodConditions:
- type: DisruptionTarget
- action: FailIndex
onExitCodes:
operator: In
values: [ 42 ]
```
In this example, the Job handles Pod failures as follows:
- Ignores any failed Pods that have the built-in
[disruption condition](/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions),
called `DisruptionTarget`. These Pods don't count towards Job backoff limits.
- Fails the index corresponding to the failed Pod if any of the failed Pod's
containers finished with the exit code 42 - based on the matching "FailIndex"
rule.
- Retries the first failure of any index, unless the index failed due to the
matching `FailIndex` rule.
- Fails the entire Job if the number of failed indexes exceeded 5 (set by the
`spec.maxFailedIndexes` field).
## Learn more
- Read the blog post on the closely related feature of Pod Failure Policy [Kubernetes 1.31: Pod Failure Policy for Jobs Goes GA](/blog/2024/08/19/kubernetes-1-31-pod-failure-policy-for-jobs-goes-ga/)
- For a hands-on guide to using Pod failure policy, including the use of FailIndex, see
[Handling retriable and non-retriable pod failures with Pod failure policy](/docs/tasks/job/pod-failure-policy/)
- Read the documentation for
[Backoff limit per index](/docs/concepts/workloads/controllers/job/#backoff-limit-per-index) and
[Pod failure policy](/docs/concepts/workloads/controllers/job/#pod-failure-policy)
- Read the KEP for the [Backoff Limits Per Index For Indexed Jobs](https://github.com/kubernetes/enhancements/tree/master/keps/sig-apps/3850-backoff-limits-per-index-for-indexed-jobs)
## Get involved
This work was sponsored by the Kubernetes
[batch working group](https://github.com/kubernetes/community/tree/master/wg-batch)
in close collaboration with the
[SIG Apps](https://github.com/kubernetes/community/tree/master/sig-apps) community.
If you are interested in working on new features in the space we recommend
subscribing to our [Slack](https://kubernetes.slack.com/messages/wg-batch)
channel and attending the regular community meetings.

View File

@ -1,77 +0,0 @@
---
layout: blog
title: "Kubernetes v1.33: Updates to Container Lifecycle"
date: 2025-05-14T10:30:00-08:00
slug: kubernetes-v1-33-updates-to-container-lifecycle
author: >
Sreeram Venkitesh (DigitalOcean)
---
Kubernetes v1.33 introduces a few updates to the lifecycle of containers. The Sleep action for container lifecycle hooks now supports a zero sleep duration (feature enabled by default).
There is also alpha support for customizing the stop signal sent to containers when they are being terminated.
This blog post goes into the details of these new aspects of the container lifecycle, and how you can use them.
## Zero value for Sleep action
Kubernetes v1.29 introduced the `Sleep` action for container PreStop and PostStart Lifecycle hooks. The Sleep action lets your containers pause for a specified duration after the container is started or before it is terminated. This was needed to provide a straightforward way to manage graceful shutdowns. Before the Sleep action, folks used to run the `sleep` command using the exec action in their container lifecycle hooks. If you wanted to do this you'd need to have the binary for the `sleep` command in your container image. This is difficult if you're using third party images.
When the Sleep action was initially added, it didn't support a sleep duration of zero seconds. The `time.Sleep` function, which the Sleep action uses under the hood, does support a zero duration: using a negative or zero value returns immediately, resulting in a no-op. We wanted the same behaviour for the Sleep action. Support for the zero duration was later added in v1.32, behind the `PodLifecycleSleepActionAllowZero` feature gate.
The `PodLifecycleSleepActionAllowZero` feature gate has graduated to beta in v1.33, and is now enabled by default.
The original Sleep action for `preStop` and `postStart` hooks has been enabled by default since Kubernetes v1.30.
With a cluster running Kubernetes v1.33, you are able to set a
zero duration for sleep lifecycle hooks. For a cluster with default configuration, you don't need
to enable any feature gate to make that possible.
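As a small illustration (the Pod name and image are placeholders), a zero-second `preStop` sleep hook looks like this:
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: sleep-zero-demo   # placeholder name
spec:
  containers:
  - name: app
    image: registry.k8s.io/e2e-test-images/agnhost:2.45
    command: [ "sh", "-c", "sleep 1h" ]
    lifecycle:
      preStop:
        sleep:
          seconds: 0   # returns immediately, effectively a no-op hook
```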
## Container stop signals
Container runtimes such as containerd and CRI-O honor a `StopSignal` instruction in the container image definition. This can be used to specify a custom stop signal
that the runtime will use to terminate containers based on that image.
Stop signal configuration was not originally part of the Pod API in Kubernetes.
Until Kubernetes v1.33, the only way to override the stop signal for containers was by rebuilding your container image with the new custom stop signal
(for example, specifying `STOPSIGNAL` in a `Containerfile` or `Dockerfile`).
The `ContainerStopSignals` feature gate which is newly added in Kubernetes v1.33 adds stop signals to the Kubernetes API. This allows users to specify a custom stop signal in the container spec. Stop signals are added to the API as a new lifecycle along with the existing PreStop and PostStart lifecycle handlers. In order to use this feature, we expect the Pod to have the operating system specified with `spec.os.name`. This is enforced so that we can cross-validate the stop signal against the operating system and make sure that the containers in the Pod are created with a valid stop signal for the operating system the Pod is being scheduled to. For Pods scheduled on Windows nodes, only `SIGTERM` and `SIGKILL` are allowed as valid stop signals. Find the full list of signals supported in Linux nodes [here](https://github.com/kubernetes/kubernetes/blob/master/staging/src/k8s.io/api/core/v1/types.go#L2985-L3053).
### Default behaviour
If a container has a custom stop signal defined in its lifecycle, the container runtime would use the signal defined in the lifecycle to kill the container, given that the container runtime also supports custom stop signals. If there is no custom stop signal defined in the container lifecycle, the runtime would fallback to the stop signal defined in the container image. If there is no stop signal defined in the container image, the default stop signal of the runtime would be used. The default signal is `SIGTERM` for both containerd and CRI-O.
### Version skew
For the feature to work as intended, both the versions of Kubernetes and the container runtime should support container stop signals. The changes to the Kubernetes API and kubelet are available in alpha stage from v1.33, and can be enabled with the `ContainerStopSignals` feature gate. The container runtime implementations for containerd and CRI-O are still a work in progress and will be rolled out soon.
### Using container stop signals
To enable this feature, you need to turn on the `ContainerStopSignals` feature gate in both the kube-apiserver and the kubelet. Once you have nodes where the feature gate is turned on, you can create Pods with a StopSignal lifecycle and a valid OS name like so:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: nginx
spec:
os:
name: linux
containers:
- name: nginx
image: nginx:latest
lifecycle:
stopSignal: SIGUSR1
```
Do note that the `SIGUSR1` signal in this example can only be used if the container's Pod is scheduled to a Linux node. Hence we need to specify `spec.os.name` as `linux` to be able to use the signal. You will only be able to configure `SIGTERM` and `SIGKILL` signals if the Pod is being scheduled to a Windows node. You cannot specify a `containers[*].lifecycle.stopSignal` if the `spec.os.name` field is nil or unset either.
## How do I get involved?
This feature is driven by the [SIG Node](https://github.com/Kubernetes/community/blob/master/sig-node/README.md). If you are interested in helping develop this feature, sharing feedback, or participating in any other ongoing SIG Node projects, please reach out to us!
You can reach SIG Node by several means:
- Slack: [#sig-node](https://kubernetes.slack.com/messages/sig-node)
- [Mailing list](https://groups.google.com/forum/#!forum/kubernetes-sig-node)
- [Open Community Issues/PRs](https://github.com/kubernetes/community/labels/sig%2Fnode)
You can also contact me directly:
- GitHub: @sreeram-venkitesh
- Slack: @sreeram.venkitesh

Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


View File

@ -1,395 +0,0 @@
---
layout: blog
title: "Announcing etcd v3.6.0"
date: 2025-05-15T16:00:00-08:00
slug: announcing-etcd-3.6
author: >
Benjamin Wang (VMware by Broadcom)
canonicalUrl: "https://etcd.io/blog/2025/announcing-etcd-3.6/"
---
_This announcement originally [appeared](https://etcd.io/blog/2025/announcing-etcd-3.6/) on the etcd blog._
Today, we are releasing [etcd v3.6.0][], the first minor release since etcd v3.5.0 on June 15, 2021. This release
introduces several new features, makes significant progress on long-standing efforts like downgrade support and
migration to v3store, and addresses numerous critical & major issues. It also includes major optimizations in
memory usage, improving efficiency and performance.
In addition to the features of v3.6.0, etcd has joined Kubernetes as a SIG (sig-etcd), enabling us to improve
project sustainability. We've introduced systematic robustness testing to ensure correctness and reliability.
Through the etcd-operator Working Group, we plan to improve usability as well.
What follows are the most significant changes introduced in etcd v3.6.0, along with the discussion of the
roadmap for future development. For a detailed list of changes, please refer to the [CHANGELOG-3.6][].
A heartfelt thank you to all the contributors who made this release possible!
## Security
etcd takes security seriously. To enhance software security in v3.6.0, we have improved our workflow checks by
integrating `govulncheck` to scan the source code and `trivy` to scan container images. These improvements
have also been backported to supported stable releases.
etcd continues to follow the [Security Release Process][] to ensure vulnerabilities are properly managed and addressed.
## Features
### Migration to v3store
The v2store has been deprecated since etcd v3.4 but could still be enabled via `--enable-v2`. It remained the source of
truth for membership data. In etcd v3.6.0, v2store can no longer be enabled as the `--enable-v2` flag has been removed,
and v3store has become the sole source of truth for membership data.
While v2store still exists in v3.6.0, etcd will fail to start if it contains any data other than membership information.
To assist with migration, etcd v3.5.18+ provides the `etcdutl check v2store` command, which verifies that v2store
contains only membership data (see [PR 19113][]).
Compared to v2store, v3store offers better performance and transactional support. It is also the actively maintained
storage engine moving forward.
The removal of v2store is still ongoing and is tracked in [issues/12913][].
### Downgrade
etcd v3.6.0 is the first version to fully support downgrade. The effort for this downgrade task spans
both versions 3.5 and 3.6, and all related work is tracked in [issues/11716][].
At a high level, the process involves migrating the data schema to the target version (e.g., v3.5),
followed by a rolling downgrade.
Ensure the cluster is healthy, and take a snapshot backup. Then validate that a downgrade to the target version is possible:
```bash
$ etcdctl downgrade validate 3.5
Downgrade validate success, cluster version 3.6
```
If the downgrade is valid, enable downgrade mode:
```bash
$ etcdctl downgrade enable 3.5
Downgrade enable success, cluster version 3.6
```
etcd will then migrate the data schema in the background. Once complete, proceed with the rolling downgrade.
For details, refer to the [Downgrade-3.6] guide.
### Feature gates
In etcd v3.6.0, we introduced Kubernetes-style feature gates for managing new features. Previously, we
indicated unstable features through the `--experimental` prefix in feature flag names. The prefix was removed
once the feature was stable, causing a breaking change. Now, features will start in Alpha, progress
to Beta, then GA, or get deprecated. This ensures a much smoother upgrade and downgrade experience for users.
See [feature-gates][] for details.
### livez / readyz checks {#livezreadyz-checks}
etcd now supports `/livez` and `/readyz` endpoints, aligning with Kubernetes' Liveness and Readiness probes.
`/livez` indicates whether the etcd instance is alive, while `/readyz` indicates when it is ready to serve requests.
This feature has also been backported to release-3.5 (starting from v3.5.11) and release-3.4 (starting from v3.4.29).
See [livez/readyz][] for details.
The existing `/health` endpoint remains functional. `/livez` is similar to `/health?serializable=true`, while
`/readyz` is similar to `/health` or `/health?serializable=false`. Clearly, the `/livez` and `/readyz`
endpoints provide clearer semantics and are easier to understand.
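For a quick check against a local member (assuming the default client URL of `http://127.0.0.1:2379`), you can query the new endpoints directly:
```bash
curl http://127.0.0.1:2379/livez
curl http://127.0.0.1:2379/readyz
```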
### v3discovery
In etcd v3.6.0, the new discovery protocol [v3discovery][] was introduced, based on clientv3.
It facilitates the discovery of all cluster members during the bootstrap phase.
The previous [v2discovery][] protocol, based on clientv2, has been deprecated. Additionally,
the public discovery service at <https://discovery.etcd.io/>, which relied on v2discovery, is no longer maintained.
## Performance
### Memory
In this release, we reduced average memory consumption by at least 50% (see Figure 1). This improvement is primarily due to two changes:
- The default value of `--snapshot-count` has been reduced from 100,000 in v3.5 to 10,000 in v3.6. As a result, etcd v3.6 now retains only about 10% of the history records compared to v3.5.
- Raft history is compacted more frequently, as introduced in [PR/18825][].
{{< figure src="figure-1.png" alt="Diagram of memory usage" >}}
_**Figure 1:** Memory usage comparison between etcd v3.5.20 and v3.6.0-rc.2 under different read/write ratios.
Each subplot shows the memory usage over time with a specific read/write ratio. The red line represents etcd
v3.5.20, while the teal line represents v3.6.0-rc.2. Across all tested ratios, v3.6.0-rc.2 exhibits lower and
more stable memory usage._
### Throughput
Compared to v3.5, etcd v3.6 delivers an average performance improvement of approximately 10%
in both read and write throughput (see Figure 2, 3, 4 and 5). This improvement is not attributed to
any single major change, but rather the cumulative effect of multiple minor enhancements. One such
example is the optimization of the free page queries introduced in [PR/419][].
{{< figure src="figure-2.png" alt="etcd read transaction performance with a high write ratio" >}}
_**Figure 2:** Read throughput comparison between etcd v3.5.20 and v3.6.0-rc.2 under a high write ratio. The
read/write ratio is 0.0078, meaning 1 read per 128 writes. The right bar shows the percentage improvement
in read throughput of v3.6.0-rc.2 over v3.5.20, ranging from 3.21% to 25.59%._
{{< figure src="figure-3.png" alt="etcd read transaction performance with a high read ratio" >}}
_**Figure 3:** Read throughput comparison between etcd v3.5.20 and v3.6.0-rc.2 under a high read ratio.
The read/write ratio is 8, meaning 8 reads per write. The right bar shows the percentage improvement in
read throughput of v3.6.0-rc.2 over v3.5.20, ranging from 4.38% to 27.20%._
{{< figure src="figure-4.png" alt="etcd write transaction performance with a high write ratio" >}}
_**Figure 4:** Write throughput comparison between etcd v3.5.20 and v3.6.0-rc.2 under a high write ratio. The
read/write ratio is 0.0078, meaning 1 read per 128 writes. The right bar shows the percentage improvement
in write throughput of v3.6.0-rc.2 over v3.5.20, ranging from 2.95% to 24.24%._
{{< figure src="figure-5.png" alt="etcd write transaction performance with a high read ratio" >}}
_**Figure 5:** Write throughput comparison between etcd v3.5.20 and v3.6.0-rc.2 under a high read ratio.
The read/write ratio is 8, meaning 8 reads per write. The right bar shows the percentage improvement in
write throughput of v3.6.0-rc.2 over v3.5.20, ranging from 3.86% to 28.37%._
## Breaking changes
This section highlights a few notable breaking changes. For a complete list, please refer to
the [Upgrade etcd from v3.5 to v3.6][] and the [CHANGELOG-3.6][].
### Old binaries are incompatible with new schema versions
Old etcd binaries are not compatible with newer data schema versions. For example, etcd 3.5 cannot start with
data created by etcd 3.6, and etcd 3.4 cannot start with data created by either 3.5 or 3.6.
When downgrading etcd, it's important to follow the documented downgrade procedure. Simply replacing
the binary or image will run into this incompatibility.
### Peer endpoints no longer serve client requests
Client endpoints (`--advertise-client-urls`) are intended to serve client requests only, while peer
endpoints (`--initial-advertise-peer-urls`) are intended solely for peer communication. However, due to an implementation
oversight, the peer endpoints were also able to handle client requests in etcd 3.4 and 3.5. This behavior was misleading and
encouraged incorrect usage patterns. In etcd 3.6, this misleading behavior was corrected via [PR/13565][]; peer endpoints
no longer serve client requests.
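As a reminder of which endpoints are which, here is a minimal sketch of the relevant keys in an etcd YAML configuration file (the addresses are placeholders). In v3.6, only the client URLs serve client requests:

```yaml
# Sketch of an etcd configuration file excerpt; addresses are placeholders.
# Client traffic is served only on the client URLs; the peer URLs are used
# for member-to-member communication only.
listen-client-urls: http://10.0.0.10:2379
advertise-client-urls: http://10.0.0.10:2379
listen-peer-urls: http://10.0.0.10:2380
initial-advertise-peer-urls: http://10.0.0.10:2380
```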
### Clear boundary between etcdctl and etcdutl
Both `etcdctl` and `etcdutl` are command line tools. `etcdutl` is an offline utility designed to operate directly on
etcd data files, while `etcdctl` is an online tool that interacts with etcd over a network. Previously, there were some
overlapping functionalities between the two, but these overlaps were removed in 3.6.0.
- Removed `etcdctl defrag --data-dir`
The `etcdctl defrag` command now only supports online defragmentation and no longer supports offline defragmentation.
To perform offline defragmentation, use the `etcdutl defrag --data-dir` command instead.
- Removed `etcdctl snapshot status`
`etcdctl` no longer supports retrieving the status of a snapshot. Use the `etcdutl snapshot status` command instead.
- Removed `etcdctl snapshot restore`
`etcdctl` no longer supports restoring from a snapshot. Use the `etcdutl snapshot restore` command instead.
## Critical bug fixes
Correctness has always been a top priority for the etcd project. In the process of developing 3.6.0, we found and
fixed a few notable bugs that could lead to data inconsistency in specific cases. These fixes have been backported
to previous releases, but we believe they deserve special mention here.
- Data Inconsistency when Crashing Under Load
Previously, when etcd was applying data, it would update the consistent-index first, followed by committing the
data. However, these operations were not atomic. If etcd crashed in between, it could lead to data inconsistency
(see [issue/13766][]). The issue was introduced in v3.5.0, and fixed in v3.5.3 with [PR/13854][].
- Durability API guarantee broken in single node cluster
When a client writes data and receives a success response, the data is expected to be persisted. However, the data might
be lost if etcd crashes immediately after sending the success response to the client. This was a legacy issue (see [issue/14370][])
affecting all previous releases. It was addressed in v3.4.21 and v3.5.5 with [PR/14400][], and fixed on the raft side in
the main branch (now release-3.6) with [PR/14413][].
- Revision Inconsistency when Crashing During Defragmentation
If etcd crashed during a defragmentation operation, it might, upon restart, reapply
some entries which had already been applied, leading to a revision inconsistency
(see the discussions in [PR/14685][]). The issue was introduced in v3.5.0, and fixed in v3.5.6 with [PR/14730][].
## Upgrade issue
This section highlights a common issue [issues/19557][] in the etcd v3.5 to v3.6 upgrade that may cause the upgrade
process to fail. For a complete upgrade guide, refer to [Upgrade etcd from v3.5 to v3.6][].
The issue was introduced in etcd v3.5.1, and resolved in v3.5.20.
**Key takeaway**: users are required to first upgrade to etcd v3.5.20 (or a higher patch version) before upgrading
to etcd v3.6.0; otherwise, the upgrade may fail.
For more background and technical context, see [upgrade_from_3.5_to_3.6_issue][].
## Testing
We introduced the [Robustness testing][] to verify correctness, which has always been our top priority.
It plays traffic of various types and volumes against an etcd cluster, concurrently injects a random
failpoint, records all operations (including both requests and responses), and finally performs a
linearizability check. It also verifies that the [Watch APIs][] guarantees have not been violated.
The robustness test increases our confidence in ensuring the quality of each etcd release.
We have migrated most of the etcd workflow tests to Kubernetes' Prow testing infrastructure to
take advantage of its benefits, such as nice dashboards for viewing test results and the ability
for contributors to rerun failed tests themselves.
## Platforms
While retaining all existing supported platforms, we have promoted Linux/ARM64 to Tier 1 support.
For more details, please refer to [issues/15951][]. For the complete list of supported platforms,
see [supported-platform][].
## Dependencies
### Dependency bumping guide
We have published an official guide on how to bump dependencies for etcd's main branch and stable releases.
It also covers how to update the Go version. For more details, please refer to [dependency_management][].
With this guide available, any contributor can now help with dependency upgrades.
### Core Dependency Updates
[bbolt][] and [raft][] are two core dependencies of etcd.
Both etcd v3.4 and v3.5 depend on bbolt v1.3, while etcd v3.6 depends on bbolt v1.4.
For the release-3.4 and release-3.5 branches, raft is included in the etcd repository itself, so etcd v3.4 and v3.5
do not depend on an external raft module. Starting from etcd v3.6, raft was moved to a separate repository ([raft][]),
and the first standalone raft release is v3.6.0. As a result, etcd v3.6.0 depends on raft v3.6.0.
Please see the table below for a summary:
| etcd versions | bbolt versions | raft versions |
|---------------|----------------|---------------|
| 3.4.x | v1.3.x | N/A |
| 3.5.x | v1.3.x | N/A |
| 3.6.x | v1.4.x | v3.6.x |
### grpc-gateway@v2
We upgraded [grpc-gateway][] from v1 to v2 via [PR/16595][] in etcd v3.6.0. This is a major step toward
migrating to [protobuf-go][], the second major version of the Go protocol buffer API implementation.
grpc-gateway@v2 is designed to work with [protobuf-go][]. However, etcd v3.6 still depends on the deprecated
[gogo/protobuf][], which is actually a protocol buffer v1 implementation. To resolve this incompatibility,
we applied a [patch][] to the generated `*.pb.gw.go` files to convert v1 messages to v2 messages.
### grpc-ecosystem/go-grpc-middleware/providers/prometheus
We switched from the deprecated (and archived) [grpc-ecosystem/go-grpc-prometheus][] to
[grpc-ecosystem/go-grpc-middleware/providers/prometheus][] via [PR/19195][]. This change ensures continued
support and access to the latest features and improvements in the gRPC Prometheus integration.
## Community
There are exciting developments in the etcd community that reflect our ongoing commitment
to strengthening collaboration, improving maintainability, and evolving the project's governance.
### etcd Becomes a Kubernetes SIG
etcd has officially become a Kubernetes Special Interest Group: SIG-etcd. This change reflects
etcd's critical role as the primary datastore for Kubernetes and establishes a more structured
and transparent home for long-term stewardship and cross-project collaboration. The new SIG
designation will help streamline decision-making, align roadmaps with Kubernetes needs,
and attract broader community involvement.
### New contributors, maintainers, and reviewers
We've seen increasing engagement from contributors, which has resulted in the addition of three new maintainers:
- [fuweid][]
- [jmhbnz][]
- [wenjiaswe][]
Their continued contributions have been instrumental in driving the project forward.
We also welcome two new reviewers to the project:
- [ivanvc][]
- [siyuanfoundation][]
We appreciate their dedication to code quality and their willingness to take on broader review responsibilities
within the community.
### New release team
We've formed a new release team led by [ivanvc][] and [jmhbnz][], streamlining the release process by automating
many previously manual steps. Inspired by Kubernetes SIG Release, we've adopted several best practices, including
clearly defined release team roles and the introduction of release shadows to support knowledge sharing and team
sustainability. These changes have made our releases smoother and more reliable, allowing us to approach each
release with greater confidence and consistency.
### Introducing the etcd Operator Working Group
To further advance etcds operational excellence, we have formed a new working group: [WG-etcd-operator][].
The working group is dedicated to enabling the automatic and efficient operation of etcd clusters that run in
the Kubernetes environment using an etcd-operator.
## Future Development
The legacy v2store has been deprecated since etcd v3.4, and the flag `--enable-v2` was removed entirely in v3.6.
This means that starting from v3.6, there is no longer a way to enable or use the v2store. However, etcd still
bootstraps internally from the legacy v2 snapshots. To address this inconsistency, we plan to change etcd to
bootstrap from the v3store and replay the WAL entries based on the `consistent-index`. The work is being tracked
in [issues/12913].
One of the most persistent challenges remains large range queries from the kube-apiserver, which can
lead to process crashes due to their unpredictable nature. The range stream feature, originally outlined in
the [v3.5 release blog/Future roadmaps][], remains an idea worth revisiting to address the challenges of large
range queries.
For more details and upcoming plans, please refer to the [etcd roadmap][].
[etcd v3.6.0]: https://github.com/etcd-io/etcd/releases/tag/v3.6.0
[CHANGELOG-3.6]: https://github.com/etcd-io/etcd/blob/main/CHANGELOG/CHANGELOG-3.6.md
[Security Release Process]: https://github.com/etcd-io/etcd/blob/main/security/security-release-process.md
[PR 19113]: https://github.com/etcd-io/etcd/pull/19113
[issues/12913]: https://github.com/etcd-io/etcd/issues/12913
[issues/11716]: https://github.com/etcd-io/etcd/issues/11716
[Downgrade-3.6]: https://etcd.io/docs/v3.6/downgrades/downgrade_3_6/
[feature-gates]: https://etcd.io/docs/v3.6/feature-gates/
[livez/readyz]: https://etcd.io/docs/v3.6/op-guide/monitoring/
[v3discovery]: https://etcd.io/docs/v3.6/dev-internal/discovery_protocol/
[v2discovery]: https://etcd.io/docs/v3.5/dev-internal/discovery_protocol/
[Upgrade etcd from v3.5 to v3.6]: https://etcd.io/docs/v3.6/upgrades/upgrade_3_6/
[PR/13565]: https://github.com/etcd-io/etcd/pull/13565
[issue/13766]: https://github.com/etcd-io/etcd/issues/13766
[PR/13854]: https://github.com/etcd-io/etcd/pull/13854
[issue/14370]: https://github.com/etcd-io/etcd/issues/14370
[PR/14400]: https://github.com/etcd-io/etcd/pull/14400
[PR/14413]: https://github.com/etcd-io/etcd/pull/14413
[PR/14685]: https://github.com/etcd-io/etcd/pull/14685
[PR/14730]: https://github.com/etcd-io/etcd/pull/14730
[PR/18825]: https://github.com/etcd-io/etcd/pull/18825
[PR/419]: https://github.com/etcd-io/bbolt/pull/419
[Robustness testing]: https://github.com/etcd-io/etcd/tree/main/tests/robustness
[Watch APIs]: https://etcd.io/docs/v3.5/learning/api_guarantees/#watch-apis
[issues/15951]: https://github.com/etcd-io/etcd/issues/15951
[supported-platform]: https://etcd.io/docs/v3.6/op-guide/supported-platform/
[dependency_management]: https://github.com/etcd-io/etcd/blob/main/Documentation/contributor-guide/dependency_management.md
[bbolt]: https://github.com/etcd-io/bbolt
[raft]: https://github.com/etcd-io/raft
[grpc-gateway]: https://github.com/grpc-ecosystem/grpc-gateway
[PR/16595]: https://github.com/etcd-io/etcd/pull/16595
[protobuf-go]: https://github.com/protocolbuffers/protobuf-go
[gogo/protobuf]: https://github.com/gogo/protobuf
[patch]: https://github.com/etcd-io/etcd/blob/158b9e0d468d310c3edf4cf13f2458c51b0406fa/scripts/genproto.sh#L151-L184
[grpc-ecosystem/go-grpc-prometheus]: https://github.com/grpc-ecosystem/go-grpc-prometheus
[grpc-ecosystem/go-grpc-middleware/providers/prometheus]: https://github.com/grpc-ecosystem/go-grpc-middleware/tree/main/providers/prometheus
[PR/19195]: https://github.com/etcd-io/etcd/pull/19195
[issues/19557]: https://github.com/etcd-io/etcd/issues/19557
[upgrade_from_3.5_to_3.6_issue]: https://etcd.io/blog/2025/upgrade_from_3.5_to_3.6_issue/
[WG-etcd-operator]: https://github.com/kubernetes/community/tree/master/wg-etcd-operator
[v3.5 release blog/Future roadmaps]: https://etcd.io/blog/2021/announcing-etcd-3.5/#future-roadmaps
[etcd roadmap]: https://github.com/etcd-io/etcd/blob/main/Documentation/contributor-guide/roadmap.md
[fuweid]: https://github.com/fuweid
[jmhbnz]: https://github.com/jmhbnz
[wenjiaswe]: https://github.com/wenjiaswe
[ivanvc]: https://github.com/ivanvc
[siyuanfoundation]: https://github.com/siyuanfoundation

View File

@ -1,83 +0,0 @@
---
layout: blog
title: "Kubernetes 1.33: Job's SuccessPolicy Goes GA"
date: 2025-05-15T10:30:00-08:00
slug: kubernetes-1-33-jobs-success-policy-goes-ga
authors: >
  [Yuki Iwai](https://github.com/tenzen-y) (CyberAgent, Inc)
---
On behalf of the Kubernetes project, I'm pleased to announce that Job _success policy_ has graduated to General Availability (GA) as part of the v1.33 release.
## About Job's Success Policy
In batch workloads, you might want to use leader-follower patterns like [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface),
in which the leader controls the execution, including the followers' lifecycle.
In this case, you might want to mark the Job as succeeded
even if some of the indexes failed. Unfortunately, a leader-follower Kubernetes Job that didn't use a success policy would, in most cases, require **all** Pods to finish successfully
for that Job to reach an overall succeeded state.
For Kubernetes Jobs, the API allows you to specify early exit criteria using the `.spec.successPolicy`
field (you can only use the `.spec.successPolicy` field for an [indexed Job](/docs/concepts/workloads/controllers/job/#completion-mode)).
This field describes a set of rules, either using a list of succeeded indexes for a Job, or defining a minimal required number of succeeded indexes.
This newly stable field is especially valuable for scientific simulation, AI/ML and High-Performance Computing (HPC) batch workloads.
Users in these areas often run numerous experiments and may only need a specific number to complete successfully, rather than requiring all of them to succeed.
In this case, failure of the leader index is the only relevant Job exit criterion, and the outcomes for individual follower Pods are handled
only indirectly via the status of the leader index.
Moreover, followers do not know when they can terminate themselves.
After a Job meets any rule of its __success policy__, the Job is marked as succeeded, and all Pods are terminated, including the running ones.
## How it works
The following excerpt from a Job manifest, using `.successPolicy.rules[0].succeededCount`, shows an example of
using a custom success policy:
```yaml
parallelism: 10
completions: 10
completionMode: Indexed
successPolicy:
  rules:
  - succeededCount: 1
```
Here, the Job is marked as succeeded as soon as any one index succeeds, regardless of which index it is.
Additionally, you can constrain which index numbers count toward `succeededCount` by also setting `.successPolicy.rules[0].succeededIndexes`,
as shown below:
```yaml
parallelism: 10
completions: 10
completionMode: Indexed
successPolicy:
  rules:
  - succeededIndexes: "0" # index of the leader Pod
    succeededCount: 1
```
This example shows that the Job will be marked as succeeded once a Pod with a specific index (Pod index 0) has succeeded.
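For context, here is a complete, hypothetical Job manifest that combines the excerpt above with the surrounding fields; the Job name and container image are placeholders:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: leader-follower-example   # placeholder name
spec:
  parallelism: 10
  completions: 10
  completionMode: Indexed
  successPolicy:
    rules:
    - succeededIndexes: "0"   # index of the leader Pod
      succeededCount: 1
  template:
    spec:
      restartPolicy: Never
      containers:
      - name: worker
        image: registry.example/worker:latest   # placeholder image
```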
Once the Job either satisfies one of the `successPolicy` rules or achieves its `Complete` criteria based on `.spec.completions`,
the Job controller within kube-controller-manager adds the `SuccessCriteriaMet` condition to the Job status.
After that, the job controller initiates cleanup and termination of Pods for Jobs with the `SuccessCriteriaMet` condition.
Eventually, the Job obtains the `Complete` condition once the job controller has finished cleanup and termination.
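To illustrate, the status of such a Job might contain conditions roughly like the following once the success policy is met; this is an abridged, illustrative excerpt rather than output captured from a real cluster:

```yaml
status:
  conditions:
  - type: SuccessCriteriaMet
    status: "True"
  - type: Complete
    status: "True"
```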
## Learn more
- Read the documentation for
[success policy](/docs/concepts/workloads/controllers/job/#success-policy).
- Read the KEP for the [Job success/completion policy](https://github.com/kubernetes/enhancements/tree/master/keps/sig-apps/3998-job-success-completion-policy)
## Get involved
This work was led by the Kubernetes
[batch working group](https://github.com/kubernetes/community/tree/master/wg-batch)
in close collaboration with the
[SIG Apps](https://github.com/kubernetes/community/tree/master/sig-apps) community.
If you are interested in working on new features in the space I recommend
subscribing to our [Slack](https://kubernetes.slack.com/messages/wg-batch)
channel and attending the regular community meetings.

View File

@ -1,111 +0,0 @@
---
layout: blog
title: "Spotlight on Policy Working Group"
slug: wg-policy-spotlight-2025
draft: true
date: 2025-05-22
author: "Arujjwal Negi"
---
In the complex world of Kubernetes, policies play a crucial role in managing and securing clusters. But have you ever wondered how these policies are developed, implemented, and standardized across the Kubernetes ecosystem? To answer that, let's put the spotlight on the Policy Working Group.
The Policy Working Group is dedicated to a critical mission: providing an overall architecture that encompasses both current policy-related implementations and future policy proposals in Kubernetes. Their goal is both ambitious and essential: to develop a universal policy architecture that benefits developers and end-users alike.
Through collaborative methods, this working group is striving to bring clarity and consistency to the often complex world of Kubernetes policies. By focusing on both existing implementations and future proposals, they're working to ensure that the policy landscape in Kubernetes remains coherent and accessible as the technology evolves.
In this blog post, I'll dive deeper into the work of the Policy Working Group, guided by insights from its co-chairs:
- [Jim Bugwadia](https://twitter.com/JimBugwadia)
- [Poonam Lamba](https://twitter.com/poonam-lamba)
- [Andy Suderman](https://twitter.com/sudermanjr)
_Interviewed by [Arujjwal Negi](https://twitter.com/arujjval)._
These co-chairs will explain what the Policy working group is all about.
## Introduction
**Hello, thank you for the time! Lets start with some introductions, could you tell us a bit about yourself, your role, and how you got involved in Kubernetes?**
**Jim Bugwadia**: My name is Jim Bugwadia, and I am a co-founder and the CEO at Nirmata which provides solutions that automate security and compliance for cloud-native workloads. At Nirmata, we have been working with Kubernetes since it started in 2014. We initially built a Kubernetes policy engine in our commercial platform and later donated it to CNCF as the Kyverno project. I joined the CNCF Kubernetes Policy Working Group to help build and standardize various aspects of policy management for Kubernetes and later became a co-chair.
**Andy Suderman**: My name is Andy Suderman and I am the CTO of Fairwinds, a managed Kubernetes-as-a-Service provider. I began working with Kubernetes in 2016 building a web conferencing platform. I am an author and/or maintainer of several Kubernetes-related open-source projects such as Goldilocks, Pluto, and Polaris. Polaris is a JSON-schema-based policy engine, which started Fairwinds' journey into the policy space and my involvement in the Policy Working Group.
**Poonam Lamba**: My name is Poonam Lamba, and I currently work as a Product Manager for Google Kubernetes Engine (GKE) at Google. My journey with Kubernetes began back in 2017 when I was building an SRE platform for a large enterprise, using a private cloud built on Kubernetes. Intrigued by its potential to revolutionize the way we deployed and managed applications at the time, I dove headfirst into learning everything I could about it. Since then, I've had the opportunity to build the policy and compliance products for GKE. I lead and contribute to the GKE CIS benchmarks, and I am involved with the Gatekeeper project. I have contributed to the Policy WG for over 2 years, and I currently serve as a co-chair of the Kubernetes Policy WG.
*Responses to the remaining questions are presented as an amalgamation of answers from the co-chairs.*
## About Working Groups
**One thing even I am not aware of is the difference between a working group and a SIG. Can you help us understand what a working group is and how it is different from a SIG?**
Unlike SIGs, working groups are temporary and focused on tackling specific, cross-cutting issues or projects that may involve multiple SIGs. Their lifespan is defined, and they disband once they've achieved their objective. Generally, working groups don't own code or have long-term responsibility for managing a particular area of the Kubernetes project.
(To know more about SIGs, visit the [list of Special Interest Groups](https://github.com/kubernetes/community/blob/master/sig-list.md))
**You mentioned that Working Groups involve multiple SIGS. What SIGS are you closely involved with, and how do you coordinate with them?**
We have collaborated closely with Kubernetes SIG Auth throughout our existence, and more recently, we've also been working with SIG Security since its formation. Our collaboration occurs in a few ways. We provide periodic updates during the SIG meetings to keep them informed of our progress and activities. Additionally, we utilize other community forums to maintain open lines of communication and ensure our work aligns with the broader Kubernetes ecosystem. This collaborative approach helps us stay coordinated with related efforts across the Kubernetes community.
## Policy WG
**Why was the Policy Working Group created?**
To enable a broad set of use cases, we recognize that Kubernetes is powered by a highly declarative, fine-grained, and extensible configuration management system. We've observed that a Kubernetes configuration manifest may have different portions that are important to various stakeholders. For example, some parts may be crucial for developers, while others might be of particular interest to security teams or address operational concerns. Given this complexity, we believe that policies governing the usage of these intricate configurations are essential for success with Kubernetes.
Our Policy Working Group was created specifically to research the standardization of policy definitions and related artifacts. We saw a need to bring consistency and clarity to how policies are defined and implemented across the Kubernetes ecosystem, given the diverse requirements and stakeholders involved in Kubernetes deployments.
**Can you give me an idea of the work you are doing right now?**
We're currently working on several Kubernetes policy-related projects. Our ongoing initiatives include:
- We're developing a Kubernetes Enhancement Proposal (KEP) for the Kubernetes Policy Reports API. This aims to standardize how policy reports are generated and consumed within the Kubernetes ecosystem.
- We're conducting a CNCF survey to better understand policy usage in the Kubernetes space. This will help us gauge current practices and needs across the community.
- We're writing a paper that will guide users in achieving PCI-DSS compliance for containers. This is intended to help organizations meet important security standards in their Kubernetes environments.
- We're also working on a paper highlighting how shifting security down can benefit organizations. This focuses on the advantages of implementing security measures earlier in the development and deployment process.
**Can you tell us about the main objectives of the Policy Working Group and some of your key accomplishments so far? Also, what are your plans for the future?**
The charter of the Policy WG is to help standardize policy management for Kubernetes and educate the community on best practices.
To accomplish this we have updated the Kubernetes documentation ([Policies | Kubernetes](https://kubernetes.io/docs/concepts/policy)), produced several whitepapers ([Kubernetes Policy Management](https://github.com/kubernetes/sig-security/blob/main/sig-security-docs/papers/policy/CNCF_Kubernetes_Policy_Management_WhitePaper_v1.pdf), [Kubernetes GRC](https://github.com/kubernetes/sig-security/blob/main/sig-security-docs/papers/policy_grc/Kubernetes_Policy_WG_Paper_v1_101123.pdf)), and created the Policy Reports API ([API reference](https://htmlpreview.github.io/?https://github.com/kubernetes-sigs/wg-policy-prototypes/blob/master/policy-report/docs/index.html)) which standardizes reporting across various tools. Several popular tools such as Falco, Trivy, Kyverno, kube-bench, and others support the Policy Report API. A major milestone for the Policy WG will be to help promote the Policy Reports API to a SIG-level API or find another stable home for it.
Beyond that, as [ValidatingAdmissionPolicy](https://kubernetes.io/docs/reference/access-authn-authz/validating-admission-policy/) and [MutatingAdmissionPolicy](https://kubernetes.io/docs/reference/access-authn-authz/mutating-admission-policy/) become GA in Kubernetes, we intend to guide and educate the community on the tradeoffs and appropriate usage patterns for these built-in API objects and other CNCF policy management solutions like OPA/Gatekeeper and Kyverno.
## Challenges
**What are some of the major challenges that the Policy Working Group is working on or has worked on?**
During our work in the Policy Working Group, we've encountered several challenges:
- One of the main issues we've faced is finding time to consistently contribute. Given that many of us have other professional commitments, it can be difficult to dedicate regular time to the working group's initiatives.
- Another challenge we've experienced is related to our consensus-driven model. While this approach ensures that all voices are heard, it can sometimes lead to slower decision-making processes. We value thorough discussion and agreement, but this can occasionally delay progress on our projects.
- We've also encountered occasional differences of opinion among group members. These situations require careful navigation to ensure that we maintain a collaborative and productive environment while addressing diverse viewpoints.
- Lastly, we've noticed that newcomers to the group may find it difficult to contribute effectively without consistent attendance at our meetings. The complex nature of our work often requires ongoing context, which can be challenging for those who aren't able to participate regularly.
**Can you tell me more about those challenges? How did you discover each one? What has the impact been? Do you have ideas or strategies about how to address them?**
There are no easy answers, but having more contributors and maintainers greatly helps! Overall the CNCF community is great to work with and is very welcoming to beginners. So, if folks out there are hesitating to get involved, I highly encourage them to attend a WG or SIG meeting and just listen in.
It often takes a few meetings to fully understand the discussions, so don't feel discouraged if you don't grasp everything right away. We've started emphasizing this point and encourage new members to review documentation as a starting point for getting involved.
Additionally, differences of opinion are valued and encouraged within the Policy-WG. We adhere to the CNCF core values and resolve disagreements by maintaining respect for one another. We also strive to timebox our decisions and assign clear responsibilities to keep things moving forward.
## New contributors
**What skills are expected from new contributors and how can they get involved with the Policy Working Group?**
The Policy WG is ideal for anyone who is passionate about Kubernetes security, governance, and compliance and wants to help shape the future of how we build, deploy, and operate cloud-native workloads.
Join the mailing list as described on our community [page](https://github.com/kubernetes/community/blob/master/wg-policy/README.md) and attend one of our upcoming [community meetings](https://github.com/kubernetes/community/tree/master/wg-policy#meetings).
---
This is where our discussion about the Policy Working Group ends. The working group, and especially the people who took part in this article, hope this gave you some insights into the group's aims and workings. Of course, this is just the tip of the iceberg. To learn more and get involved with the Policy Working Group, consider attending their meetings. You can find the schedule and join their [discussions](https://github.com/kubernetes/community/tree/master/wg-policy).

View File

@ -1,26 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="900px" height="250px" style="shape-rendering:geometricPrecision; text-rendering:geometricPrecision; image-rendering:optimizeQuality; fill-rule:evenodd; clip-rule:evenodd" xmlns:xlink="http://www.w3.org/1999/xlink">
<g><path style="opacity:0.908" fill="#fbfcfe" d="M 134.5,17.5 C 137.85,17.335 141.183,17.5017 144.5,18C 170.5,30.3333 196.5,42.6667 222.5,55C 226.894,58.0684 229.728,62.235 231,67.5C 236.872,93.8622 242.872,120.196 249,146.5C 249.61,150.236 249.277,153.903 248,157.5C 230.333,179.833 212.667,202.167 195,224.5C 192.441,227.531 189.274,229.698 185.5,231C 154.5,231.667 123.5,231.667 92.5,231C 88.7257,229.698 85.559,227.531 83,224.5C 66.9068,203.984 50.5734,183.651 34,163.5C 27.7798,155.497 26.7798,146.83 31,137.5C 36.6667,113.167 42.3333,88.8333 48,64.5C 49.7735,59.7271 52.9402,56.2271 57.5,54C 83.2576,41.7854 108.924,29.6188 134.5,17.5 Z"/></g>
<g><path style="opacity:1" fill="#346de5" d="M 134.5,24.5 C 139.08,24.1134 143.414,24.9468 147.5,27C 171.045,38.606 194.712,49.9393 218.5,61C 222.491,63.7785 224.658,67.6119 225,72.5C 229.528,94.2768 234.528,115.943 240,137.5C 241.168,142.482 241.835,147.482 242,152.5C 241.439,154.725 240.439,156.725 239,158.5C 222.427,178.651 206.093,198.984 190,219.5C 188.269,221.617 186.102,223.117 183.5,224C 153.5,224.667 123.5,224.667 93.5,224C 73.0249,201.215 53.8582,177.382 36,152.5C 41.3608,123.356 47.6941,94.3556 55,65.5C 56.5,64 58,62.5 59.5,61C 84.8363,49.3308 109.836,37.1641 134.5,24.5 Z"/></g>
<g><path style="opacity:1" fill="#fafbfe" d="M 133.5,45.5 C 137.167,45.5 140.833,45.5 144.5,45.5C 144.5,52.8333 144.5,60.1667 144.5,67.5C 158.146,68.9079 169.979,74.2412 180,83.5C 186.083,79.5376 191.917,75.2043 197.5,70.5C 199.493,72.6655 201.327,74.9989 203,77.5C 203.749,78.635 203.583,79.635 202.5,80.5C 197.179,84.489 192.179,88.8223 187.5,93.5C 194.894,105.411 198.061,118.411 197,132.5C 198.785,133.24 200.618,133.907 202.5,134.5C 203.471,131.879 204.804,129.546 206.5,127.5C 212.363,132.529 217.697,138.029 222.5,144C 222.355,144.772 222.022,145.439 221.5,146C 214.573,148.476 207.573,150.643 200.5,152.5C 200.5,149.833 200.5,147.167 200.5,144.5C 198.208,144.756 196.041,144.423 194,143.5C 188.976,155.86 180.976,165.86 170,173.5C 170.384,176.309 171.384,178.975 173,181.5C 174.897,179.984 177.064,179.317 179.5,179.5C 178.903,187.153 178.403,194.82 178,202.5C 177.439,203.022 176.772,203.355 176,203.5C 169.677,199.182 163.344,194.848 157,190.5C 156.312,189.668 156.479,189.002 157.5,188.5C 159.332,187.752 160.999,186.752 162.5,185.5C 161.42,183.004 160.086,180.67 158.5,178.5C 145.627,183.814 132.794,183.814 120,178.5C 118.833,180.833 117.667,183.167 116.5,185.5C 117.912,186.806 119.579,187.64 121.5,188C 122.451,188.718 122.617,189.551 122,190.5C 115.505,195.521 108.671,199.854 101.5,203.5C 100.745,195.178 100.078,186.845 99.5,178.5C 101.816,179.36 104.149,179.86 106.5,180C 107.627,178.247 108.627,176.413 109.5,174.5C 97.8509,166.691 89.3509,156.358 84,143.5C 81.9592,144.423 79.7925,144.756 77.5,144.5C 77.8333,147.167 78.1667,149.833 78.5,152.5C 71.0621,150.856 63.7288,148.689 56.5,146C 55.9781,145.439 55.6448,144.772 55.5,144C 60.3409,138.232 65.6742,132.899 71.5,128C 72.3317,127.312 72.9984,127.479 73.5,128.5C 74.3094,130.071 74.6427,131.738 74.5,133.5C 76.7925,133.756 78.9592,133.423 81,132.5C 80.115,118.45 83.2817,105.45 90.5,93.5C 85.5084,88.6769 80.3418,84.0102 75,79.5C 75.7298,75.4517 77.8965,72.6183 81.5,71C 87.0109,75.1809 92.5109,79.3475 98,83.5C 108.046,74.2274 119.879,68.8941 133.5,67.5C 133.5,60.1667 133.5,52.8333 133.5,45.5 Z"/></g>
<g><path style="opacity:0.882" fill="#000000" d="M 858.5,74.5 C 867.424,74.3534 871.257,78.6868 870,87.5C 867.185,93.1691 862.685,95.0024 856.5,93C 850.261,88.7034 849.261,83.3701 853.5,77C 855.315,76.2432 856.981,75.4098 858.5,74.5 Z"/></g>
<g><path style="opacity:1" fill="#356ee5" d="M 127.5,79.5 C 129.5,79.5 131.5,79.5 133.5,79.5C 133.666,89.1724 133.5,98.8391 133,108.5C 132.275,109.059 131.442,109.392 130.5,109.5C 122.292,104.225 114.625,98.2248 107.5,91.5C 113.265,85.9526 119.932,81.9526 127.5,79.5 Z"/></g>
<g><path style="opacity:1" fill="#356de5" d="M 144.5,79.5 C 154.716,80.2764 163.382,84.2764 170.5,91.5C 163.172,97.9916 155.672,104.325 148,110.5C 147,109.833 146,109.167 145,108.5C 144.5,98.8391 144.334,89.1724 144.5,79.5 Z"/></g>
<g><path style="opacity:0.928" fill="#000000" d="M 423.5,83.5 C 424.833,83.5 426.167,83.5 427.5,83.5C 427.5,88.8333 427.5,94.1667 427.5,99.5C 433.833,99.5 440.167,99.5 446.5,99.5C 446.5,104.167 446.5,108.833 446.5,113.5C 440.167,113.5 433.833,113.5 427.5,113.5C 427.13,121.903 427.63,130.236 429,138.5C 430.779,140.764 433.113,142.097 436,142.5C 439.478,141.671 442.978,141.004 446.5,140.5C 446.896,144.375 447.562,148.208 448.5,152C 448.095,152.945 447.428,153.612 446.5,154C 438.116,156.922 429.782,156.922 421.5,154C 415.996,151.16 412.829,146.66 412,140.5C 411.5,122.17 411.333,103.836 411.5,85.5C 415.733,85.4613 419.733,84.7947 423.5,83.5 Z"/></g>
<g><path style="opacity:0.918" fill="#000000" d="M 311.5,98.5 C 321.347,97.9802 331.014,98.9802 340.5,101.5C 341.921,120.529 341.754,139.529 340,158.5C 337.742,166.389 332.575,171.222 324.5,173C 314.057,175.006 303.724,174.506 293.5,171.5C 294.111,166.892 295.111,162.392 296.5,158C 303.028,159.529 309.694,160.196 316.5,160C 322.554,158.957 325.054,155.457 324,149.5C 303.472,154.648 292.305,146.648 290.5,125.5C 291.084,111.263 298.084,102.263 311.5,98.5 Z M 316.5,111.5 C 319.119,111.232 321.619,111.565 324,112.5C 324.167,116.5 324.333,120.5 324.5,124.5C 327.333,136.731 323,140.564 311.5,136C 307.355,130.681 306.522,124.848 309,118.5C 310.767,115.228 313.267,112.895 316.5,111.5 Z"/></g>
<g><path style="opacity:0.94" fill="#000000" d="M 364.5,98.5 C 371.175,98.3337 377.842,98.5004 384.5,99C 391.702,100.869 396.202,105.369 398,112.5C 398.5,126.163 398.667,139.829 398.5,153.5C 387.249,155.423 375.916,155.923 364.5,155C 353.152,151.144 348.985,143.31 352,131.5C 354.443,125.394 358.943,121.894 365.5,121C 371.528,120.83 377.528,120.33 383.5,119.5C 382.625,115.126 379.958,112.626 375.5,112C 369.805,111.623 364.305,112.456 359,114.5C 357.414,109.983 356.58,105.316 356.5,100.5C 359.373,100.198 362.039,99.531 364.5,98.5 Z M 372.5,131.5 C 376.167,131.5 379.833,131.5 383.5,131.5C 383.5,135.167 383.5,138.833 383.5,142.5C 378.728,143.929 374.061,143.595 369.5,141.5C 366.482,136.899 367.482,133.565 372.5,131.5 Z"/></g>
<g><path style="opacity:0.928" fill="#000000" d="M 472.5,98.5 C 497.203,96.5548 507.87,107.888 504.5,132.5C 493.167,132.5 481.833,132.5 470.5,132.5C 470.79,136.961 473.123,139.795 477.5,141C 479.847,141.436 482.181,141.936 484.5,142.5C 489.581,141.61 494.581,140.776 499.5,140C 500.861,144.362 501.528,148.862 501.5,153.5C 491.612,156.456 481.612,156.956 471.5,155C 458.543,150.518 452.543,141.352 453.5,127.5C 453.103,113.266 459.436,103.599 472.5,98.5 Z M 477.5,111.5 C 483.988,111.484 487.988,114.651 489.5,121C 483.175,121.5 476.842,121.666 470.5,121.5C 470.873,116.742 473.206,113.409 477.5,111.5 Z"/></g>
<g><path style="opacity:0.926" fill="#000000" d="M 605.5,98.5 C 612.175,98.3337 618.842,98.5004 625.5,99C 635.288,101.791 640.122,108.291 640,118.5C 640.5,130.162 640.667,141.829 640.5,153.5C 628.91,155.397 617.243,155.897 605.5,155C 594.473,151.455 590.306,143.955 593,132.5C 595.154,125.994 599.654,122.161 606.5,121C 612.491,120.501 618.491,120.334 624.5,120.5C 624.064,115.564 621.397,112.731 616.5,112C 610.805,111.623 605.305,112.456 600,114.5C 598.627,109.928 597.794,105.261 597.5,100.5C 600.373,100.198 603.039,99.531 605.5,98.5 Z M 613.5,131.5 C 617.167,131.5 620.833,131.5 624.5,131.5C 624.5,135.167 624.5,138.833 624.5,142.5C 619.728,143.929 615.061,143.595 610.5,141.5C 607.462,136.989 608.462,133.656 613.5,131.5 Z"/></g>
<g><path style="opacity:0.925" fill="#000000" d="M 742.5,98.5 C 749.175,98.3337 755.842,98.5004 762.5,99C 771.815,101.649 776.649,107.816 777,117.5C 777.5,129.495 777.667,141.495 777.5,153.5C 766.244,155.386 754.911,155.886 743.5,155C 731.751,152.02 727.251,144.52 730,132.5C 732.154,125.994 736.654,122.161 743.5,121C 749.491,120.501 755.491,120.334 761.5,120.5C 761.064,115.564 758.397,112.731 753.5,112C 747.826,111.696 742.326,112.529 737,114.5C 735.627,109.928 734.794,105.261 734.5,100.5C 737.373,100.198 740.039,99.531 742.5,98.5 Z M 750.5,131.5 C 754.167,131.5 757.833,131.5 761.5,131.5C 761.5,135.167 761.5,138.833 761.5,142.5C 757.128,143.885 752.795,143.718 748.5,142C 744.299,137.629 744.966,134.129 750.5,131.5 Z"/></g>
<g><path style="opacity:0.945" fill="#000000" d="M 802.5,98.5 C 832.848,95.8694 845.348,109.536 840,139.5C 837.5,147.333 832.333,152.5 824.5,155C 818.472,155.641 812.472,155.474 806.5,154.5C 806.5,160.833 806.5,167.167 806.5,173.5C 801.167,173.5 795.833,173.5 790.5,173.5C 790.333,149.498 790.5,125.498 791,101.5C 794.917,100.439 798.751,99.4392 802.5,98.5 Z M 806.5,112.5 C 818.841,110.485 824.841,115.652 824.5,128C 824.34,140.262 818.34,144.429 806.5,140.5C 806.5,131.167 806.5,121.833 806.5,112.5 Z"/></g>
<g><path style="opacity:0.919" fill="#000000" d="M 509.5,99.5 C 515.5,99.5 521.5,99.5 527.5,99.5C 529.363,110.955 531.863,122.288 535,133.5C 538.352,122.28 541.186,110.947 543.5,99.5C 547.833,99.5 552.167,99.5 556.5,99.5C 558.225,110.401 560.892,121.068 564.5,131.5C 567.793,120.994 570.46,110.328 572.5,99.5C 578.167,99.5 583.833,99.5 589.5,99.5C 584.799,118.104 578.799,136.271 571.5,154C 567.129,154.828 562.795,154.661 558.5,153.5C 555.493,144.813 552.493,136.146 549.5,127.5C 546.671,136.14 543.838,144.806 541,153.5C 536.55,154.8 532.05,154.8 527.5,153.5C 520.497,135.824 514.497,117.824 509.5,99.5 Z"/></g>
<g><path style="opacity:0.917" fill="#000000" d="M 645.5,99.5 C 651.425,99.1918 657.259,99.5251 663,100.5C 665.869,111.773 669.536,122.773 674,133.5C 677.886,122.345 681.053,111.011 683.5,99.5C 689.167,99.5 694.833,99.5 700.5,99.5C 694.611,121.996 686.445,143.663 676,164.5C 669.118,173.048 660.284,175.881 649.5,173C 647.616,172.784 645.949,172.117 644.5,171C 645.942,166.959 646.942,162.792 647.5,158.5C 651.796,159.463 656.129,159.629 660.5,159C 662.958,157.213 664.624,154.879 665.5,152C 657.154,135.128 650.488,117.628 645.5,99.5 Z"/></g>
<g><path style="opacity:0.95" fill="#000000" d="M 852.5,99.5 C 857.833,99.5 863.167,99.5 868.5,99.5C 868.5,117.833 868.5,136.167 868.5,154.5C 863.167,154.5 857.833,154.5 852.5,154.5C 852.5,136.167 852.5,117.833 852.5,99.5 Z"/></g>
<g><path style="opacity:1" fill="#386ee5" d="M 99.5,100.5 C 107.134,105.665 114.468,111.332 121.5,117.5C 122.833,119.167 122.833,120.833 121.5,122.5C 112.581,125.153 103.581,127.486 94.5,129.5C 92.1812,119.117 93.8478,109.45 99.5,100.5 Z"/></g>
<g><path style="opacity:1" fill="#386fe5" d="M 177.5,100.5 C 184.058,109.086 186.058,118.752 183.5,129.5C 174.476,127.494 165.476,125.328 156.5,123C 155.24,121.186 155.24,119.353 156.5,117.5C 163.753,112.054 170.753,106.387 177.5,100.5 Z"/></g>
<g><path style="opacity:1" fill="#4173e6" d="M 135.5,116.5 C 141.755,115.261 145.422,117.761 146.5,124C 144.602,131.278 140.269,133.111 133.5,129.5C 130.544,124.611 131.211,120.278 135.5,116.5 Z"/></g>
<g><path style="opacity:1" fill="#386fe5" d="M 120.5,134.5 C 122.5,134.5 124.5,134.5 126.5,134.5C 123.684,144.464 119.517,153.797 114,162.5C 105.956,157.595 100.123,150.762 96.5,142C 96.9054,141.055 97.572,140.388 98.5,140C 105.962,138.134 113.295,136.301 120.5,134.5 Z"/></g>
<g><path style="opacity:1" fill="#386ee5" d="M 152.5,133.5 C 161.379,136.092 170.379,138.259 179.5,140C 180.428,140.388 181.095,141.055 181.5,142C 178.209,150.792 172.542,157.626 164.5,162.5C 159.86,154.421 155.693,146.087 152,137.5C 151.421,136.072 151.588,134.738 152.5,133.5 Z"/></g>
<g><path style="opacity:1" fill="#376ee5" d="M 136.5,141.5 C 138.604,141.201 140.604,141.534 142.5,142.5C 146.737,150.968 150.403,159.635 153.5,168.5C 148.384,169.489 143.218,170.156 138,170.5C 133.215,170.678 128.715,169.678 124.5,167.5C 129.059,159.051 133.059,150.384 136.5,141.5 Z"/></g>
</svg>


View File

@ -1,395 +0,0 @@
---
layout: blog
title: "Gateway API v1.3.0: Advancements in Request Mirroring, CORS, Gateway Merging, and Retry Budgets"
date: 2025-06-02T09:00:00-08:00
draft: false
slug: gateway-api-v1-3
author: >
  [Candace Holman](https://github.com/candita) (Red Hat)
---
![Gateway API logo](gateway-api-logo.svg)
Join us in the Kubernetes SIG Network community in celebrating the general
availability of [Gateway API](https://gateway-api.sigs.k8s.io/) v1.3.0! We are
also pleased to announce that there are already a number of conformant
implementations to try, made possible by postponing this blog
announcement. Version 1.3.0 of the API was released about a month ago on
April 24, 2025.
Gateway API v1.3.0 brings a new feature to the _Standard_ channel
(Gateway API's GA release channel): _percentage-based request mirroring_, and
introduces three new experimental features: cross-origin resource sharing (CORS)
filters, a standardized mechanism for listener and gateway merging, and retry
budgets.
Also see the full
[release notes](https://github.com/kubernetes-sigs/gateway-api/blob/54df0a899c1c5c845dd3a80f05dcfdf65576f03c/CHANGELOG/1.3-CHANGELOG.md)
and applaud the
[v1.3.0 release team](https://github.com/kubernetes-sigs/gateway-api/blob/54df0a899c1c5c845dd3a80f05dcfdf65576f03c/CHANGELOG/1.3-TEAM.md)
next time you see them.
## Graduation to Standard channel
Graduation to the Standard channel is a notable achievement for Gateway API
features, as inclusion in the Standard release channel denotes a high level of
confidence in the API surface and provides guarantees of backward compatibility.
Of course, as with any other Kubernetes API, Standard channel features can continue
to evolve with backward-compatible additions over time, and we (SIG Network)
certainly expect
further refinements and improvements in the future. For more information on how
all of this works, refer to the [Gateway API Versioning Policy](https://gateway-api.sigs.k8s.io/concepts/versioning/).
### Percentage-based request mirroring
Leads: [Lior Lieberman](https://github.com/LiorLieberman),[Jake Bennert](https://github.com/jakebennert)
GEP-3171: [Percentage-Based Request Mirroring](https://github.com/kubernetes-sigs/gateway-api/blob/main/geps/gep-3171/index.md)
_Percentage-based request mirroring_ is an enhancement to the
existing support for [HTTP request mirroring](https://gateway-api.sigs.k8s.io/guides/http-request-mirroring/), which allows HTTP requests to be duplicated to another backend using the
RequestMirror filter type. Request mirroring is particularly useful in
blue-green deployment. It can be used to assess the impact of request scaling on
application performance without impacting responses to clients.
The previous mirroring capability worked on all the requests to a `backendRef`.
Percentage-based request mirroring allows users to specify a subset of requests
they want to be mirrored, either by percentage or fraction. This can be
particularly useful when services are receiving a large volume of requests.
Instead of mirroring all of those requests, this new feature can be used to
mirror a smaller subset of them.
Here's an example with 42% of the requests to "foo-v1" being mirrored to "foo-v2":
```yaml
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: http-filter-mirror
  labels:
    gateway: mirror-gateway
spec:
  parentRefs:
  - name: mirror-gateway
  hostnames:
  - mirror.example
  rules:
  - backendRefs:
    - name: foo-v1
      port: 8080
    filters:
    - type: RequestMirror
      requestMirror:
        backendRef:
          name: foo-v2
          port: 8080
        percent: 42 # This value must be an integer.
```
You can also configure partial mirroring using a fraction. Here is an example
with 5 out of every 1000 requests to "foo-v1" being mirrored to "foo-v2".
```yaml
rules:
- backendRefs:
  - name: foo-v1
    port: 8080
  filters:
  - type: RequestMirror
    requestMirror:
      backendRef:
        name: foo-v2
        port: 8080
      fraction:
        numerator: 5
        denominator: 1000
```
## Additions to Experimental channel
The Experimental channel is Gateway API's channel for experimenting with new
features and gaining confidence with them before allowing them to graduate to
standard. Please note: the experimental channel may include features that are
changed or removed later.
Starting in release v1.3.0, in an effort to distinguish Experimental channel
resources from Standard channel resources, any new experimental API kinds have the
prefix "**X**". For the same reason, experimental resources are now added to the
API group `gateway.networking.x-k8s.io` instead of `gateway.networking.k8s.io`.
Bear in mind that using new experimental channel resources means they can coexist
with standard channel resources, but migrating these resources to the standard
channel will require recreating them with the standard channel names and API
group (both of which lack the "x-k8s" designator or "X" prefix).
The v1.3 release introduces two new experimental API kinds: XBackendTrafficPolicy
and XListenerSet. To be able to use experimental API kinds, you need to install
the Experimental channel Gateway API YAMLs from the locations listed below.
### CORS filtering
Leads: [Liang Li](https://github.com/liangli), [Eyal Pazz](https://github.com/EyalPazz), [Rob Scott](https://github.com/robscott)
GEP-1767: [CORS Filter](https://github.com/kubernetes-sigs/gateway-api/blob/main/geps/gep-1767/index.md)
Cross-origin resource sharing (CORS) is an HTTP-header based mechanism that allows
a web page to access restricted resources from a server on an origin (domain,
scheme, or port) different from the domain that served the web page. This feature
adds a new HTTPRoute `filter` type, called "CORS", to configure the handling of
cross-origin requests before the response is sent back to the client.
To be able to use experimental CORS filtering, you need to install the
[Experimental channel Gateway API HTTPRoute yaml](https://github.com/kubernetes-sigs/gateway-api/blob/main/config/crd/experimental/gateway.networking.k8s.io_httproutes.yaml).
Here's an example of a simple cross-origin configuration:
```yaml
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: http-route-cors
spec:
  parentRefs:
  - name: http-gateway
  rules:
  - matches:
    - path:
        type: PathPrefix
        value: /resource/foo
    filters:
    - cors:
      - type: CORS
        allowOrigins:
        - "*"
        allowMethods:
        - GET
        - HEAD
        - POST
        allowHeaders:
        - Accept
        - Accept-Language
        - Content-Language
        - Content-Type
        - Range
    backendRefs:
    - kind: Service
      name: http-route-cors
      port: 80
```
In this case, the Gateway returns an _origin header_ of "*", which means that the
requested resource can be referenced from any origin, a _methods header_
(`Access-Control-Allow-Methods`) that permits the `GET`, `HEAD`, and `POST`
verbs, and a _headers header_ allowing `Accept`, `Accept-Language`,
`Content-Language`, `Content-Type`, and `Range`.
```text
HTTP/1.1 200 OK
Access-Control-Allow-Origin: *
Access-Control-Allow-Methods: GET, HEAD, POST
Access-Control-Allow-Headers: Accept,Accept-Language,Content-Language,Content-Type,Range
```
The complete list of fields in the new CORS filter:
* `allowOrigins`
* `allowMethods`
* `allowHeaders`
* `allowCredentials`
* `exposeHeaders`
* `maxAge`
See [CORS protocol](https://fetch.spec.whatwg.org/#http-cors-protocol) for details.
### XListenerSets (standardized mechanism for Listener and Gateway merging){#XListenerSet}
Lead: [Dave Protasowski](https://github.com/dprotaso)
GEP-1713: [ListenerSets - Standard Mechanism to Merge Multiple Gateways](https://github.com/kubernetes-sigs/gateway-api/pull/3213)
This release adds a new experimental API kind, XListenerSet, that allows a
shared list of _listeners_ to be attached to one or more parent Gateway(s). In
addition, it expands upon the existing suggestion that Gateway API implementations
may merge configuration from multiple Gateway objects. It also:
- adds a new field `allowedListeners` to the `.spec` of a Gateway. The
`allowedListeners` field defines from which Namespaces to select XListenerSets
that are allowed to attach to that Gateway: Same, All, None, or Selector based.
- increases the previous maximum number (64) of listeners with the addition of
XListenerSets.
- allows the delegation of listener configuration, such as TLS, to applications in
other namespaces.
To be able to use experimental XListenerSet, you need to install the
[Experimental channel Gateway API XListenerSet yaml](https://github.com/kubernetes-sigs/gateway-api/blob/main/config/crd/experimental/gateway.networking.x-k8s.io_xlistenersets.yaml).
The following example shows a Gateway with an HTTP listener and two child HTTPS
XListenerSets with unique hostnames and certificates. The combined set of listeners
attached to the Gateway includes the two additional HTTPS listeners in the
XListenerSets that attach to the Gateway. This example illustrates the
delegation of listener TLS config to application owners in different namespaces
("store" and "app"). The HTTPRoute has both the Gateway listener named "foo" and
one XListenerSet listener named "second" as `parentRefs`.
```yaml
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
  name: prod-external
  namespace: infra
spec:
  gatewayClassName: example
  allowedListeners:
  - from: All
  listeners:
  - name: foo
    hostname: foo.com
    protocol: HTTP
    port: 80
---
apiVersion: gateway.networking.x-k8s.io/v1alpha1
kind: XListenerSet
metadata:
  name: store
  namespace: store
spec:
  parentRef:
    name: prod-external
  listeners:
  - name: first
    hostname: first.foo.com
    protocol: HTTPS
    port: 443
    tls:
      mode: Terminate
      certificateRefs:
      - kind: Secret
        group: ""
        name: first-workload-cert
---
apiVersion: gateway.networking.x-k8s.io/v1alpha1
kind: XListenerSet
metadata:
  name: app
  namespace: app
spec:
  parentRef:
    name: prod-external
  listeners:
  - name: second
    hostname: second.foo.com
    protocol: HTTPS
    port: 443
    tls:
      mode: Terminate
      certificateRefs:
      - kind: Secret
        group: ""
        name: second-workload-cert
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: httproute-example
spec:
  parentRefs:
  - name: app
    kind: XListenerSet
    sectionName: second
  - name: parent-gateway
    kind: Gateway
    sectionName: foo
...
```
Each listener in a Gateway must have a unique combination of `port`, `protocol`,
(and `hostname` if supported by the protocol) in order for all listeners to be
**compatible** and not conflicted over which traffic they should receive.
Furthermore, implementations can _merge_ separate Gateways into a single set of
listener addresses if all listeners across those Gateways are compatible. The
management of merged listeners was under-specified in releases prior to v1.3.0.
With the new feature, the specification on merging is expanded. Implementations
must treat the parent Gateways as having the merged list of all listeners from
itself and from attached XListenerSets, and validation of this list of listeners
must behave the same as if the list were part of a single Gateway. Within a single
Gateway, listeners are ordered using the following precedence:
1. Single Listeners (not a part of an XListenerSet) first,
2. Remaining listeners ordered by:
   - object creation time (oldest first), and if two listeners are defined in
     objects that have the same timestamp, then
   - alphabetically based on "{namespace}/{name of listener}"
### Retry budgets (XBackendTrafficPolicy) {#XBackendTrafficPolicy}
Leads: [Eric Bishop](https://github.com/ericdbishop), [Mike Morris](https://github.com/mikemorris)
GEP-3388: [Retry Budgets](https://gateway-api.sigs.k8s.io/geps/gep-3388)
This feature allows you to configure a _retry budget_ across all endpoints
of a destination Service. This is used to limit additional client-side retries
after reaching a configured threshold. When configuring the budget, you can specify the maximum
percentage of active requests that may consist of retries, as well as
the interval over which requests are considered when calculating the retry
threshold. The development of this specification changed the existing
experimental API kind BackendLBPolicy into a new experimental API kind,
XBackendTrafficPolicy, in the interest of reducing the proliferation of policy
resources that had commonalities.
To be able to use experimental retry budgets, you need to install the
[Experimental channel Gateway API XBackendTrafficPolicy yaml](https://github.com/kubernetes-sigs/gateway-api/blob/main/config/crd/experimental/gateway.networking.x-k8s.io_xbackendtrafficpolicies.yaml).
The following example shows an XBackendTrafficPolicy that applies a
`retryConstraint` representing a budget that limits retries to a maximum
of 20% of requests over a duration of 10 seconds, while always permitting a minimum
retry rate of 3 retries per second.
```yaml
apiVersion: gateway.networking.x-k8s.io/v1alpha1
kind: XBackendTrafficPolicy
metadata:
  name: traffic-policy-example
spec:
  retryConstraint:
    budget:
      percent: 20
      interval: 10s
    minRetryRate:
      count: 3
      interval: 1s
...
```
## Try it out
Unlike other Kubernetes APIs, you don't need to upgrade to the latest version of
Kubernetes to get the latest version of Gateway API. As long as you're running
Kubernetes 1.26 or later, you'll be able to get up and running with this version
of Gateway API.
To try out the API, follow the [Getting Started Guide](https://gateway-api.sigs.k8s.io/guides/).
As of this writing, four implementations are already conformant with Gateway API
v1.3 experimental channel features. In alphabetical order:
- [Airlock Microgateway 4.6](https://github.com/airlock/microgateway/releases/tag/4.6.0)
- [Cilium main](https://github.com/cilium/cilium)
- [Envoy Gateway v1.4.0](https://github.com/envoyproxy/gateway/releases/tag/v1.4.0)
- [Istio 1.27-dev](https://istio.io)
## Get involved
Wondering when a feature will be added? There are lots of opportunities to get
involved and help define the future of Kubernetes routing APIs for both ingress
and service mesh.
* Check out the [user guides](https://gateway-api.sigs.k8s.io/guides) to see what use-cases can be addressed.
* Try out one of the [existing Gateway controllers](https://gateway-api.sigs.k8s.io/implementations/).
* Or [join us in the community](https://gateway-api.sigs.k8s.io/contributing/)
and help us build the future of Gateway API together!
The maintainers would like to thank _everyone_ who's contributed to Gateway
API, whether in the form of commits to the repo, discussion, ideas, or general
support. We could never have made this kind of progress without the support of
this dedicated and active community.
## Related Kubernetes blog articles
* [Gateway API v1.2: WebSockets, Timeouts, Retries, and More](/blog/2024/11/21/gateway-api-v1-2/)
(November 2024)
* [Gateway API v1.1: Service mesh, GRPCRoute, and a whole lot more](/blog/2024/05/09/gateway-api-v1-1/)
(May 2024)
* [New Experimental Features in Gateway API v1.0](/blog/2023/11/28/gateway-api-ga/)
(November 2023)
* [Gateway API v1.0: GA Release](/blog/2023/10/31/gateway-api-ga/)
(October 2023)

View File

@ -1,234 +0,0 @@
---
layout: blog
title: "Start Sidecar First: How To Avoid Snags"
date: 2025-06-03
draft: false
slug: start-sidecar-first
author: Agata Skorupka (The Scale Factory)
---
From the [Kubernetes Multicontainer Pods: An Overview blog post](/blog/2025/04/22/multi-container-pods-overview/) you know what their job is, what the main architectural patterns are, and how they are implemented in Kubernetes. The main thing I'll cover in this article is how to ensure that your sidecar containers start before the main app. It's more complicated than you might think!
## A gentle refresher
I'd just like to remind readers that the [v1.29.0 release of Kubernetes](/blog/2023/12/13/kubernetes-v1-29-release/) added native support for
[sidecar containers](/docs/concepts/workloads/pods/sidecar-containers/), which can now be defined within the `.spec.initContainers` field,
but with `restartPolicy: Always`. You can see that illustrated in the following example Pod manifest snippet:
```yaml
initContainers:
- name: logshipper
image: alpine:latest
restartPolicy: Always # this is what makes it a sidecar container
command: ['sh', '-c', 'tail -F /opt/logs.txt']
volumeMounts:
- name: data
mountPath: /opt
```
What are the specifics of defining sidecars with a `.spec.initContainers` block, rather than as a legacy multi-container pod with multiple `.spec.containers`?
Well, all `.spec.initContainers` are always launched **before** the main application. If you define Kubernetes-native sidecars, those are terminated **after** the main application. Furthermore, when used with [Jobs](/docs/concepts/workloads/controllers/job/), a sidecar container should still be alive and could potentially even restart after the owning Job is complete; Kubernetes-native sidecar containers do not block pod completion.
To learn more, you can also read the official [Pod sidecar containers tutorial](/docs/tutorials/configuration/pod-sidecar-containers/).
## The problem
Now you know that defining a sidecar with this native approach will always start it before the main application. From the [kubelet source code](https://github.com/kubernetes/kubernetes/blob/537a602195efdc04cdf2cb0368792afad082d9fd/pkg/kubelet/kuberuntime/kuberuntime_manager.go#L827-L830), it's visible that this often means being started almost in parallel, and this is not always what an engineer wants to achieve. What I'm really interested in is whether I can delay the start of the main application until the sidecar is not just started, but fully running and ready to serve.
It might be a bit tricky because the problem with sidecars is there's no obvious success signal, unlike init containers, which are designed to run only for a limited time and then exit. With an init container, exit status 0 is unambiguously "I succeeded". With a sidecar, there are lots of points at which you can say "a thing is running".
Starting one container only after the previous one is ready is part of a graceful deployment strategy, ensuring proper sequencing and stability during startup. It's also actually how I'd expect sidecar containers to work as well, to cover the scenario where the main application is dependent on the sidecar. For example, it may happen that an app errors out if the sidecar isn't available to serve requests (e.g., logging with DataDog). Sure, one could change the application code (and it would actually be the “best practice” solution), but sometimes they can't - and this post focuses on this use case.
I'll explain some ways that you might try, and show you what approaches will really work.
## Readiness probe
To check whether a Kubernetes-native sidecar delays the start of the main application until the sidecar is ready, let's simulate a short investigation. Firstly, I'll simulate a sidecar container which will never become ready, by implementing a readiness probe which will never succeed. As a reminder, a [readiness probe](/docs/concepts/configuration/liveness-readiness-startup-probes/) checks if the container is ready to start accepting traffic and therefore, whether the pod can be used as a backend for services.
(Unlike standard init containers, sidecar containers can have [probes](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/) so that the kubelet can supervise the sidecar and intervene if there are problems. For example, restarting a sidecar container if it fails a health check.)
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: myapp
labels:
app: myapp
spec:
replicas: 1
selector:
matchLabels:
app: myapp
template:
metadata:
labels:
app: myapp
spec:
containers:
- name: myapp
image: alpine:latest
command: ["sh", "-c", "sleep 3600"]
initContainers:
- name: nginx
image: nginx:latest
restartPolicy: Always
ports:
- containerPort: 80
protocol: TCP
readinessProbe:
exec:
command:
- /bin/sh
- -c
- exit 1 # this command always fails, keeping the container "Not Ready"
periodSeconds: 5
volumes:
- name: data
emptyDir: {}
```
The result is:
```console
controlplane $ kubectl get pods -w
NAME READY STATUS RESTARTS AGE
myapp-db5474f45-htgw5 1/2 Running 0 9m28s
controlplane $ kubectl describe pod myapp-db5474f45-htgw5
Name: myapp-db5474f45-htgw5
Namespace: default
(...)
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 17s default-scheduler Successfully assigned default/myapp-db5474f45-htgw5 to node01
Normal Pulling 16s kubelet Pulling image "nginx:latest"
Normal Pulled 16s kubelet Successfully pulled image "nginx:latest" in 163ms (163ms including waiting). Image size: 72080558 bytes.
Normal Created 16s kubelet Created container nginx
Normal Started 16s kubelet Started container nginx
Normal Pulling 15s kubelet Pulling image "alpine:latest"
Normal Pulled 15s kubelet Successfully pulled image "alpine:latest" in 159ms (160ms including waiting). Image size: 3652536 bytes.
Normal Created 15s kubelet Created container myapp
Normal Started 15s kubelet Started container myapp
Warning Unhealthy 1s (x6 over 15s) kubelet Readiness probe failed:
```
From these logs it's evident that only one container is ready - and I know it can't be the sidecar, because I've defined it so it'll never be ready (you can also check container statuses in `kubectl get pod -o json`). I also saw that myapp was started before the sidecar was ready. That was not the result I wanted to achieve; in this case, the main app container has a hard dependency on its sidecar.
## Maybe a startup probe?
To ensure that the sidecar is ready before the main app container starts, I can define a `startupProbe`. It will delay the start of the main container until the command is successfully executed (returns a `0` exit status). If you're wondering why I've added it to my `initContainer`, let's analyse what would happen if I'd added it to the myapp container. I wouldn't have guaranteed the probe would run before the main application code - and that code can potentially error out without the sidecar being up and running.
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: myapp
labels:
app: myapp
spec:
replicas: 1
selector:
matchLabels:
app: myapp
template:
metadata:
labels:
app: myapp
spec:
containers:
- name: myapp
image: alpine:latest
command: ["sh", "-c", "sleep 3600"]
initContainers:
- name: nginx
image: nginx:latest
ports:
- containerPort: 80
protocol: TCP
restartPolicy: Always
startupProbe:
httpGet:
path: /
port: 80
initialDelaySeconds: 5
periodSeconds: 30
failureThreshold: 10
timeoutSeconds: 20
volumes:
- name: data
emptyDir: {}
```
This results in 2/2 containers being ready and running, and from events, it can be inferred that the main application started only after nginx had already been started. But to confirm whether it waited for sidecar readiness, let's change the `startupProbe` to the exec type of command:
```yaml
startupProbe:
exec:
command:
- /bin/sh
- -c
- sleep 15
```
and run `kubectl get pods -w` to watch in real time whether the readiness of both containers only changes after a 15-second delay. Again, events confirm the main application starts after the sidecar.
That means that using the `startupProbe` with a correct `startupProbe.httpGet` request helps to delay the main application start until the sidecar is ready. It's not optimal, but it works.
## What about the postStart lifecycle hook?
Fun fact: using the `postStart` lifecycle hook block will also do the job, but I'd have to write my own mini-shell script, which is even less efficient.
```yaml
initContainers:
- name: nginx
image: nginx:latest
restartPolicy: Always
ports:
- containerPort: 80
protocol: TCP
lifecycle:
postStart:
exec:
command:
- /bin/sh
- -c
- |
echo "Waiting for readiness at http://localhost:80"
until curl -sf http://localhost:80; do
echo "Still waiting for http://localhost:80..."
sleep 5
done
echo "Service is ready at http://localhost:80"
```
## Liveness probe
An interesting exercise would be to check the sidecar container behavior with a [liveness probe](/docs/concepts/configuration/liveness-readiness-startup-probes/).
A liveness probe behaves and is configured similarly to a readiness probe - the only difference is that it doesn't affect the readiness of the container but restarts it if the probe fails.
```yaml
livenessProbe:
exec:
command:
- /bin/sh
- -c
- exit 1 # this command always fails, keeping the container "Not Ready"
periodSeconds: 5
```
After adding a liveness probe configured just like the earlier readiness probe, and checking the pod's events with `kubectl describe pod`, it's visible that the sidecar has a restart count above 0. Nevertheless, the main application is not restarted nor influenced at all, even though I'm aware that (in our imaginary worst-case scenario) it can error out when the sidecar is not there serving requests.
What if I'd used a `livenessProbe` without the `postStart` lifecycle hook? Both containers would be immediately ready: at the beginning, this behavior would not differ from the one without any additional probes, since the liveness probe doesn't affect readiness at all. After a while, the sidecar would begin to restart itself, but it wouldn't influence the main container.
## Findings summary
I'll summarize the startup behavior in the table below:
| Probe/Hook | Sidecar starts before the main app? | Main app waits for the sidecar to be ready? | What if the check doesn't pass? |
|----------------|----------------------------------------------------------|-----------------------------------------------------|----------------------------------------------------|
| `readinessProbe` | **Yes**, but it's almost in parallel (effectively **no**) | **No** | Sidecar is not ready; main app continues running |
| `livenessProbe` | **Yes**, but it's almost in parallel (effectively **no**) | **No** | Sidecar is restarted, main app continues running |
| `startupProbe` | **Yes** | **Yes** | Main app is not started |
| `postStart` | **Yes**, main app container starts after `postStart` completes | **Yes**, but you have to provide custom logic for that | Main app is not started |
To summarize: with sidecars often being a dependency of the main application, you may want to delay the start of the latter until the sidecar is healthy.
The ideal pattern is to start both containers simultaneously and have the app container logic delay at all levels, but it's not always possible. If that's what you need, you have to use the right kind of customization to the Pod definition. Thankfully, it's nice and quick, and you have the recipe ready above.
Happy deploying!

View File

@ -1,135 +0,0 @@
---
layout: blog
title: "Introducing Gateway API Inference Extension"
date: 2025-06-05
slug: introducing-gateway-api-inference-extension
draft: false
author: >
Daneyon Hansen (Solo.io),
Kaushik Mitra (Google),
Jiaxin Shan (Bytedance),
Kellen Swain (Google)
---
Modern generative AI and large language model (LLM) services create unique traffic-routing challenges
on Kubernetes. Unlike typical short-lived, stateless web requests, LLM inference sessions are often
long-running, resource-intensive, and partially stateful. For example, a single GPU-backed model server
may keep multiple inference sessions active and maintain in-memory token caches.
Traditional load balancers focused on HTTP path or round-robin lack the specialized capabilities needed
for these workloads. They also don't account for model identity or request criticality (e.g., interactive
chat vs. batch jobs). Organizations often patch together ad-hoc solutions, but a standardized approach
is missing.
## Gateway API Inference Extension
[Gateway API Inference Extension](https://gateway-api-inference-extension.sigs.k8s.io/) was created to address
this gap by building on the existing [Gateway API](https://gateway-api.sigs.k8s.io/), adding inference-specific
routing capabilities while retaining the familiar model of Gateways and HTTPRoutes. By adding an inference
extension to your existing gateway, you effectively transform it into an **Inference Gateway**, enabling you to
self-host GenAI/LLMs with a “model-as-a-service” mindset.
The project's goal is to improve and standardize routing to inference workloads across the ecosystem. Key
objectives include enabling model-aware routing, supporting per-request criticalities, facilitating safe model
roll-outs, and optimizing load balancing based on real-time model metrics. By achieving these, the project aims
to reduce latency and improve accelerator (GPU) utilization for AI workloads.
## How it works
The design introduces two new Custom Resources (CRDs) with distinct responsibilities, each aligning with a
specific user persona in the AI/ML serving workflow:
{{< figure src="inference-extension-resource-model.png" alt="Resource Model" class="diagram-large" clicktozoom="true" >}}
1. [InferencePool](https://gateway-api-inference-extension.sigs.k8s.io/api-types/inferencepool/)
Defines a pool of pods (model servers) running on shared compute (e.g., GPU nodes). The platform admin can
configure how these pods are deployed, scaled, and balanced. An InferencePool ensures consistent resource
usage and enforces platform-wide policies. An InferencePool is similar to a Service but specialized for AI/ML
serving needs and aware of the model-serving protocol.
2. [InferenceModel](https://gateway-api-inference-extension.sigs.k8s.io/api-types/inferencemodel/)
A user-facing model endpoint managed by AI/ML owners. It maps a public name (e.g., "gpt-4-chat") to the actual
model within an InferencePool. This lets workload owners specify which models (and optional fine-tuning) they
want served, plus a traffic-splitting or prioritization policy.
In summary, the InferenceModel API lets AI/ML owners manage what is served, while the InferencePool lets platform
operators manage where and how it's served.
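To make that division of responsibility concrete, here is a rough sketch of what the two resources could look like. Treat it as illustrative only: the API group and version (`inference.networking.x-k8s.io/v1alpha2`) and the exact field names are assumptions based on an early alpha of the project, so check the [project docs](https://gateway-api-inference-extension.sigs.k8s.io/) for the current schema.
```yaml
# Hypothetical example; field names and versions may differ from the current API.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  name: vllm-llama2-pool
spec:
  targetPortNumber: 8000        # port the model servers listen on
  selector:
    app: vllm-llama2            # pods (model servers) that belong to this pool
  extensionRef:
    name: vllm-llama2-epp       # the Endpoint Selection Extension for this pool
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: gpt-4-chat
spec:
  modelName: gpt-4-chat         # public model name that clients request
  criticality: Critical
  poolRef:
    name: vllm-llama2-pool      # InferencePool that serves this model
  targetModels:
  - name: llama2-chat-finetune  # actual model (or fine-tune) inside the pool
    weight: 100
```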
## Request flow
The flow of a request builds on the Gateway API model (Gateways and HTTPRoutes) with one or more extra inference-aware
steps (extensions) in the middle. Here's a high-level example of the request flow with the
[Endpoint Selection Extension (ESE)](https://gateway-api-inference-extension.sigs.k8s.io/#endpoint-selection-extension):
{{< figure src="inference-extension-request-flow.png" alt="Request Flow" class="diagram-large" clicktozoom="true" >}}
1. **Gateway Routing**
A client sends a request (e.g., an HTTP POST to /completions). The Gateway (like Envoy) examines the HTTPRoute
and identifies the matching InferencePool backend.
2. **Endpoint Selection**
Instead of simply forwarding to any available pod, the Gateway consults an inference-specific routing extension—
the Endpoint Selection Extension—to pick the best of the available pods. This extension examines live pod metrics
(queue lengths, memory usage, loaded adapters) to choose the ideal pod for the request.
3. **Inference-Aware Scheduling**
The chosen pod is the one that can handle the request with the lowest latency or highest efficiency, given the
user's criticality or resource needs. The Gateway then forwards traffic to that specific pod.
{{< figure src="inference-extension-epp-scheduling.png" alt="Endpoint Extension Scheduling" class="diagram-large" clicktozoom="true" >}}
This extra step provides a smarter, model-aware routing mechanism that still feels like a normal single request to
the client. Additionally, the design is extensible—any Inference Gateway can be enhanced with additional inference-specific
extensions to handle new routing strategies, advanced scheduling logic, or specialized hardware needs. As the project
continues to grow, contributors are encouraged to develop new extensions that are fully compatible with the same underlying
Gateway API model, further expanding the possibilities for efficient and intelligent GenAI/LLM routing.
## Benchmarks
We evaluated this extension against a standard Kubernetes Service for a [vLLM](https://docs.vllm.ai/en/latest/)-based model
serving deployment. The test environment consisted of multiple H100 (80 GB) GPU pods running vLLM ([version 1](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html))
on a Kubernetes cluster, with 10 Llama2 model replicas. The [Latency Profile Generator (LPG)](https://github.com/AI-Hypercomputer/inference-benchmark)
tool was used to generate traffic and measure throughput, latency, and other metrics. The
[ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json)
dataset served as the workload, and traffic was ramped from 100 Queries per Second (QPS) up to 1000 QPS.
### Key results
{{< figure src="inference-extension-benchmark.png" alt="Endpoint Extension Scheduling" class="diagram-large" clicktozoom="true" >}}
- **Comparable Throughput**: Throughout the tested QPS range, the ESE delivered throughput roughly on par with a standard
Kubernetes Service.
- **Lower Latency**:
  - **Per-Output-Token Latency**: The ESE showed significantly lower p90 latency at higher QPS (500+), indicating that
    its model-aware routing decisions reduce queueing and resource contention as GPU memory approaches saturation.
  - **Overall p90 Latency**: Similar trends emerged, with the ESE reducing end-to-end tail latencies compared to the
    baseline, particularly as traffic increased beyond 400-500 QPS.
These results suggest that this extension's model-aware routing significantly reduced latency for GPU-backed LLM
workloads. By dynamically selecting the least-loaded or best-performing model server, it avoids hotspots that can
appear when using traditional load balancing methods for large, long-running inference requests.
## Roadmap
As the Gateway API Inference Extension heads toward GA, planned features include:
1. **Prefix-cache aware load balancing** for remote caches
2. **LoRA adapter pipelines** for automated rollout
3. **Fairness and priority** between workloads in the same criticality band
4. **HPA support** for scaling based on aggregate, per-model metrics
5. **Support for large multi-modal inputs/outputs**
6. **Additional model types** (e.g., diffusion models)
7. **Heterogeneous accelerators** (serving on multiple accelerator types with latency- and cost-aware load balancing)
8. **Disaggregated serving** for independently scaling pools
## Summary
By aligning model serving with Kubernetes-native tooling, Gateway API Inference Extension aims to simplify
and standardize how AI/ML traffic is routed. With model-aware routing, criticality-based prioritization, and
more, it helps ops teams deliver the right LLM services to the right users—smoothly and efficiently.
**Ready to learn more?** Visit the [project docs](https://gateway-api-inference-extension.sigs.k8s.io/) to dive deeper,
give an Inference Gateway extension a try with a few [simple steps](https://gateway-api-inference-extension.sigs.k8s.io/guides/),
and [get involved](https://gateway-api-inference-extension.sigs.k8s.io/contributing/) if you're interested in
contributing to the project!

View File

@ -1,333 +0,0 @@
---
layout: blog
title: "Enhancing Kubernetes Event Management with Custom Aggregation"
date: 2025-06-10
draft: false
slug: enhancing-kubernetes-event-management-custom-aggregation
Author: >
[Rez Moss](https://github.com/rezmoss)
---
Kubernetes [Events](/docs/reference/kubernetes-api/cluster-resources/event-v1/) provide crucial insights into cluster operations, but as clusters grow, managing and analyzing these events becomes increasingly challenging. This blog post explores how to build custom event aggregation systems that help engineering teams better understand cluster behavior and troubleshoot issues more effectively.
## The challenge with Kubernetes events
In a Kubernetes cluster, events are generated for various operations - from pod scheduling and container starts to volume mounts and network configurations. While these events are invaluable for debugging and monitoring, several challenges emerge in production environments:
1. **Volume**: Large clusters can generate thousands of events per minute
2. **Retention**: Default event retention is limited to one hour
3. **Correlation**: Related events from different components are not automatically linked
4. **Classification**: Events lack standardized severity or category classifications
5. **Aggregation**: Similar events are not automatically grouped
To learn more about Events in Kubernetes, read the [Event](/docs/reference/kubernetes-api/cluster-resources/event-v1/) API reference.
## Real-World value
Consider a production environment with tens of microservices where users report intermittent transaction failures:
**Traditional event aggregation process:** Engineers waste hours sifting through thousands of standalone events spread across namespaces. By the time they look into it, the older events have long since been purged, and correlating pod restarts to node-level issues is practically impossible.
**With custom event aggregation:** The system groups events across resources, instantly surfacing correlation patterns such as volume mount timeouts preceding pod restarts. Historical data shows the same pattern occurred during past record traffic spikes, highlighting a storage scalability issue in minutes rather than hours.
The benefit of this approach is that organizations that implement it commonly cut their troubleshooting time significantly while improving system reliability by detecting patterns early.
## Building an Event aggregation system
This post explores how to build a custom event aggregation system that addresses these challenges, aligned to Kubernetes best practices. I've picked the Go programming language for my example.
### Architecture overview
This event aggregation system consists of three main components:
1. **Event Watcher**: Monitors the Kubernetes API for new events
2. **Event Processor**: Processes, categorizes, and correlates events
3. **Storage Backend**: Stores processed events for longer retention
Here's a sketch for how to implement the event watcher:
```go
package main
import (
"context"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
eventsv1 "k8s.io/api/events/v1"
)
type EventWatcher struct {
clientset *kubernetes.Clientset
}
func NewEventWatcher(config *rest.Config) (*EventWatcher, error) {
clientset, err := kubernetes.NewForConfig(config)
if err != nil {
return nil, err
}
return &EventWatcher{clientset: clientset}, nil
}
func (w *EventWatcher) Watch(ctx context.Context) (<-chan *eventsv1.Event, error) {
events := make(chan *eventsv1.Event)
watcher, err := w.clientset.EventsV1().Events("").Watch(ctx, metav1.ListOptions{})
if err != nil {
return nil, err
}
go func() {
defer close(events)
for {
select {
case event := <-watcher.ResultChan():
if e, ok := event.Object.(*eventsv1.Event); ok {
events <- e
}
case <-ctx.Done():
watcher.Stop()
return
}
}
}()
return events, nil
}
```
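As a quick usage sketch (not part of the watcher itself), the `EventWatcher` above could be wired up like this; it assumes the program runs in-cluster and additionally imports the standard `log` package:
```go
// Minimal wiring of the EventWatcher defined above; assumes in-cluster configuration.
func main() {
	config, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}

	watcher, err := NewEventWatcher(config)
	if err != nil {
		panic(err)
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	events, err := watcher.Watch(ctx)
	if err != nil {
		panic(err)
	}

	// In the full system, each event would be handed to the processor and
	// storage backend described below.
	for event := range events {
		log.Printf("observed event: %s/%s reason=%s",
			event.Regarding.Namespace, event.Regarding.Name, event.Reason)
	}
}
```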
### Event processing and classification
The event processor enriches events with additional context and classification:
```go
type EventProcessor struct {
categoryRules []CategoryRule
correlationRules []CorrelationRule
}
type ProcessedEvent struct {
Event *eventsv1.Event
Category string
Severity string
CorrelationID string
Metadata map[string]string
}
func (p *EventProcessor) Process(event *eventsv1.Event) *ProcessedEvent {
processed := &ProcessedEvent{
Event: event,
Metadata: make(map[string]string),
}
// Apply classification rules
processed.Category = p.classifyEvent(event)
processed.Severity = p.determineSeverity(event)
// Generate correlation ID for related events
processed.CorrelationID = p.correlateEvent(event)
// Add useful metadata
processed.Metadata = p.extractMetadata(event)
return processed
}
```
### Implementing Event correlation
One of the key features you could implement is a way of correlating related Events.
Here's an example correlation strategy:
```go
func (p *EventProcessor) correlateEvent(event *eventsv1.Event) string {
// Correlation strategies:
// 1. Time-based: Events within a time window
// 2. Resource-based: Events affecting the same resource
// 3. Causation-based: Events with cause-effect relationships
correlationKey := generateCorrelationKey(event)
return correlationKey
}
func generateCorrelationKey(event *eventsv1.Event) string {
	// Example: Combine namespace, resource type, and name.
	// Note: events.k8s.io/v1 Events reference the affected object via the
	// Regarding field (InvolvedObject belongs to the core/v1 Event type).
	return fmt.Sprintf("%s/%s/%s",
		event.Regarding.Namespace,
		event.Regarding.Kind,
		event.Regarding.Name,
	)
}
```
## Event storage and retention
For long-term storage and analysis, you'll probably want a backend that supports:
- Efficient querying of large event volumes
- Flexible retention policies
- Support for aggregation queries
Here's a sample storage interface:
```go
type EventStorage interface {
Store(context.Context, *ProcessedEvent) error
Query(context.Context, EventQuery) ([]ProcessedEvent, error)
Aggregate(context.Context, AggregationParams) ([]EventAggregate, error)
}
type EventQuery struct {
TimeRange TimeRange
Categories []string
Severity []string
CorrelationID string
Limit int
}
type AggregationParams struct {
GroupBy []string
TimeWindow string
Metrics []string
}
```
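As a starting point, here is a minimal, illustrative in-memory implementation of that interface, useful only for local experimentation; it assumes a `sync` import, honors just part of `EventQuery`, and stubs out `Aggregate`. A production backend would typically be a time-series or document store instead.
```go
// InMemoryStorage is a toy EventStorage implementation for local testing only.
type InMemoryStorage struct {
	mu     sync.Mutex
	events []ProcessedEvent
}

func (s *InMemoryStorage) Store(ctx context.Context, e *ProcessedEvent) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.events = append(s.events, *e)
	return nil
}

func (s *InMemoryStorage) Query(ctx context.Context, q EventQuery) ([]ProcessedEvent, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	var out []ProcessedEvent
	for _, e := range s.events {
		// Only a subset of EventQuery is honored in this sketch.
		if q.CorrelationID != "" && e.CorrelationID != q.CorrelationID {
			continue
		}
		out = append(out, e)
		if q.Limit > 0 && len(out) >= q.Limit {
			break
		}
	}
	return out, nil
}

func (s *InMemoryStorage) Aggregate(ctx context.Context, p AggregationParams) ([]EventAggregate, error) {
	// Left as an exercise; real backends would push aggregation into the store.
	return nil, nil
}
```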
## Good practices for Event management
1. **Resource Efficiency**
- Implement rate limiting for event processing
- Use efficient filtering at the API server level
- Batch events for storage operations
2. **Scalability**
- Distribute event processing across multiple workers
- Use leader election for coordination
- Implement backoff strategies for API rate limits
3. **Reliability**
- Handle API server disconnections gracefully
- Buffer events during storage backend unavailability
- Implement retry mechanisms with exponential backoff
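As an illustration of the rate-limiting point above, a token-bucket limiter such as `golang.org/x/time/rate` can sit between the watcher and the processor. The sketch below assumes the `events` channel, `processor`, and `storage` from the earlier snippets are in scope, and the 50 events per second with a burst of 100 are arbitrary values:
```go
// Assumes: import "golang.org/x/time/rate"
// Process at most 50 events per second with a burst of 100 (illustrative values).
limiter := rate.NewLimiter(rate.Limit(50), 100)

for event := range events {
	if err := limiter.Wait(ctx); err != nil {
		break // context cancelled
	}
	processed := processor.Process(event)
	if err := storage.Store(ctx, processed); err != nil {
		log.Printf("failed to store event: %v", err)
	}
}
```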
## Advanced features
### Pattern detection
Implement pattern detection to identify recurring issues:
```go
type PatternDetector struct {
patterns map[string]*Pattern
threshold int
}
func (d *PatternDetector) Detect(events []ProcessedEvent) []Pattern {
// Group similar events
groups := groupSimilarEvents(events)
// Analyze frequency and timing
patterns := identifyPatterns(groups)
return patterns
}
func groupSimilarEvents(events []ProcessedEvent) map[string][]ProcessedEvent {
groups := make(map[string][]ProcessedEvent)
for _, event := range events {
// Create similarity key based on event characteristics
		// events.k8s.io/v1 uses Regarding (not InvolvedObject) for the affected object.
		similarityKey := fmt.Sprintf("%s:%s:%s",
			event.Event.Reason,
			event.Event.Regarding.Kind,
			event.Event.Regarding.Namespace,
)
// Group events with the same key
groups[similarityKey] = append(groups[similarityKey], event)
}
return groups
}
func identifyPatterns(groups map[string][]ProcessedEvent) []Pattern {
var patterns []Pattern
for key, events := range groups {
// Only consider groups with enough events to form a pattern
if len(events) < 3 {
continue
}
		// Sort events by time. The events.k8s.io/v1 API carries the classic
		// timestamps in DeprecatedFirstTimestamp/DeprecatedLastTimestamp;
		// events created via events/v1 may only set EventTime and Series.
		sort.Slice(events, func(i, j int) bool {
			return events[i].Event.DeprecatedLastTimestamp.Time.Before(events[j].Event.DeprecatedLastTimestamp.Time)
		})
		// Calculate time range and frequency
		firstSeen := events[0].Event.DeprecatedFirstTimestamp.Time
		lastSeen := events[len(events)-1].Event.DeprecatedLastTimestamp.Time
duration := lastSeen.Sub(firstSeen).Minutes()
var frequency float64
if duration > 0 {
frequency = float64(len(events)) / duration
}
// Create a pattern if it meets threshold criteria
if frequency > 0.5 { // More than 1 event per 2 minutes
pattern := Pattern{
Type: key,
Count: len(events),
FirstSeen: firstSeen,
LastSeen: lastSeen,
Frequency: frequency,
EventSamples: events[:min(3, len(events))], // Keep up to 3 samples
}
patterns = append(patterns, pattern)
}
}
return patterns
}
```
With this implementation, the system can identify recurring patterns such as node pressure events, pod scheduling failures, or networking issues that occur with a specific frequency.
### Real-time alerts
The following example provides a starting point for building an alerting system based on event patterns. It is not a complete solution but a conceptual sketch to illustrate the approach.
```go
type AlertManager struct {
rules []AlertRule
notifiers []Notifier
}
func (a *AlertManager) EvaluateEvents(events []ProcessedEvent) {
for _, rule := range a.rules {
if rule.Matches(events) {
alert := rule.GenerateAlert(events)
a.notify(alert)
}
}
}
```
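The `AlertRule` and `Notifier` types are deliberately left abstract above. One possible minimal shape for them, with a purely illustrative threshold-based rule (and assuming `fmt` and `log` imports), might be:
```go
type Alert struct {
	Message  string
	Severity string
}

type Notifier interface {
	Notify(Alert) error
}

type AlertRule interface {
	Matches(events []ProcessedEvent) bool
	GenerateAlert(events []ProcessedEvent) Alert
}

// ThresholdRule fires when too many events of a given category arrive.
type ThresholdRule struct {
	Category  string
	Threshold int
}

func (r ThresholdRule) Matches(events []ProcessedEvent) bool {
	count := 0
	for _, e := range events {
		if e.Category == r.Category {
			count++
		}
	}
	return count >= r.Threshold
}

func (r ThresholdRule) GenerateAlert(events []ProcessedEvent) Alert {
	return Alert{
		Message:  fmt.Sprintf("category %q exceeded threshold of %d events", r.Category, r.Threshold),
		Severity: "warning",
	}
}

func (a *AlertManager) notify(alert Alert) {
	for _, n := range a.notifiers {
		if err := n.Notify(alert); err != nil {
			log.Printf("failed to send alert: %v", err)
		}
	}
}
```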
## Conclusion
A well-designed event aggregation system can significantly improve cluster observability and troubleshooting capabilities. By implementing custom event processing, correlation, and storage, operators can better understand cluster behavior and respond to issues more effectively.
The solutions presented here can be extended and customized based on specific requirements while maintaining compatibility with the Kubernetes API and following best practices for scalability and reliability.
## Next steps
Future enhancements could include:
- Machine learning for anomaly detection
- Integration with popular observability platforms
- Custom event APIs for application-specific events
- Enhanced visualization and reporting capabilities
For more information on Kubernetes events and custom [controllers](/docs/concepts/architecture/controller/),
refer to the official Kubernetes [documentation](/docs/).

View File

@ -1,23 +0,0 @@
---
layout: blog
title: "Changes to Kubernetes Slack"
date: 2025-06-16
canonicalUrl: https://www.kubernetes.dev/blog/2025/06/16/changes-to-kubernetes-slack-2025/
slug: changes-to-kubernetes-slack
Author: >
[Josh Berkus](https://github.com/jberkus)
---
**UPDATE**: We've received notice from Salesforce that our Slack workspace **WILL NOT BE DOWNGRADED** on June 20th. Stand by for more details, but for now, there is no urgency to back up private channels or direct messages.
~~Kubernetes Slack will lose its special status and will be changing into a standard free Slack on June 20, 2025~~. Sometime later this year, our community may move to a new platform. If you are responsible for a channel or private channel, or a member of a User Group, you will need to take some actions as soon as you can.
For the last decade, Slack has supported our project with a free customized enterprise account. They have let us know that they can no longer do so, particularly since our Slack is one of the largest and most active ones on the platform. As such, they will be downgrading it to a standard free Slack while we decide on, and implement, other options.
On Friday, June 20, we will be subject to the [feature limitations of free Slack](https://slack.com/help/articles/27204752526611-Feature-limitations-on-the-free-version-of-Slack). The primary ones which will affect us will be only retaining 90 days of history, and having to disable several apps and workflows which we are currently using. The Slack Admin team will do their best to manage these limitations.
Responsible channel owners, members of private channels, and members of User Groups should [take some actions](https://github.com/kubernetes/community/blob/master/communication/slack-migration-faq.md#what-actions-do-channel-owners-and-user-group-members-need-to-take-soon) to prepare for the downgrade and preserve information as soon as possible.
The CNCF Projects Staff have proposed that our community look at migrating to Discord. Because of existing issues where we have been pushing the limits of Slack, they have already explored what a Kubernetes Discord would look like. Discord would allow us to implement new tools and integrations which would help the community, such as GitHub group membership synchronization. The Steering Committee will discuss and decide on our future platform.
Please see our [FAQ](https://github.com/kubernetes/community/blob/master/communication/slack-migration-faq.md), and check the [kubernetes-dev mailing list](https://groups.google.com/a/kubernetes.io/g/dev/) and the [#announcements channel](https://kubernetes.slack.com/archives/C9T0QMNG4) for further news. If you have specific feedback on our Slack status join the [discussion on GitHub](https://github.com/kubernetes/community/issues/8490).

View File

@ -1,170 +0,0 @@
---
layout: blog
title: "Image Compatibility In Cloud Native Environments"
date: 2025-06-25
draft: false
slug: image-compatibility-in-cloud-native-environments
author: >
Chaoyi Huang (Huawei),
Marcin Franczyk (Huawei),
Vanessa Sochat (Lawrence Livermore National Laboratory)
---
In industries where systems must run very reliably and meet strict performance criteria, such as telecommunications, high-performance computing, or AI, containerized applications often need a specific operating system configuration or the presence of specific hardware.
It is common practice to require the use of specific versions of the kernel, its configuration, device drivers, or system components.
Despite the existence of the [Open Container Initiative (OCI)](https://opencontainers.org/), a governing community to define standards and specifications for container images, there has been a gap in expression of such compatibility requirements.
The need to address this issue has led to different proposals and, ultimately, an implementation in Kubernetes' [Node Feature Discovery (NFD)](https://kubernetes-sigs.github.io/node-feature-discovery/stable/get-started/index.html).
[NFD](https://kubernetes-sigs.github.io/node-feature-discovery/stable/get-started/index.html) is an open source Kubernetes project that automatically detects and reports [hardware and system features](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/usage/customization-guide.html#available-features) of cluster nodes. This information helps users to schedule workloads on nodes that meet specific system requirements, which is especially useful for applications with strict hardware or operating system dependencies.
## The need for image compatibility specification
### Dependencies between containers and host OS
A container image is built on a base image, which provides a minimal runtime environment, often a stripped-down Linux userland, completely empty or distroless. When an application requires certain features from the host OS, compatibility issues arise. These dependencies can manifest in several ways:
- **Drivers**:
Host driver versions must match the supported range of a library version inside the container to avoid compatibility problems. Examples include GPUs and network drivers.
- **Libraries or Software**:
The container must come with a specific version or range of versions for a library or software to run optimally in the environment. Examples from high performance computing are MPI, EFA, or Infiniband.
- **Kernel Modules or Features**:
Specific kernel features or modules must be present. Examples include support for write-protected huge page faults, or the presence of VFIO.
- And more…
While containers in Kubernetes are the most likely unit of abstraction for these needs, the definition of compatibility can extend further to include other container technologies such as Singularity and other OCI artifacts such as binaries from a spack binary cache.
### Multi-cloud and hybrid cloud challenges
Containerized applications are deployed across various Kubernetes distributions and cloud providers, where different host operating systems introduce compatibility challenges.
Often those have to be pre-configured before workload deployment or are immutable.
For instance, different cloud providers will include different operating systems like:
- **RHCOS/RHEL**
- **Photon OS**
- **Amazon Linux 2**
- **Container-Optimized OS**
- **Azure Linux OS**
- And more...
Each OS comes with unique kernel versions, configurations, and drivers, making compatibility a non-trivial issue for applications requiring specific features.
It must be possible to quickly assess a container for its suitability to run on any specific environment.
### Image compatibility initiative
An effort was made within the [Open Containers Initiative Image Compatibility](https://github.com/opencontainers/wg-image-compatibility) working group to introduce a standard for image compatibility metadata.
A specification for compatibility would allow container authors to declare required host OS features, making compatibility requirements discoverable and programmable.
The specification implemented in Kubernetes Node Feature Discovery is one of the discussed proposals.
It aims to:
- **Define a structured way to express compatibility in OCI image manifests.**
- **Support a compatibility specification alongside container images in image registries.**
- **Allow automated validation of compatibility before scheduling containers.**
The concept has since been implemented in the Kubernetes Node Feature Discovery project.
### Implementation in Node Feature Discovery
The solution integrates compatibility metadata into Kubernetes via NFD features and the [NodeFeatureGroup](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/usage/custom-resources.html#nodefeaturegroup) API.
This interface enables the user to match containers to nodes based on exposing features of hardware and software, allowing for intelligent scheduling and workload optimization.
### Compatibility specification
The compatibility specification is a structured list of compatibility objects containing *[Node Feature Groups](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/usage/custom-resources.html#nodefeaturegroup)*.
These objects define image requirements and facilitate validation against host nodes.
The feature requirements are described by using [the list of available features](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/usage/customization-guide.html#available-features) from the NFD project.
The schema has the following structure:
- **version** (string) - Specifies the API version.
- **compatibilities** (array of objects) - List of compatibility sets, each with:
  - **rules** (object) - Specifies [NodeFeatureGroup](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/usage/custom-resources.html#nodefeaturegroup) to define image requirements.
  - **weight** (int, optional) - Node affinity weight.
  - **tag** (string, optional) - Categorization tag.
  - **description** (string, optional) - Short description.
An example might look like the following:
```yaml
version: v1alpha1
compatibilities:
- description: "My image requirements"
rules:
- name: "kernel and cpu"
matchFeatures:
- feature: kernel.loadedmodule
matchExpressions:
vfio-pci: {op: Exists}
- feature: cpu.model
matchExpressions:
vendor_id: {op: In, value: ["Intel", "AMD"]}
- name: "one of available nics"
matchAny:
- matchFeatures:
- feature: pci.device
matchExpressions:
vendor: {op: In, value: ["0eee"]}
class: {op: In, value: ["0200"]}
- matchFeatures:
- feature: pci.device
matchExpressions:
vendor: {op: In, value: ["0fff"]}
class: {op: In, value: ["0200"]}
```
### Client implementation for node validation
To streamline compatibility validation, we implemented a [client tool](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/reference/node-feature-client-reference.html) that allows for node validation based on an image's compatibility artifact.
In this workflow, the image author would generate a compatibility artifact that points to the image it describes in a registry via the referrers API.
When a need arises to assess the fit of an image to a host, the tool can discover the artifact and verify compatibility of an image to a node before deployment.
The client can validate nodes both inside and outside a Kubernetes cluster, extending the utility of the tool beyond the single Kubernetes use case.
In the future, image compatibility could play a crucial role in creating specific workload profiles based on image compatibility requirements, aiding in more efficient scheduling.
Additionally, it could potentially enable automatic node configuration to some extent, further optimizing resource allocation and ensuring seamless deployment of specialized workloads.
### Examples of usage
1. **Define image compatibility metadata**
A [container image](/docs/concepts/containers/images) can have metadata that describes
its requirements based on features discovered from nodes, like kernel modules or CPU models.
The previous compatibility specification example in this article exemplified this use case.
2. **Attach the artifact to the image**
The image compatibility specification is stored as an OCI artifact.
You can attach this metadata to your container image using the [oras](https://oras.land/) tool.
The registry only needs to support OCI artifacts; support for arbitrary types is not required.
Keep in mind that the container image and the artifact must be stored in the same registry.
Use the following command to attach the artifact to the image:
```bash
oras attach \
--artifact-type application/vnd.nfd.image-compatibility.v1alpha1 <image-url> \
<path-to-spec>.yaml:application/vnd.nfd.image-compatibility.spec.v1alpha1+yaml
```
3. **Validate image compatibility**
After attaching the compatibility specification, you can validate whether a node meets the
image's requirements. This validation can be done using the
[nfd client](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/reference/node-feature-client-reference.html):
```bash
nfd compat validate-node --image <image-url>
```
4. **Read the output from the client**
Finally you can read the report generated by the tool or use your own tools to act based on the generated JSON report.
![validate-node command output](validate-node-output.png)
## Conclusion
The addition of image compatibility to Kubernetes through Node Feature Discovery underscores the growing importance of addressing compatibility in cloud native environments.
It is only a start, as further work is needed to integrate compatibility into scheduling of workloads within and outside of Kubernetes.
However, by integrating this feature into Kubernetes, mission-critical workloads can now define and validate host OS requirements more efficiently.
Moving forward, the adoption of compatibility metadata within Kubernetes ecosystems will significantly enhance the reliability and performance of specialized containerized applications, ensuring they meet the stringent requirements of industries like telecommunications, high-performance computing or any environment that requires special hardware or host OS configuration.
## Get involved
Join the [Kubernetes Node Feature Discovery](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/contributing/) project if you're interested in getting involved with the design and development of Image Compatibility API and tools.
We always welcome new contributors.

View File

@ -1,410 +0,0 @@
---
layout: blog
title: "Navigating Failures in Pods With Devices"
date: 2025-07-03
slug: navigating-failures-in-pods-with-devices
draft: false
author: >
Sergey Kanzhelev (Google)
Mrunal Patel (RedHat)
---
Kubernetes is the de facto standard for container orchestration, but when it
comes to handling specialized hardware like GPUs and other accelerators, things
get a bit complicated. This blog post dives into the challenges of managing
failure modes when operating pods with devices in Kubernetes, based on insights
from [Sergey Kanzhelev and Mrunal Patel's talk at KubeCon NA
2024](https://sched.co/1i7pT). You can follow the links to
[slides](https://static.sched.com/hosted_files/kccncna2024/b9/KubeCon%20NA%202024_%20Navigating%20Failures%20in%20Pods%20With%20Devices_%20Challenges%20and%20Solutions.pptx.pdf?_gl=1*191m4j5*_gcl_au*MTU1MDM0MTM1My4xNzMwOTE4ODY5LjIxNDI4Nzk1NDIuMTczMTY0ODgyMC4xNzMxNjQ4ODIy*FPAU*MTU1MDM0MTM1My4xNzMwOTE4ODY5)
and
[recording](https://www.youtube.com/watch?v=-YCnOYTtVO8&list=PLj6h78yzYM2Pw4mRw4S-1p_xLARMqPkA7&index=150).
## The AI/ML boom and its impact on Kubernetes
The rise of AI/ML workloads has brought new challenges to Kubernetes. These
workloads often rely heavily on specialized hardware, and any device failure can
significantly impact performance and lead to frustrating interruptions. As
highlighted in the 2024 [Llama
paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/),
hardware issues, particularly GPU failures, are a major cause of disruption in
AI/ML training. You can also learn how much effort NVIDIA spends on handling
devices failures and maintenance in the KubeCon talk by [Ryan Hallisey and Piotr
Prokop All-Your-GPUs-Are-Belong-to-Us: An Inside Look at NVIDIA's Self-Healing
GeForce NOW
Infrastructure](https://kccncna2024.sched.com/event/1i7kJ/all-your-gpus-are-belong-to-us-an-inside-look-at-nvidias-self-healing-geforce-now-infrastructure-ryan-hallisey-piotr-prokop-pl-nvidia)
([recording](https://www.youtube.com/watch?v=iLnHtKwmu2I)) as they see 19
remediation requests per 1000 nodes a day!
We also see data centers offering spot consumption models and overcommit on
power, making device failures commonplace and a part of the business model.
However, Kubernetess view on resources is still very static. The resource is
either there or not. And if it is there, the assumption is that it will stay
there fully functional - Kubernetes lacks good support for handling full or partial
hardware failures. These long-existing assumptions combined with the overall complexity of a setup lead
to a variety of failure modes, which we discuss here.
### Understanding AI/ML workloads
Generally, all AI/ML workloads require specialized hardware, have challenging
scheduling requirements, and are expensive when idle. AI/ML workloads typically
fall into two categories - training and inference. Here is an oversimplified
view of those categories characteristics, which are different from traditional workloads
like web services:
Training
: These workloads are resource-intensive, often consuming entire
machines and running as gangs of pods. Training jobs are usually "run to
completion" - but that could be days, weeks or even months. Any failure in a
single pod can necessitate restarting the entire step across all the pods.
Inference
: These workloads are usually long-running or run indefinitely,
and can be small enough to consume a subset of a Node's devices or large enough to span
multiple nodes. They often require downloading huge files with the model
weights.
These workload types specifically break many past assumptions:
{{< table caption="Workload assumptions before and now" >}}
| Before | Now |
| :---- | :---- |
| Can get a better CPU and the app will work faster. | Require a **specific** device (or **class of devices**) to run. |
| When something doesn't work, just recreate it. | Allocation or reallocation is expensive. |
| Any node will work. No need to coordinate between Pods. | Scheduled in a special way - devices often connected in a cross-node topology. |
| Each Pod can be plug-and-play replaced if failed. | Pods are a part of a larger task. Lifecycle of an entire task depends on each Pod. |
| Container images are slim and easily available. | Container images may be so big that they require special handling. |
| Long initialization can be offset by slow rollout. | Initialization may be long and should be optimized, sometimes across many Pods together. |
| Compute nodes are commoditized and relatively inexpensive, so some idle time is acceptable. | Nodes with specialized hardware can be an order of magnitude more expensive than those without, so idle time is very wasteful. |
{{< /table >}}
The existing failure model was relying on old assumptions. It may still work for
the new workload types, but it has limited knowledge about devices and is very
expensive for them. In some cases, even prohibitively expensive. You will see
more examples later in this article.
### Why Kubernetes still reigns supreme
This article will not go deeper into the question of why we don't start fresh for
AI/ML workloads, given how different they are from traditional Kubernetes
workloads. Despite many challenges, Kubernetes remains the platform of choice
for AI/ML workloads. Its maturity, security, and rich ecosystem of tools make it
a compelling option. While alternatives exist, they often lack the years of
development and refinement that Kubernetes offers. And the Kubernetes developers
are actively addressing the gaps identified in this article and beyond.
## The current state of device failure handling
This section outlines different failure modes and the best practices and DIY
(Do-It-Yourself) solutions used today. The next section describes a roadmap
for improving things for those failure modes.
### Failure modes: K8s infrastructure
In order to understand the failures related to the Kubernetes infrastructure,
you need to understand how many moving parts are involved in scheduling a Pod on
the node. The sequence of events when the Pod is scheduled in the Node is as
follows:
1. *Device plugin* is scheduled on the Node
1. *Device plugin* is registered with the *kubelet* via local gRPC
1. *Kubelet* uses *device plugin* to watch for devices and updates capacity of
the node
1. *Scheduler* places a *user Pod* on a Node based on the updated capacity
1. *Kubelet* asks *Device plugin* to **Allocate** devices for a *User Pod*
1. *Kubelet* creates a *User Pod* with the allocated devices attached to it
This diagram shows some of those actors involved:
{{< figure src="k8s-infra-devices.svg" alt="The diagram shows relationships between the kubelet, Device plugin, and a user Pod. It shows that kubelet connects to the Device plugin named my-device, kubelet reports the node status with the my-device availability, and the user Pod requesting the 2 of my-device." >}}
As there are so many actors interconnected, every one of them and every
connection may experience interruptions. This leads to many exceptional
situations that are often considered failures, and may cause serious workload
interruptions:
* Pods failing admission at various stages of its lifecycle
* Pods unable to run on perfectly fine hardware
* Scheduling taking an unexpectedly long time
{{< figure src="k8s-infra-failures.svg" alt="The same diagram as one above it, however it has an overlayed orange bang drawings over individual components with the text indicating what can break in that component. Over the kubelet text reads: 'kubelet restart: looses all devices info before re-Watch'. Over the Device plugin text reads: 'device plugin update, evictIon, restart: kubelet cannot Allocate devices or loses all devices state'. Over the user Pod text reads: 'slow pod termination: devices are unavailable'." >}}
The goal for Kubernetes is to make the interruption between these components as
reliable as possible. Kubelet already implements retries, grace periods, and
other techniques to improve it. The roadmap section goes into details on other
edge cases that the Kubernetes project tracks. However, all these improvements
only work when these best practices are followed:
* Configure and restart kubelet and the container runtime (such as containerd or CRI-O)
as early as possible to not interrupt the workload.
* Monitor device plugin health and carefully plan for upgrades.
* Do not overload the node with less-important workloads to prevent interruption
of device plugin and other components.
* Configure user pods tolerations to handle node readiness flakes.
* Configure and code graceful termination logic carefully to not block devices
for too long.
Another class of Kubernetes infra-related issues is driver-related. With
traditional resources like CPU and memory, no compatibility checks between the
application and hardware were needed. With special devices like hardware
accelerators, there are new failure modes. Device drivers installed on the node:
* Must match the hardware
* Must be compatible with the app
* Must work with other drivers (like [nccl](https://developer.nvidia.com/nccl),
etc.)
Best practices for handling driver versions:
* Monitor driver installer health
* Plan upgrades of infrastructure and Pods to match the version
* Have canary deployments whenever possible
Following the best practices in this section and using device plugins and device
driver installers from trusted and reliable sources generally eliminate this
class of failures. Kubernetes is tracking work to make this space even better.
### Failure modes: device failed
There is very little handling of device failure in Kubernetes today. Device
plugins report the device failure only by changing the count of allocatable
devices. And Kubernetes relies on standard mechanisms like liveness probes or
container failures to allow Pods to communicate the failure condition to the
kubelet. However, Kubernetes does not correlate device failures with container
crashes and does not offer any mitigation beyond restarting the container while
being attached to the same device.
This is why many plugins and DIY solutions exist to handle device failures based
on various signals.
#### Health controller
In many cases a failed device will result in unrecoverable and very expensive
nodes doing nothing. A simple DIY solution is a _node health controller_. The
controller could compare the device allocatable count with the capacity and if
the capacity is greater, it starts a timer. Once the timer reaches a threshold,
the health controller kills and recreates the node.
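A minimal sketch of that capacity-versus-allocatable check with client-go might look like the following; the device resource name, the 10-minute threshold, and the poll interval are illustrative assumptions, and the actual node recreation is left as a log line:
```go
package main

import (
	"context"
	"log"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// deviceResource is an illustrative name; use your device plugin's resource name.
const deviceResource = corev1.ResourceName("example.com/device")

// failedDevices reports how many devices are present on the node but not allocatable.
func failedDevices(node *corev1.Node) int64 {
	capacity := node.Status.Capacity[deviceResource]
	allocatable := node.Status.Allocatable[deviceResource]
	return capacity.Value() - allocatable.Value()
}

func main() {
	config, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal(err)
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		log.Fatal(err)
	}

	firstSeen := map[string]time.Time{} // node name -> when failed devices were first observed

	for {
		nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
		if err != nil {
			log.Printf("listing nodes: %v", err)
			time.Sleep(time.Minute)
			continue
		}
		for _, node := range nodes.Items {
			if failedDevices(&node) > 0 {
				since, seen := firstSeen[node.Name]
				if !seen {
					firstSeen[node.Name] = time.Now()
				} else if time.Since(since) > 10*time.Minute { // illustrative threshold
					// A real controller would drain and recreate the node here.
					log.Printf("node %s has had failed devices for over 10m", node.Name)
				}
			} else {
				delete(firstSeen, node.Name)
			}
		}
		time.Sleep(time.Minute)
	}
}
```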
There are problems with the _health controller_ approach:
* Root cause of the device failure is typically not known
* The controller is not workload aware
* Failed device might not be in use and you want to keep other devices running
* The detection may be too slow as it is very generic
* The node may be part of a bigger set of nodes and simply cannot be deleted in
isolation without other nodes
There are variations of the health controller solving some of the problems
above. The overall theme here though is that to best handle failed devices, you
need customized handling for the specific workload. Kubernetes doesn't yet offer
enough abstraction to express how critical the device is for a node, for the
cluster, and for the Pod it is assigned to.
#### Pod failure policy
Another DIY approach for device failure handling is a per-pod reaction on a
failed device. This approach is applicable for *training* workloads that are
implemented as Jobs.
A Pod can define special error codes for device failures. For example, whenever
unexpected device behavior is encountered, the Pod exits with a special exit code.
Then the Pod failure policy can handle the device failure in a special way. Read
more on [Handling retriable and non-retriable pod failures with Pod failure
policy](/docs/concepts/workloads/controllers/job/#pod-failure-policy)
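For illustration, a Job wired up this way might look like the sketch below; the container image and the exit code `42` chosen to signal a device failure are assumptions made for the example:
```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: training-job
spec:
  backoffLimit: 6
  podFailurePolicy:
    rules:
    - action: FailJob          # give up instead of retrying on the same (failed) device
      onExitCodes:
        containerName: trainer
        operator: In
        values: [42]           # exit code the app uses to signal a device failure (assumption)
  template:
    spec:
      restartPolicy: Never     # required for pod failure policy
      containers:
      - name: trainer
        image: registry.example/trainer:latest   # placeholder image
        command: ["/train"]
```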
There are some problems with the _Pod failure policy_ approach for Jobs:
* There is no well-known _device failed_ condition, so this approach does not work for the
generic Pod case
* Error codes must be coded carefully and in some cases are hard to guarantee.
* Only works with Jobs with `restartPolicy: Never`, due to the limitation of a pod
failure policy feature.
So, this solution has limited applicability.
#### Custom pod watcher
A slightly more generic approach is to implement the Pod watcher as a DIY solution
or use some third-party tools offering this functionality. The pod watcher is
most often used to handle device failures for inference workloads.
Since Kubernetes just keeps a pod assigned to a device, even if the device is
reportedly unhealthy, the idea is to detect this situation with the pod watcher
and apply some remediation. It often involves obtaining device health status and
its mapping to the Pod using Pod Resources API on the node. If a device fails,
it can then delete the attached Pod as a remediation. The replica set will
handle the Pod recreation on a healthy device.
The other reasons to implement this watcher:
* Without it, the Pod will keep being assigned to the failed device forever.
* There is no _descheduling_ for a pod with `restartPolicy=Always`.
* There are no built-in controllers that delete Pods in CrashLoopBackoff.
Problems with the _custom pod watcher_:
* The signal for the pod watcher is expensive to get, and involves some
privileged actions.
* It is a custom solution and it assumes the importance of a device for a Pod.
* The pod watcher relies on external controllers to reschedule a Pod.
There are more variations of DIY solutions for handling device failures or
upcoming maintenance. Overall, Kubernetes has enough extension points to
implement these solutions. However, some extension points require higher
privileges than users may be comfortable with, or are too disruptive. The roadmap
section goes into more detail on specific improvements in handling device
failures.
### Failure modes: container code failed
When the container code fails or something bad happens with it, like out of
memory conditions, Kubernetes knows how to handle those cases. There is either
the restart of a container, or a crash of a Pod if it has `restartPolicy: Never`
and scheduling it on another node. Kubernetes has limited expressiveness on what
is a failure (for example, non-zero exit code or liveness probe failure) and how
to react on such a failure (mostly either Always restart or immediately fail the
Pod).
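As a minimal sketch (the image and probe command are hypothetical), this is
roughly the full extent of what a Pod can express today: what counts as a
failure, and one of three restart reactions:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: training-worker
spec:
  restartPolicy: Always          # the only built-in reactions: Always, OnFailure, Never
  containers:
  - name: worker
    image: registry.example/trainer:latest   # hypothetical image
    livenessProbe:               # a failure signal beyond a non-zero exit code
      exec:
        command: ["/bin/check-health"]       # hypothetical health check command
      periodSeconds: 10
      failureThreshold: 3
```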
This level of expressiveness is often not enough for complicated AI/ML
workloads. AI/ML Pods are better rescheduled locally or even in-place, as that
saves on image pulling time and device allocation. AI/ML Pods are often
interconnected and need to be restarted together. This adds another level of
complexity, and optimizing it often brings major savings in running AI/ML
workloads.
There are various DIY solutions to handle Pod failure orchestration. The most
typical one is to wrap the main executable in a container with some orchestrator.
This orchestrator is then able to restart the main executable whenever the
job needs to be restarted because some other Pod has failed.
Solutions like this are very fragile and elaborate. They are often worth it for the
money saved compared to a regular JobSet delete/recreate cycle when used in
large training jobs. Making these solutions less fragile and more streamlined
by developing new hooks and extension points in Kubernetes will make it
easy to apply to smaller jobs, benefiting everybody.
### Failure modes: device degradation
Not all device failures are terminal for the overall workload or batch job.
As the hardware stack gets more and more
complex, misconfiguration on one of the hardware stack layers, or driver
failures, may result in devices that are functional, but lagging on performance.
One device that is lagging behind can slow down the whole training job.
We see reports of such cases more and more often. Kubernetes has no way to
express this type of failure today, and since it is the newest failure mode,
there is not much in the way of best practices offered by hardware vendors for
detection, nor third-party tooling for remediation of these situations.
Typically, these failures are detected based on observed workload
characteristics. For example, the expected speed of AI/ML training steps on
particular hardware. Remediation for these issues is highly dependent on the workload's needs.
## Roadmap
As outlined in a section above, Kubernetes offers a lot of extension points
which are used to implement various DIY solutions. The space of AI/ML is
developing very fast, with changing requirements and usage patterns. SIG Node is
taking a measured approach, preferring to enable more extension points for
implementing workload-specific scenarios over introducing new semantics to support
specific scenarios. This means prioritizing making information about failures
readily available over implementing automatic remediations for those failures
that might only be suitable for a subset of workloads.
This approach ensures there are no drastic changes to workload handling that
might break existing, well-oiled DIY solutions or the experience of existing,
more traditional workloads.
Many error handling techniques used today work for AI/ML, but are very
expensive. SIG Node will invest in extension points to make them cheaper, with
the understanding that cutting costs for AI/ML is critical.
The following is the set of specific investments we envision for various failure
modes.
### Roadmap for failure modes: K8s infrastructure
The area of Kubernetes infrastructure is the easiest to understand and very
important to make right for the upcoming transition from Device Plugins to DRA.
SIG Node is tracking many work items in this area, most notably the following:
* [integrate kubelet with the systemd watchdog · Issue
#127460](https://github.com/kubernetes/kubernetes/issues/127460)
* [DRA: detect stale DRA plugin sockets · Issue
#128696](https://github.com/kubernetes/kubernetes/issues/128696)
* [Support takeover for devicemanager/device-plugin · Issue
#127803](https://github.com/kubernetes/kubernetes/issues/127803)
* [Kubelet plugin registration reliability · Issue
#127457](https://github.com/kubernetes/kubernetes/issues/127457)
* [Recreate the Device Manager gRPC server if failed · Issue
#128167](https://github.com/kubernetes/kubernetes/issues/128167)
* [Retry pod admission on device plugin grpc failures · Issue
#128043](https://github.com/kubernetes/kubernetes/issues/128043)
Basically, every interaction between Kubernetes components must be made reliable,
via either kubelet improvements or best practices in plugin development
and deployment.
### Roadmap for failure modes: device failed
For device failures, some patterns are already emerging in common scenarios
that Kubernetes can support. However, the very first step is to make information
about failed devices more easily available. This is the focus of the work in
[KEP 4680](https://kep.k8s.io/4680) (Add Resource Health Status to the Pod Status for
Device Plugin and DRA).
Longer-term ideas, which are yet to be tested, include:
* Integrate device failures into Pod Failure Policy.
* Node-local retry policies, enabling pod failure policies for Pods with
restartPolicy=OnFailure and possibly beyond that.
* Ability to _deschedule_ a Pod, including those with `restartPolicy: Always`, so it can
  get a new device allocated.
* Add device health to the ResourceSlice used to represent devices in DRA,
rather than simply withdrawing an unhealthy device from the ResourceSlice.
### Roadmap for failure modes: container code failed
The main improvements to handle container code failures for AI/ML workloads all
target cheaper error handling and recovery. The savings mostly come from reusing
pre-allocated resources as much as possible: from reusing Pods by restarting
containers in-place, to node-local restart of containers instead of rescheduling
whenever possible, to snapshotting support, and to re-scheduling that prioritizes
the same node to save on image pulls.
Consider this scenario: a big training job needs 512 Pods to run, and one of the
Pods fails. This means that all Pods need to be interrupted and synced up to
restart the failed step. The most efficient way to achieve this is generally to
reuse as many Pods as possible by restarting them in-place, while replacing the
failed Pod to clear its error, as demonstrated in this picture:
{{< figure src="inplace-pod-restarts.svg" alt="The picture shows 512 Pods, most of them green with a recycle sign next to them indicating that they can be reused, one Pod drawn in red, and a new green replacement Pod next to it indicating that it needs to be replaced." >}}
It is possible to implement this scenario, but all solutions implementing it are
fragile due to lack of certain extension points in Kubernetes. Adding these
extension points to implement this scenario is on the Kubernetes roadmap.
### Roadmap for failure modes: device degradation
Very little has been done in this area: there is no clear detection signal,
very limited troubleshooting tooling, and no built-in semantics to express a
"degraded" device in Kubernetes. There has been discussion of adding data on
device performance or degradation in the ResourceSlice used by DRA to represent
devices, but it is not yet clearly defined. There are also projects like
[node-healthcheck-operator](https://github.com/medik8s/node-healthcheck-operator)
that can be used for some scenarios.
We expect developments in this area from hardware vendors and cloud providers, and we expect to see mostly DIY
solutions in the near future. As more users get exposed to AI/ML workloads, this
is a space where feedback on the patterns in use will be needed.
## Join the conversation
The Kubernetes community encourages feedback and participation in shaping the
future of device failure handling. Join SIG Node and contribute to the ongoing
discussions!
This blog post provides a high-level overview of the challenges and future
directions for device failure management in Kubernetes. By addressing these
issues, Kubernetes can solidify its position as the leading platform for AI/ML
workloads, ensuring resilience and reliability for applications that depend on
specialized hardware.

File diff suppressed because one or more lines are too long


File diff suppressed because one or more lines are too long


File diff suppressed because one or more lines are too long


View File

@ -1,216 +0,0 @@
---
layout: blog
title: "Post-Quantum Cryptography in Kubernetes"
slug: pqc-in-k8s
date: 2025-07-18
canonicalUrl: https://www.kubernetes.dev/blog/2025/07/18/pqc-in-k8s/
author: "Fabian Kammel (ControlPlane)"
draft: false
---
The world of cryptography is on the cusp of a major shift with the advent of
quantum computing. While powerful quantum computers are still largely
theoretical for many applications, their potential to break current
cryptographic standards is a serious concern, especially for long-lived
systems. This is where _Post-Quantum Cryptography_ (PQC) comes in. In this
article, I'll dive into what PQC means for TLS and, more specifically, for the
Kubernetes ecosystem. I'll explain what the (surprising) state of PQC in
Kubernetes is and what the implications are for current and future clusters.
## What is Post-Quantum Cryptography
Post-Quantum Cryptography refers to cryptographic algorithms that are thought to
be secure against attacks by both classical and quantum computers. The primary
concern is that quantum computers, using algorithms like [Shor's Algorithm],
could efficiently break widely used public-key cryptosystems such as RSA and
Elliptic Curve Cryptography (ECC), which underpin much of today's secure
communication, including TLS. The industry is actively working on standardizing
and adopting PQC algorithms. One of the first to be standardized by [NIST] is
the Module-Lattice Key Encapsulation Mechanism (`ML-KEM`), formerly known as
Kyber, and now standardized as [FIPS-203] (PDF download).
It is difficult to predict when quantum computers will be able to break
classical algorithms. However, it is clear that we need to start migrating to
PQC algorithms now, as the next section shows. To get a feeling for the
predicted timeline, we can look at a [NIST report] covering the transition to
post-quantum cryptography standards. It states that systems using classical
cryptography should be deprecated after 2030 and disallowed after 2035.
## Key exchange vs. digital signatures: different needs, different timelines {#timelines}
In TLS, there are two main cryptographic operations we need to secure:
**Key Exchange**: This is how the client and server agree on a shared secret to
encrypt their communication. If an attacker records encrypted traffic today,
they could decrypt it in the future, if they gain access to a quantum computer
capable of breaking the key exchange. This makes migrating KEMs to PQC an
immediate priority.
**Digital Signatures**: These are primarily used to authenticate the server (and
sometimes the client) via certificates. The authenticity of a server is
verified at the time of connection. While important, the risk of an attack
today is much lower, because the decision of trusting a server cannot be abused
after the fact. Additionally, current PQC signature schemes often come with
significant computational overhead and larger key/signature sizes compared to
their classical counterparts.
Another significant hurdle in the migration to PQ certificates is the upgrade
of root certificates. These certificates have long validity periods and are
installed in many devices and operating systems as trust anchors.
Given these differences, the focus for immediate PQC adoption in TLS has been
on hybrid key exchange mechanisms. These combine a classical algorithm (such as
Elliptic Curve Diffie-Hellman Ephemeral (ECDHE)) with a PQC algorithm (such as
`ML-KEM`). The resulting shared secret is secure as long as at least one of the
component algorithms remains unbroken. The `X25519MLKEM768` hybrid scheme is the
most widely supported one.
## State of PQC key exchange mechanisms (KEMs) today {#state-of-kems}
Support for PQC KEMs is rapidly improving across the ecosystem.
**Go**: The Go standard library's `crypto/tls` package introduced support for
`X25519MLKEM768` in version 1.24 (released February 2025). Crucially, it's
enabled by default when there is no explicit configuration, i.e.,
`Config.CurvePreferences` is `nil`.
**Browsers & OpenSSL**: Major browsers like Chrome (version 131, November 2024)
and Firefox (version 135, February 2025), as well as OpenSSL (version 3.5.0,
April 2025), have also added support for the `ML-KEM` based hybrid scheme.
Apple is also [rolling out support][ApplePQC] for `X25519MLKEM768` in version
26 of their operating systems. Given the proliferation of Apple devices, this
will have a significant impact on the global PQC adoption.
For a more detailed overview of the state of PQC in the wider industry,
see [this blog post by Cloudflare][PQC2024].
## Post-quantum KEMs in Kubernetes: an unexpected arrival
So, what does this mean for Kubernetes? Kubernetes components, including the
API server and kubelet, are built with Go.
As of Kubernetes v1.33, released in April 2025, the project uses Go 1.24. A
quick check of the Kubernetes codebase reveals that `Config.CurvePreferences`
is not explicitly set. This leads to a fascinating conclusion: Kubernetes
v1.33, by virtue of using Go 1.24, supports hybrid post-quantum
`X25519MLKEM768` for TLS connections by default!
You can test this yourself. If you set up a Minikube cluster running Kubernetes
v1.33.0, you can connect to the API server using a recent OpenSSL client:
```console
$ minikube start --kubernetes-version=v1.33.0
$ kubectl cluster-info
Kubernetes control plane is running at https://127.0.0.1:<PORT>
$ kubectl config view --minify --raw -o jsonpath='{.clusters[0].cluster.certificate-authority-data}' | base64 -d > ca.crt
$ openssl version
OpenSSL 3.5.0 8 Apr 2025 (Library: OpenSSL 3.5.0 8 Apr 2025)
$ echo -n "Q" | openssl s_client -connect 127.0.0.1:<PORT> -CAfile ca.crt
[...]
Negotiated TLS1.3 group: X25519MLKEM768
[...]
DONE
```
Lo and behold, the negotiated group is `X25519MLKEM768`! This is a significant
step towards making Kubernetes quantum-safe, seemingly without a major
announcement or dedicated KEP (Kubernetes Enhancement Proposal).
## The Go version mismatch pitfall
An interesting wrinkle emerged with Go versions 1.23 and 1.24. Go 1.23
included experimental support for a draft version of `ML-KEM`, identified as
`X25519Kyber768Draft00`. This was also enabled by default if
`Config.CurvePreferences` was `nil`. Kubernetes v1.32 used Go 1.23. However,
Go 1.24 removed the draft support and replaced it with the standardized version
`X25519MLKEM768`.
What happens if a client and server are using mismatched Go versions (one on
1.23, the other on 1.24)? They won't have a common PQC KEM to negotiate, and
the handshake will fall back to classical ECC curves (e.g., `X25519`). How
could this happen in practice?
Consider a scenario:
A Kubernetes cluster is running v1.32 (using Go 1.23 and thus
`X25519Kyber768Draft00`). A developer upgrades their `kubectl` to v1.33,
compiled with Go 1.24, only supporting `X25519MLKEM768`. Now, when `kubectl`
communicates with the v1.32 API server, they no longer share a common PQC
algorithm. The connection will downgrade to classical cryptography, silently
losing the PQC protection that has been in place. This highlights the
importance of understanding the implications of Go version upgrades, and the
details of the TLS stack.
## Limitations: packet size {#limitation-packet-size}
One practical consideration with `ML-KEM` is the size of its public keys:
the encoded key is around 1.2 kilobytes for `ML-KEM-768`.
This can cause the initial TLS `ClientHello` message not to fit inside
a single TCP/IP packet, given the typical networking constraints
(most commonly, the standard Ethernet frame size limit of 1500
bytes). Some TLS libraries or network appliances might not handle this
gracefully, assuming the Client Hello always fits in one packet. This issue
has been observed in some Kubernetes-related projects and networking
components, potentially leading to connection failures when PQC KEMs are used.
More details can be found at [tldr.fail].
## State of Post-Quantum Signatures
While KEMs are seeing broader adoption, PQC digital signatures are further
behind in terms of widespread integration into standard toolchains. NIST has
published standards for PQC signatures, such as `ML-DSA` (`FIPS-204`) and
`SLH-DSA` (`FIPS-205`). However, implementing these in a way that's broadly
usable (e.g., for PQC Certificate Authorities) [presents challenges]:
**Larger Keys and Signatures**: PQC signature schemes often have significantly
larger public keys and signature sizes compared to classical algorithms like
Ed25519 or RSA. For instance, Dilithium2 keys can be 30 times larger than
Ed25519 keys, and certificates can be 12 times larger.
**Performance**: Signing and verification operations [can be substantially slower].
While some algorithms are on par with classical algorithms, others may have a
much higher overhead, sometimes on the order of 10x to 1000x worse performance.
To improve this situation, NIST is running a
[second round of standardization][NIST2ndRound] for PQC signatures.
**Toolchain Support**: Mainstream TLS libraries and CA software do not yet have
mature, built-in support for these new signature algorithms. The Go team, for
example, has indicated that `ML-DSA` support is a high priority, but the
soonest it might appear in the standard library is Go 1.26 [(as of May 2025)].
[Cloudflare's CIRCL] (Cloudflare Interoperable Reusable Cryptographic Library)
library implements some PQC signature schemes like variants of Dilithium, and
they maintain a [fork of Go (cfgo)] that integrates CIRCL. Using `cfgo`, it's
possible to experiment with generating certificates signed with PQC algorithms
like Ed25519-Dilithium2. However, this requires using a custom Go toolchain and
is not yet part of the mainstream Kubernetes or Go distributions.
## Conclusion
The journey to a post-quantum secure Kubernetes is underway, and perhaps
further along than many realize, thanks to the proactive adoption of `ML-KEM`
in Go. With Kubernetes v1.33, users are already benefiting from hybrid post-quantum key
exchange in many TLS connections by default.
However, awareness of potential pitfalls, such as Go version mismatches leading
to downgrades and issues with Client Hello packet sizes, is crucial. While PQC
for KEMs is becoming a reality, PQC for digital signatures and certificate
hierarchies is still in earlier stages of development and adoption for
mainstream use. As Kubernetes maintainers and contributors, staying informed
about these developments will be key to ensuring the long-term security of the
platform.
[Shor's Algorithm]: https://en.wikipedia.org/wiki/Shor%27s_algorithm
[NIST]: https://www.nist.gov/
[FIPS-203]: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.203.pdf
[NIST report]: https://nvlpubs.nist.gov/nistpubs/ir/2024/NIST.IR.8547.ipd.pdf
[tldr.fail]: https://tldr.fail/
[presents challenges]: https://blog.cloudflare.com/another-look-at-pq-signatures/#the-algorithms
[can be substantially slower]: https://pqshield.github.io/nist-sigs-zoo/
[(as of May 2025)]: https://github.com/golang/go/issues/64537#issuecomment-2877714729
[Cloudflare's CIRCL]: https://github.com/cloudflare/circl
[fork of Go (cfgo)]: https://github.com/cloudflare/go
[PQC2024]: https://blog.cloudflare.com/pq-2024/
[NIST2ndRound]: https://csrc.nist.gov/news/2024/pqc-digital-signature-second-round-announcement
[ApplePQC]: https://support.apple.com/en-lb/122756

View File

@ -1,8 +1,9 @@
---
layout: blog
title: "Kubernetes v1.33: Continuing the transition from Endpoints to EndpointSlices"
title: "Continuing the transition from Endpoints to EndpointSlices"
slug: endpoints-deprecation
date: 2025-04-24T10:30:00-08:00
date: 2025-XX-XX
draft: true
author: >
Dan Winship (Red Hat)
---
@ -21,7 +22,7 @@ As of Kubernetes 1.33, the Endpoints API is now officially deprecated,
and the API server will return warnings to users who read or write
Endpoints resources rather than using EndpointSlices.
Eventually, the plan (as documented in [KEP-4974]) is to change the
Eventually, the plan (as documented in [KEP-4794]) is to change the
[Kubernetes Conformance] criteria to no longer require that clusters
run the _Endpoints controller_ (which generates Endpoints objects
based on Services and Pods), to avoid doing work that is unneeded in

View File

@ -1,9 +1,14 @@
---
layout: blog
title: "Kubernetes v1.33: HorizontalPodAutoscaler Configurable Tolerance"
slug: kubernetes-v1-33-hpa-configurable-tolerance
slug: kubernetes-1-33-hpa-configurable-tolerance
# after the v1.33 release, set a future publication date and remove the draft marker
# the release comms team can confirm which date has been assigned
#
# PRs to remove the draft marker should be opened BEFORE release day
draft: true
math: true # for formulae
date: 2025-04-28T10:30:00-08:00
date: XXXX-XX-XX
author: "Jean-Marc François (Google)"
---
@ -18,7 +23,7 @@ automatically resize by adding or removing replicas based on resource
utilization.
Let's say you have a web application running in a Kubernetes cluster with 50
replicas. You configure the HorizontalPodAutoscaler (HPA) to scale based on
replicas. You configure the Horizontal Pod Autoscaler (HPA) to scale based on
CPU utilization, with a target of 75% utilization. Now, imagine that the current
CPU utilization across all replicas is 90%, which is higher than the desired
75%. The HPA will calculate the required number of replicas using the formula:

View File

@ -1,12 +1,13 @@
---
layout: blog
title: "Kubernetes v1.33: In-Place Pod Resize Graduated to Beta"
slug: kubernetes-v1-33-in-place-pod-resize-beta
date: 2025-05-16T10:30:00-08:00
title: "In-Place Pod Resize Graduating to Beta"
slug: in-place-pod-resize-beta
draft: true
date: XXXX-XX-XX
author: "Tim Allclair (Google)"
---
On behalf of the Kubernetes project, I am excited to announce that the **in-place Pod resize** feature (also known as In-Place Pod Vertical Scaling), first introduced as alpha in Kubernetes v1.27, has graduated to **Beta** and will be enabled by default in the Kubernetes v1.33 release! This marks a significant milestone in making resource management for Kubernetes workloads more flexible and less disruptive.
On behalf of the Kubernetes project, I am excited to announce that the **in-place Pod resize** feature (also known as In-Place Pod Vertical Scaling), first introduced as alpha in Kubernetes v1.27, is graduating to **Beta** and will be enabled by default in the Kubernetes v1.33 release! This marks a significant milestone in making resource management for Kubernetes workloads more flexible and less disruptive.
## What is in-place Pod resize?

View File

@ -1,23 +0,0 @@
---
title: Careers
bigheader: Careers in Kubernetes
abstract: Jobs focused on Kubernetes and Cloud Native Patterns
class: gridPage
cid: careers
body_class: careers
menu:
main:
weight: 70
---
<div class="d-flex flex-column justify-content-center mt-4 mt-md-5 px-2 px-md-3 px-lg-0">
<iframe id="gitjobs" class="mx-auto" src="https://gitjobs.dev/embed?ts_query=kubernetes" style="width:100%;max-width:870px;height:100%;display:block;border:none;"></iframe>
<div class="mb-4 mb-md-5 mt-1 mx-auto gitjobs-legend">
Powered by <a href="https://gitjobs.dev" target="_blank">GitJobs</a>
</div>
</div>
<script type="module">
import { initialize } from "https://cdn.jsdelivr.net/npm/@open-iframe-resizer/core@latest/dist/index.js";
initialize({}, "#gitjobs");
</script>

View File

@ -27,7 +27,7 @@ case_study_details:
<p>"Every single product, every decision we make at Ancestry, focuses on delighting our customers with intimate, sometimes life-changing discoveries about themselves and their families," says MacKay. "As the company continues to grow, the increased productivity gains from using Kubernetes has helped Ancestry make customer discoveries faster. With the move to Dockerization for example, instead of taking between 20 to 50 minutes to deploy a new piece of code, we can now deploy in under a minute for much of our code. We've truly experienced significant time savings in addition to the various features and benefits from cloud native and Kubernetes-type technologies."</p>
{{< case-studies/quote author="PAUL MACKAY, SOFTWARE ENGINEER AND ARCHITECT AT ANCESTRY" >}}
"At a certain point, you have to step back if you're going to push a new technology and get key thought leaders with engineers within the organization to become your champions for new technology adoption. At training sessions, the development teams were always the ones that were saying, 'Kubernetes saved our time tremendously; it's an enabler. It really is incredible.'"
"At a certain point, you have to step back if you're going to push a new technology and get key thought leaders with engineers within the organization to become your champions for new technology adoption. At training sessions, the development teams were always the ones that were saying, 'Kubernetes saved our time tremendously; it's an enabler. It really is incredible.'"
{{< /case-studies/quote >}}
{{< case-studies/lead >}}
@ -48,7 +48,7 @@ It started with a Shaky Leaf.
<p>That need led them in 2015 to explore containerization. Ancestry engineers had already been using technology like <a href="https://www.java.com/en/">Java</a> and <a href="https://www.python.org">Python</a> on Linux, so part of the decision was about making the infrastructure more Linux-friendly. They quickly decided that they wanted to go with Docker for containerization, "but it always comes down to the orchestration part of it to make it really work," says MacKay.</p>
<p>His team looked at orchestration platforms offered by <a href="https://docs.docker.com/compose/">Docker Compose</a>, <a href="https://mesos.apache.org">Mesos</a> and <a href="https://www.openstack.org/software/">OpenStack</a>, and even started to prototype some homegrown solutions. And then they started hearing rumblings of the imminent release of Kubernetes v1.0. "At the forefront, we were looking at the secret store, so we didn't have to manage that all ourselves, the config maps, the methodology of seamless deployment strategy," he says. "We found that how Kubernetes had done their resources, their types, their labels and just their interface was so much further advanced than the other things we had seen. It was a feature fit."</p>
<p>His team looked at orchestration platforms offered by <a href="https://docs.docker.com/compose/">Docker Compose</a>, <a href="http://mesos.apache.org">Mesos</a> and <a href="https://www.openstack.org/software/">OpenStack</a>, and even started to prototype some homegrown solutions. And then they started hearing rumblings of the imminent release of Kubernetes v1.0. "At the forefront, we were looking at the secret store, so we didn't have to manage that all ourselves, the config maps, the methodology of seamless deployment strategy," he says. "We found that how Kubernetes had done their resources, their types, their labels and just their interface was so much further advanced than the other things we had seen. It was a feature fit."</p>
{{< case-studies/lead >}}
Plus, MacKay says, "I just believed in the confidence that comes with the history that Google has with containerization. So we started out right on the leading edge of it. And we haven't looked back since."

View File

@ -42,9 +42,9 @@ With its end-to-end commerce platform for cloud-based products and services, <a
<p>When Director of Software Development Pierre-Alexandre Lacerte started working there in 2014, the company had a monolith application deployed on a "tomcat infrastructure, and the whole release process was complex for what it should be," he says. "There were a lot of manual steps involved, with one engineer building a feature then creating a pull request, and a QA or another engineer validating the feature. Then it gets merged and someone else will take care of the deployment. So we had bottlenecks in the pipeline to ship a feature to production."</p>
<p>At the same time, the engineering team of 40 was growing, and the company wanted to add an increasing number of features to its products. As a member of the platform team, Lacerte began hearing from multiple teams that wanted to deploy applications using different frameworks and languages, from <a href="https://nodejs.org/">Node.js</a> to <a href="https://spring.io/projects/spring-boot">Spring Boot Java</a>. He soon realized that in order to both support growth and increase velocity, the company needed a better infrastructure, and a system in which teams are autonomous, can do their own deploys, and be responsible for their services in production.</p>
<p>At the same time, the engineering team of 40 was growing, and the company wanted to add an increasing number of features to its products. As a member of the platform team, Lacerte began hearing from multiple teams that wanted to deploy applications using different frameworks and languages, from <a href="https://nodejs.org/">Node.js</a> to <a href="http://spring.io/projects/spring-boot">Spring Boot Java</a>. He soon realized that in order to both support growth and increase velocity, the company needed a better infrastructure, and a system in which teams are autonomous, can do their own deploys, and be responsible for their services in production.</p>
{{< case-studies/quote
{{< case-studies/quote
image="/images/case-studies/appdirect/banner3.jpg"
author="Alexandre Gervais, Staff Software Developer, AppDirect"
>}}
@ -61,7 +61,7 @@ With its end-to-end commerce platform for cloud-based products and services, <a
<p>Lacerte's strategy ultimately worked because of the very real impact the Kubernetes platform has had to deployment time. Due to less dependency on custom-made, brittle shell scripts with SCP commands, time to deploy a new version has shrunk from 4 hours to a few minutes. Additionally, the company invested a lot of effort to make things self-service for developers. "Onboarding a new service doesn't require <a href="https://www.atlassian.com/software/jira">Jira</a> tickets or meeting with three different teams," says Lacerte. Today, the company sees 1,600 deployments per week, compared to 1-30 before.</p>
{{< case-studies/quote
{{< case-studies/quote
image="/images/case-studies/appdirect/banner4.jpg"
author="Pierre-Alexandre Lacerte, Director of Software Development, AppDirect"
>}}

View File

@ -20,7 +20,7 @@ case_study_details:
<h2>Solution</h2>
<p>Opting not to shift to cloud virtualization or use a private cloud on their own servers, the BlaBlaCar team became early adopters of containerization, using the CoreOs runtime <a href="https://coreos.com/rkt">rkt</a>, initially deployed using <a href="https://coreos.com/fleet/docs/latest/launching-containers-fleet.html">fleet</a> cluster manager. Last year, the company switched to <a href="https://kubernetes.io/">Kubernetes</a> orchestration, and now also uses <a href="https://prometheus.io/">Prometheus</a> for monitoring.</p>
<p>Opting not to shift to cloud virtualization or use a private cloud on their own servers, the BlaBlaCar team became early adopters of containerization, using the CoreOs runtime <a href="https://coreos.com/rkt">rkt</a>, initially deployed using <a href="https://coreos.com/fleet/docs/latest/launching-containers-fleet.html">fleet</a> cluster manager. Last year, the company switched to <a href="http://kubernetes.io/">Kubernetes</a> orchestration, and now also uses <a href="https://prometheus.io/">Prometheus</a> for monitoring.</p>
<h2>Impact</h2>

View File

@ -20,7 +20,7 @@ case_study_details:
<h2>Solution</h2>
<p>Turning to microservices and containerization, GolfNow began moving its applications and databases from third-party services to its own clusters running on <a href="https://www.docker.com/">Docker</a> and <a href="https://kubernetes.io/">Kubernetes.</a></p>
<p>Turning to microservices and containerization, GolfNow began moving its applications and databases from third-party services to its own clusters running on <a href="https://www.docker.com/">Docker</a> and <a href="http://kubernetes.io/">Kubernetes.</a></p>
<h2>Impact</h2>
@ -50,7 +50,7 @@ It's not every day that you can say you've slashed an operating expense by half.
<p>GolfNow's dev team ran an "internal, low-key" proof of concept and were won over. "We really liked how easy it was to be able to pass containers around to each other and have them up and running in no time, exactly the way it was running on my machine," says Sheriff. "Because that is always the biggest gripe that Ops has with developers, right? 'It worked on my machine!' But then we started getting to the point of, 'How do we make sure that these things stay up and running?'"</p>
<p>That led the team on a quest to find the right orchestration system for the company's needs. Sheriff says the first few options they tried were either too heavy or "didn't feel quite right." In late summer 2015, they discovered the just-released <a href="https://kubernetes.io/">Kubernetes</a>, which Sheriff immediately liked for its ease of use. "We did another proof of concept," he says, "and Kubernetes won because of the fact that the community backing was there, built on top of what Google had already done."</p>
<p>That led the team on a quest to find the right orchestration system for the company's needs. Sheriff says the first few options they tried were either too heavy or "didn't feel quite right." In late summer 2015, they discovered the just-released <a href="http://kubernetes.io/">Kubernetes</a>, which Sheriff immediately liked for its ease of use. "We did another proof of concept," he says, "and Kubernetes won because of the fact that the community backing was there, built on top of what Google had already done."</p>
<p>But before they could go with Kubernetes, <a href="http://www.nbc.com/">NBC</a>, GolfNow's parent company, also asked them to comparison shop with another company. Sheriff and his team liked the competing company's platform user interface, but didn't like that its platform would not allow containers to run natively on Docker. With no clear decision in sight, Sheriff's VP at GolfNow, Steve McElwee, set up a three-month trial during which a GolfNow team (consisting of Sheriff and Josh, who's now Lead Architect, Open Platforms) would build out a Kubernetes environment, and a large NBC team would build out one with the other company's platform.</p>

View File

@ -53,9 +53,6 @@ menu:
<div class="community-nav-item">
<a href="/releases">Releases</a>
</div>
<div class="community-nav-item">
<a href="/case-studies">Case Studies</a>
</div>
</div>
<div class="community-section" id="gallery">
@ -127,12 +124,12 @@ menu:
troubleshooting, and so much&nbsp;more.</p>
</div>
<div id="bluesky" class="community-resource">
<a href="https://bsky.app/profile/kubernetes.io">
<img src="/images/community/bluesky.png" alt="Bluesky">
<div id="twitter" class="community-resource">
<a href="https://twitter.com/kubernetesio">
<img src="/images/community/x-org.png" alt="𝕏.org">
</a>
<a href="https://bsky.app/profile/kubernetes.io">Bluesky&nbsp;&#9654;</a>
<p><em>@kubernetes.io</em></p>
<a href="https://twitter.com/kubernetesio">𝕏&nbsp;&#9654;</a>
<p><em>#kubernetesio</em></p>
<p>Real-time announcements of blog posts, events, news, ideas.</p>
</div>
@ -162,15 +159,6 @@ menu:
Visit <a href="https://slack.k8s.io/">https://slack.k8s.io/</a>
for an invitation.</details>
</div>
<div id="twitter" class="community-resource">
<a href="https://x.com/kubernetesio">
<img src="/images/community/x-org.png" alt="X">
</a>
<a href="https://x.com/kubernetesio">𝕏&nbsp;&#9654;</a>
<p><em>@kubernetesio</em></p>
<p>Real-time announcements of blog posts, events, news, ideas.</p>
</div>
</div>
</div>

View File

@ -26,22 +26,25 @@ each Node in your cluster, so that the
The kubelet acts as a client when connecting to the container runtime via gRPC.
The runtime and image service endpoints have to be available in the container
runtime, which can be configured separately within the kubelet by using the
`--container-runtime-endpoint`
[command line flag](/docs/reference/command-line-tools-reference/kubelet/).
`--image-service-endpoint` [command line flags](/docs/reference/command-line-tools-reference/kubelet).
For Kubernetes v1.26 and later, the kubelet requires that the container runtime
supports the `v1` CRI API. If a container runtime does not support the `v1` API,
the kubelet will not register the node.
For Kubernetes v{{< skew currentVersion >}}, the kubelet prefers to use CRI `v1`.
If a container runtime does not support `v1` of the CRI, then the kubelet tries to
negotiate any older supported version.
The v{{< skew currentVersion >}} kubelet can also negotiate CRI `v1alpha2`, but
this version is considered as deprecated.
If the kubelet cannot negotiate a supported CRI version, the kubelet gives up
and doesn't register as a node.
## Upgrading
When upgrading the Kubernetes version on a node, the kubelet restarts. If the
container runtime does not support the `v1` CRI API, the kubelet will fail to
register and report an error. If a gRPC re-dial is required because the container
runtime has been upgraded, the runtime must support the `v1` CRI API for the
connection to succeed. This might require a restart of the kubelet after the
container runtime is correctly configured.
When upgrading Kubernetes, the kubelet tries to automatically select the
latest CRI version on restart of the component. If that fails, then the fallback
will take place as mentioned above. If a gRPC re-dial was required because the
container runtime has been upgraded, then the container runtime must also
support the initially selected version or the redial is expected to fail. This
requires a restart of the kubelet.
## {{% heading "whatsnext" %}}
- Learn more about the CRI [protocol definition](https://github.com/kubernetes/cri-api/blob/v0.33.1/pkg/apis/runtime/v1/api.proto)
- Learn more about the CRI [protocol definition](https://github.com/kubernetes/cri-api/blob/c75ef5b/pkg/apis/runtime/v1/api.proto)

View File

@ -296,6 +296,63 @@ the kubelet can use topology hints when making resource assignment decisions.
See [Control Topology Management Policies on a Node](/docs/tasks/administer-cluster/topology-manager/)
for more information.
## Swap memory management {#swap-memory}
{{< feature-state feature_gate_name="NodeSwap" >}}
To enable swap on a node, the `NodeSwap` feature gate must be enabled on
the kubelet (default is true), and the `--fail-swap-on` command line flag or `failSwapOn`
[configuration setting](/docs/reference/config-api/kubelet-config.v1beta1/)
must be set to false.
To allow Pods to utilize swap, `swapBehavior` should not be set to `NoSwap` (which is the default behavior) in the kubelet config.
{{< warning >}}
When the memory swap feature is turned on, Kubernetes data such as the content
of Secret objects that were written to tmpfs could now be swapped to disk.
{{< /warning >}}
A user can also optionally configure `memorySwap.swapBehavior` in order to
specify how a node will use swap memory. For example,
```yaml
memorySwap:
swapBehavior: LimitedSwap
```
- `NoSwap` (default): Kubernetes workloads will not use swap.
- `LimitedSwap`: The utilization of swap memory by Kubernetes workloads is subject to limitations.
Only Pods of Burstable QoS are permitted to employ swap.
If configuration for `memorySwap` is not specified and the feature gate is
enabled, by default the kubelet will apply the same behaviour as the
`NoSwap` setting.
With `LimitedSwap`, Pods that do not fall under the Burstable QoS classification (that is,
`BestEffort`/`Guaranteed` QoS Pods) are prohibited from utilizing swap memory.
To maintain the aforementioned security and node health guarantees, these Pods
are not permitted to use swap memory when `LimitedSwap` is in effect.
Prior to detailing the calculation of the swap limit, it is necessary to define the following terms:
* `nodeTotalMemory`: The total amount of physical memory available on the node.
* `totalPodsSwapAvailable`: The total amount of swap memory on the node that is available for use by Pods
(some swap memory may be reserved for system use).
* `containerMemoryRequest`: The container's memory request.
Swap limitation is configured as:
`(containerMemoryRequest / nodeTotalMemory) * totalPodsSwapAvailable`.
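For example, a container that requests 4 GiB of memory on a node with 32 GiB of
physical memory and 16 GiB of swap available to Pods would get a swap limit of
`(4Gi / 32Gi) * 16Gi = 2Gi`.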
It is important to note that, for containers within Burstable QoS Pods, it is possible to
opt-out of swap usage by specifying memory requests that are equal to memory limits.
Containers configured in this manner will not have access to swap memory.
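As a minimal sketch (the image and values are illustrative), such an opt-out
looks like this for a container in a Burstable Pod:

```yaml
# Pod spec fragment: the memory request equals the memory limit, so this
# container gets no swap access even under LimitedSwap; the CPU request
# without a CPU limit keeps the Pod in the Burstable QoS class.
containers:
- name: no-swap-app
  image: registry.k8s.io/pause:3.9
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      memory: 512Mi
```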
Swap is supported only with **cgroup v2**, cgroup v1 is not supported.
For more information, and to assist with testing and provide feedback, please
see the blog post [Kubernetes 1.28: NodeSwap graduates to Beta1](/blog/2023/08/24/swap-linux-beta/),
[KEP-2400](https://github.com/kubernetes/enhancements/issues/4128) and its
[design proposal](https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md).
## {{% heading "whatsnext" %}}
Learn more about the following:

View File

@ -1,14 +1,7 @@
---
title: Kubernetes Self-Healing
content_type: concept
weight: 50
feature:
title: Self-healing
anchor: Automated recovery from damage
description: >
Kubernetes restarts containers that crash, replaces entire Pods where needed,
reattaches storage in response to wider failures, and can integrate with
node autoscalers to self-heal even at the node level.
Weight: 50
---
<!-- overview -->

View File

@ -82,9 +82,6 @@ installation instructions. The list does not try to be exhaustive.
* [Spiderpool](https://github.com/spidernet-io/spiderpool) is an underlay and RDMA
networking solution for Kubernetes. Spiderpool is supported on bare metal, virtual machines,
and public cloud environments.
* [Terway](https://github.com/AliyunContainerService/terway/) is a suite of CNI plugins
based on AlibabaCloud's VPC and ECS network products. It provides native VPC networking
and network policies in AlibabaCloud environments.
* [Weave Net](https://github.com/rajch/weave#using-weave-on-kubernetes)
provides networking and network policy, will carry on working on both sides
of a network partition, and does not require an external database.

View File

@ -1,402 +0,0 @@
---
title: Swap memory management
content_type: concept
weight: 10
---
<!-- overview -->
Kubernetes can be configured to use swap memory on a {{< glossary_tooltip text="node" term_id="node" >}},
allowing the kernel to free up physical memory by swapping out pages to backing storage.
This is useful for multiple use-cases.
For example, nodes running workloads that can benefit from using swap,
such as those that have large memory footprints but only access a portion of that memory at any given time.
It also helps prevent Pods from being terminated during memory pressure spikes,
shields nodes from system-level memory spikes that might compromise their stability,
allows for more flexible memory management on the node, and much more.
<!-- body -->
## How to use it?
### Prerequisites
- Swap must be enabled and provisioned on the node.
- The node must run a Linux operating system.
- The node must use cgroup v2. Kubernetes does not support swap on cgroup v1 nodes.
## Enabling swap for Kubernetes Workloads
To allow Kubernetes workloads to use swap,
you must disable the kubelet's default behavior of failing when swap is detected,
and specify memory-swap behavior as `LimitedSwap`:
**Update kubelet configuration:**
```yaml
# this fragment goes into the kubelet's configuration file
failSwapOn: false
memorySwap:
swapBehavior: LimitedSwap
```
The available choices for `swapBehavior` are:
- `NoSwap` (default): Kubernetes workloads cannot use swap. However, processes
outside of Kubernetes' scope, like system daemons (such as kubelet itself!) can utilize swap.
This behavior is beneficial for protecting the node from system-level memory spikes,
but it does not safeguard the workloads themselves from such spikes.
- `LimitedSwap`: Kubernetes workloads can utilize swap memory.
The amount of swap available to a Pod is determined automatically.
For more details, see the [section below](#how-is-the-swap-limit-being-determined-with-limitedswap).
If configuration for `memorySwap` is not specified,
by default the kubelet will apply the same behaviour as the `NoSwap` setting.
Bear in mind that the following pods would be excluded from swap access
(see more info in the [section below](#how-is-the-swap-limit-being-determined-with-limitedswap)):
- Pods that are not classified as Burstable QoS.
- High-priority Pods.
- Containers with a memory limit equal to their memory request.
{{< note >}}
Kubernetes only supports swap for Linux nodes.
{{< /note >}}
## How does it work?
There are a number of possible ways that one could envision swap use on a node.
If the kubelet is already running on a node, it needs to be restarted after swap is provisioned in order to detect it.
When kubelet starts on a node in which swap is provisioned and available
(with the `failSwapOn: false` configuration), kubelet will:
- Be able to start on this swap-enabled node.
- Direct the Container Runtime Interface (CRI) implementation, often referred to as the container runtime,
to allocate zero swap memory to Kubernetes workloads by default.
Swap configuration on a node is exposed to a cluster admin via the
[`memorySwap` in the KubeletConfiguration](/docs/reference/config-api/kubelet-config.v1).
As a cluster administrator, you can specify the node's behaviour in the
presence of swap memory by setting `memorySwap.swapBehavior`.
The kubelet uses the container runtime API, and directs the container runtime to
apply specific configuration (for example, in the cgroup v2 case, `memory.swap.max`) in a manner that will
enable the desired swap configuration for a container. For runtimes that use control groups, or cgroups,
the container runtime is then responsible for writing these settings to the container-level cgroup.
## Observability for swap use
### Node and container level metric statistics
The kubelet collects node- and container-level metric statistics,
which can be accessed via the `/metrics/resource` kubelet HTTP endpoint (used mainly by
monitoring tools like Prometheus) and the `/stats/summary` endpoint (used mainly by autoscalers).
This allows clients that can query the kubelet directly to
monitor swap usage and remaining swap memory when using `LimitedSwap`.
Additionally, a `machine_swap_bytes` metric has been added to cadvisor to show
the total physical swap capacity of the machine.
See [this page](/docs/reference/instrumentation/node-metrics/) for more info.
For example, these `/metrics/resource` are supported:
- `node_swap_usage_bytes`: Current swap usage of the node in bytes.
- `container_swap_usage_bytes`: Current amount of the container swap usage in bytes.
- `container_swap_limit_bytes`: Current amount of the container swap limit in bytes.
### Using `kubectl top --show-swap`
Querying metrics is valuable, but somewhat cumbersome, as these metrics
are designed to be used by software rather than humans.
In order to consume this data in a more user-friendly way,
the `kubectl top` command has been extended to support swap metrics, using the `--show-swap` flag.
In order to receive information about swap usage on nodes, `kubectl top nodes --show-swap` can be used:
```shell
kubectl top nodes --show-swap
```
This will result in an output similar to:
```
NAME CPU(cores) CPU(%) MEMORY(bytes) MEMORY(%) SWAP(bytes) SWAP(%)
node1 1m 10% 2Mi 10% 1Mi 0%
node2 5m 10% 6Mi 10% 2Mi 0%
node3 3m 10% 4Mi 10% <unknown> <unknown>
```
In order to receive information about swap usage by Pods, `kubectl top pod --show-swap` can be used:
```shell
kubectl top pod -n kube-system --show-swap
```
This will result in an output similar to:
```
NAME CPU(cores) MEMORY(bytes) SWAP(bytes)
coredns-58d5bc5cdb-5nbk4 2m 19Mi 0Mi
coredns-58d5bc5cdb-jsh26 3m 37Mi 0Mi
etcd-node01 51m 143Mi 5Mi
kube-apiserver-node01 98m 824Mi 16Mi
kube-controller-manager-node01 20m 135Mi 9Mi
kube-proxy-ffgs2 1m 24Mi 0Mi
kube-proxy-fhvwx 1m 39Mi 0Mi
kube-scheduler-node01 13m 69Mi 0Mi
metrics-server-8598789fdb-d2kcj 5m 26Mi 0Mi
```
### Nodes report swap capacity as part of node status
A node status field, `node.status.nodeInfo.swap.capacity`, reports the swap capacity of a node.
As an example, the following command can be used to retrieve the swap capacity of the nodes in a cluster:
```shell
kubectl get nodes -o go-template='{{range .items}}{{.metadata.name}}: {{if .status.nodeInfo.swap.capacity}}{{.status.nodeInfo.swap.capacity}}{{else}}<unknown>{{end}}{{"\n"}}{{end}}'
```
This will result in an output similar to:
```
node1: 21474836480
node2: 42949664768
node3: <unknown>
```
{{< note >}}
The `<unknown>` value indicates that the `.status.nodeInfo.swap.capacity` field is not set for that Node.
This probably means that the node does not have swap provisioned, or less likely,
that the kubelet is not able to determine the swap capacity of the node.
{{< /note >}}
### Swap discovery using Node Feature Discovery (NFD) {#node-feature-discovery}
[Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery)
is a Kubernetes addon for detecting hardware features and configuration.
It can be utilized to discover which nodes are provisioned with swap.
As an example, to figure out which nodes are provisioned with swap,
use the following command:
```shell
kubectl get nodes -o jsonpath='{range .items[?(@.metadata.labels.feature\.node\.kubernetes\.io/memory-swap)]}{.metadata.name}{"\t"}{.metadata.labels.feature\.node\.kubernetes\.io/memory-swap}{"\n"}{end}'
```
This will result in an output similar to:
```
k8s-worker1: true
k8s-worker2: true
k8s-worker3: false
```
In this example, swap is provisioned on nodes `k8s-worker1` and `k8s-worker2`, but not on `k8s-worker3`.
## Risks and caveats
{{< caution >}}
It is strongly encouraged to encrypt the swap space.
See the [memory-backed volumes](#memory-backed-volumes) section for more info.
{{< /caution >}}
Having swap available on a system reduces predictability.
While swap can enhance performance by making more RAM available, swapping data
back to memory is a heavy operation, sometimes slower by many orders of magnitude,
which can cause unexpected performance regressions.
Furthermore, swap changes a system's behaviour under memory pressure.
Enabling swap increases the risk of noisy neighbors,
where Pods that frequently use their RAM may cause other Pods to swap.
In addition, since swap allows for greater memory usage for workloads in Kubernetes that cannot be predictably accounted for,
and due to unexpected packing configurations,
the scheduler currently does not account for swap memory usage.
This heightens the risk of noisy neighbors.
The performance of a node with swap memory enabled depends on the underlying physical storage.
When swap memory is in use, performance will be significantly worse in an I/O
operations per second (IOPS) constrained environment, such as a cloud VM with
I/O throttling, when compared to faster storage mediums like solid-state drives
or NVMe.
As swap might cause IO pressure, it is recommended to give a higher IO latency
priority to system critical daemons. See the relevant section in the
[recommended practices](#good-practice-for-using-swap-in-a-kubernetes-cluster) section below.
### Memory-backed volumes
On Linux nodes, memory-backed volumes (such as [`secret`](/docs/concepts/configuration/secret/)
volume mounts, or [`emptyDir`](/docs/concepts/storage/volumes/#emptydir) with `medium: Memory`)
are implemented with a `tmpfs` filesystem.
The contents of such volumes should remain in memory at all times, hence should
not be swapped to disk.
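As a minimal sketch (names are illustrative), such a memory-backed volume is
declared like this:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: memory-backed-volume-example
spec:
  containers:
  - name: app
    image: registry.k8s.io/pause:3.9
    volumeMounts:
    - name: scratch
      mountPath: /scratch
  volumes:
  - name: scratch
    emptyDir:
      medium: Memory   # backed by tmpfs; contents are expected to stay in memory
```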
To ensure the contents of such volumes remain in memory, the `noswap` tmpfs option
is being used.
The Linux kernel officially supports the `noswap` option from version 6.3 (more info
can be found in [Linux Kernel Version Requirements](/docs/reference/node/kernel-version-requirements/#requirements-other)).
However, the different distributions often choose to backport this mount option to older
Linux versions as well.
In order to verify whether the node supports the `noswap` option, the kubelet will do the following:
* If the kernel's version is above 6.3 then the `noswap` option will be assumed to be supported.
* Otherwise, the kubelet will try to mount a dummy tmpfs with the `noswap` option at startup.
  If the kubelet fails with an error indicating an unknown option, `noswap` will be assumed
  to not be supported, hence will not be used.
  A kubelet log entry will be emitted to warn the user that memory-backed volumes might swap to disk.
  If the kubelet succeeds, the dummy tmpfs will be deleted and the `noswap` option will be used.
* If the `noswap` option is not supported, kubelet will emit a warning log entry,
then continue its execution.
See the [section above](#setting-up-encrypted-swap) with an example for setting unencrypted swap.
However, handling encrypted swap is not within the scope of kubelet;
rather, it is a general OS configuration concern and should be addressed at that level.
It is the administrator's responsibility to provision encrypted swap to mitigate this risk.
### Evictions
Configuring memory eviction thresholds for swap-enabled nodes can be tricky.
With swap being disabled, it is reasonable to configure kubelet's eviction thresholds
to be a bit lower than the node's memory capacity.
The rationale is that we want Kubernetes to start evicting Pods before the node runs out of memory
and invokes the Out Of Memory (OOM) killer, since the OOM killer is not Kubernetes-aware,
therefore does not consider things like QoS, pod priority, or other Kubernetes-specific factors.
With swap enabled, the situation is more complex.
In Linux, the `vm.min_free_kbytes` parameter defines the memory threshold for the kernel
to start aggressively reclaiming memory, which includes swapping out pages.
If the kubelet's eviction thresholds are set in a way that eviction would take place
before the kernel starts reclaiming memory, it could lead to workloads never
being able to swap out during node memory pressure.
However, setting the eviction thresholds too high could result in the node running out of memory
and invoking the OOM killer, which is not ideal either.
To address this, it is recommended to set the kubelet's eviction thresholds
to be slightly lower than the `vm.min_free_kbytes` value.
This way, the node can start swapping before kubelet would start evicting Pods,
allowing workloads to swap out unused data and preventing evictions from happening.
On the other hand, since it is just slightly lower, kubelet is likely to start evicting Pods
before the node runs out of memory, thus avoiding the OOM killer.
The value of `vm.min_free_kbytes` can be determined by running the following command on the node:
```shell
cat /proc/sys/vm/min_free_kbytes
```
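As an illustrative sketch (the numbers are assumptions, not recommendations): if
the node reports a `vm.min_free_kbytes` of `524288` (512Mi), the kubelet's hard
eviction threshold for memory could be set just below that value:

```yaml
# fragment of the kubelet's configuration file; 500Mi is an assumed value chosen
# to sit slightly below a hypothetical vm.min_free_kbytes of 524288 kB (512Mi)
evictionHard:
  memory.available: "500Mi"
```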
### Unutilized swap space
Under the `LimitedSwap` behavior, the amount of swap available to a Pod is determined automatically,
based on the proportion of the memory requested relative to the node's total memory
(For more details, see the [section below](#how-is-the-swap-limit-being-determined-with-limitedswap)).
This design means that usually there would be some portion of swap that will remain
restricted for Kubernetes workloads.
For example, since Guaranteed QoS pods are currently not permitted to use swap,
the amount of swap that's proportional to the memory request will remain unused
by Kubernetes workloads.
This behavior carries some risk in a situation where many pods are not eligible for swapping.
On the other hand, it effectively keeps some system-reserved amount of swap memory that can be used by processes
outside of Kubernetes' scope, such as system daemons and even kubelet itself.
## Good practice for using swap in a Kubernetes cluster
### Disable swap for system-critical daemons
During the testing phase and based on user feedback, it was observed that the performance
of system-critical daemons and services might degrade.
This implies that system daemons, including the kubelet, could operate slower than usual.
If this issue is encountered, it is advisable to configure the cgroup of the system slice
to prevent swapping (i.e., set `memory.swap.max=0`).
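On systemd-managed nodes with cgroup v2, one way to apply this is shown below;
this is a sketch and assumes the kubelet and container runtime run under `system.slice`:

```shell
# Disallow swap for everything in system.slice (sets memory.swap.max=0 for
# that cgroup), so system daemons such as the kubelet never swap.
systemctl set-property system.slice MemorySwapMax=0
```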
### Protect system-critical daemons for I/O latency
Swap can increase the I/O load on a node.
When memory pressure causes the kernel to rapidly swap pages in and out,
system-critical daemons and services that rely on I/O operations may
experience performance degradation.
To mitigate this, it is recommended for systemd users to prioritize the system slice in terms of I/O latency.
For non-systemd users,
setting up a dedicated cgroup for system daemons and processes and prioritizing I/O latency in the same way is advised.
This can be achieved by setting `io.latency` for the system slice,
thereby granting it higher I/O priority.
See [cgroup's documentation](https://www.kernel.org/doc/Documentation/admin-guide/cgroup-v2.rst) for more info.
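As a rough sketch, the raw cgroup v2 interface can be used as follows;
the device major:minor number and the latency target are placeholder assumptions:

```shell
# Set an I/O latency target for system.slice on the block device with
# major:minor 8:0 (placeholders; see the kernel cgroup-v2 documentation for
# the exact format and units before applying this on a real node).
echo "8:0 target=50" > /sys/fs/cgroup/system.slice/io.latency
```

systemd users can achieve a similar effect with the `IODeviceLatencyTargetSec=`
resource-control property on `system.slice`.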
### Swap and control plane nodes
The Kubernetes project recommends running control plane nodes without any swap space configured.
The control plane primarily hosts Guaranteed QoS Pods, so swap can generally be disabled.
The main concern is that swapping critical services on the control plane could negatively impact performance.
### Use of a dedicated disk for swap
The Kubernetes project recommends using encrypted swap, whenever you run nodes with swap enabled.
If swap resides on a partition or the root filesystem, workloads may interfere
with system processes that need to write to disk.
When they share the same disk, processes can overwhelm swap,
disrupting the I/O of kubelet, container runtime, and systemd, which would impact other workloads.
Since swap space is located on a disk, it is crucial to ensure the disk is fast enough for the intended use cases.
Alternatively, one can configure I/O priorities between different mapped areas of a single backing device.
### Swap-aware scheduling
Kubernetes {{< skew currentVersion >}} does not support allocating Pods to nodes in a way that accounts
for swap memory usage. The scheduler typically uses _requests_ for infrastructure resources
to guide Pod placement, and Pods do not request swap space; they just request `memory`.
This means that the scheduler does not consider swap memory when making scheduling decisions.
While this is something we are actively working on, it is not yet implemented.
To ensure that Pods are not scheduled on nodes with swap memory unless they are
specifically intended to use it, administrators can taint the nodes that have swap available.
Workloads that are meant to use swap then add a matching toleration;
Pods without that toleration will not be scheduled onto swap-enabled nodes.
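For example, a node could be tainted as shown below; the taint key is an illustrative
assumption, not a key defined by Kubernetes:

```shell
# Keep Pods that do not tolerate this taint off the swap-enabled node.
kubectl taint nodes <swap-node-name> example.com/swap=enabled:NoSchedule
```

Workloads intended to use swap would then add a matching toleration (and, typically,
node affinity for swap-enabled nodes) to their Pod templates.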
### Selecting storage for optimal performance
The storage device designated for swap space is critical to maintaining system responsiveness
during high memory usage.
Rotational hard disk drives (HDDs) are ill-suited for this task as their mechanical nature introduces significant latency,
leading to severe performance degradation and system thrashing.
For modern performance needs, a device such as a Solid State Drive (SSD) is probably the appropriate choice for swap,
as its low-latency electronic access minimizes the slowdown.
## Swap Behavior Details
### How is the swap limit being determined with LimitedSwap?
The configuration of swap memory, including its limitations, presents a significant
challenge. Not only is it prone to misconfiguration, but as a system-level property, any
misconfiguration could potentially compromise the entire node rather than just a specific
workload. To mitigate this risk and ensure the health of the node, we have implemented
swap support with automatic configuration of swap limits.
With `LimitedSwap`, Pods that do not fall under the Burstable QoS classification (i.e.
`BestEffort`/`Guaranteed` QoS Pods) are prohibited from utilizing swap memory.
`BestEffort` QoS Pods exhibit unpredictable memory consumption patterns and lack
information regarding their memory usage, making it difficult to determine a safe
allocation of swap memory.
Conversely, `Guaranteed` QoS Pods are typically employed for applications that rely on the
precise allocation of resources specified by the workload, with memory being immediately available.
To maintain the aforementioned security and node health guarantees,
these Pods are not permitted to use swap memory when `LimitedSwap` is in effect.
In addition, high-priority pods are not permitted to use swap, to ensure that the memory
they consume always remains resident in RAM and is therefore immediately available.
Prior to detailing the calculation of the swap limit, it is necessary to define the following terms:
* `nodeTotalMemory`: The total amount of physical memory available on the node.
* `totalPodsSwapAvailable`: The total amount of swap memory on the node that is available for use by Pods (some swap memory may be reserved for system use).
* `containerMemoryRequest`: The container's memory request.
Swap limitation is configured as:
( `containerMemoryRequest` / `nodeTotalMemory` ) × `totalPodsSwapAvailable`
In other words, the amount of swap that a container is able to use is proportionate to its
memory request, the node's total physical memory and the total amount of swap memory on
the node that is available for use by Pods.
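For example (illustrative numbers only): on a node with 32 GiB of physical memory and
16 GiB of swap available to Pods, a Burstable container requesting 8 GiB of memory would be
limited to ( 8 GiB / 32 GiB ) × 16 GiB = 4 GiB of swap.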
It is important to note that, for containers within Burstable QoS Pods, it is possible to
opt-out of swap usage by specifying memory requests that are equal to memory limits.
Containers configured in this manner will not have access to swap memory.
## {{% heading "whatsnext" %}}
- You can check out a [blog post about Kubernetes and swap](/blog/2025/03/25/swap-linux-improvements/)
- For more information, please see the original KEP, [KEP-2400](https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2400-node-swap),
and its [design](https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md).

View File

@ -41,14 +41,13 @@ receivers:
grpc:
exporters:
# Replace this exporter with the exporter for your backend
exporters:
debug:
verbosity: detailed
logging:
logLevel: debug
service:
pipelines:
traces:
receivers: [otlp]
exporters: [debug]
exporters: [logging]
```
To directly emit traces to a backend without utilizing a collector,

View File

@ -12,7 +12,7 @@ hide_summary: true # Listed separately in section index
A container image represents binary data that encapsulates an application and all its
software dependencies. Container images are executable software bundles that can run
standalone and that make very well-defined assumptions about their runtime environment.
standalone and that make very well defined assumptions about their runtime environment.
You typically create a container image of your application and push it to a registry
before referring to it in a {{< glossary_tooltip text="Pod" term_id="pod" >}}.
@ -34,7 +34,7 @@ Images can also include a registry hostname; for example: `fictional.registry.ex
and possibly a port number as well; for example: `fictional.registry.example:10443/imagename`.
If you don't specify a registry hostname, Kubernetes assumes that you mean the [Docker public registry](https://hub.docker.com/).
You can change this behavior by setting a default image registry in the
You can change this behaviour by setting default image registry in
[container runtime](/docs/setup/production-environment/container-runtimes/) configuration.
After the image name part you can add a _tag_ or _digest_ (in the same way you would when using with commands
@ -43,45 +43,40 @@ Digests are a unique identifier for a specific version of an image. Digests are
and are immutable. Tags can be moved to point to different images, but digests are fixed.
Image tags consist of lowercase and uppercase letters, digits, underscores (`_`),
periods (`.`), and dashes (`-`). A tag can be up to 128 characters long, and must
conform to the following regex pattern: `[a-zA-Z0-9_][a-zA-Z0-9._-]{0,127}`.
You can read more about it and find the validation regex in the
periods (`.`), and dashes (`-`). It can be up to 128 characters long. And must follow the
next regex pattern: `[a-zA-Z0-9_][a-zA-Z0-9._-]{0,127}`
You can read more about and find validation regex in the
[OCI Distribution Specification](https://github.com/opencontainers/distribution-spec/blob/master/spec.md#workflow-categories).
If you don't specify a tag, Kubernetes assumes you mean the tag `latest`.
Image digests consists of a hash algorithm (such as `sha256`) and a hash value. For example:
`sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07`.
You can find more information about the digest format in the
`sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07`
You can find more information about digests format in the
[OCI Image Specification](https://github.com/opencontainers/image-spec/blob/master/descriptor.md#digests).
Some image name examples that Kubernetes can use are:
- `busybox` &mdash; Image name only, no tag or digest. Kubernetes will use the Docker
public registry and latest tag. Equivalent to `docker.io/library/busybox:latest`.
- `busybox:1.32.0` &mdash; Image name with tag. Kubernetes will use the Docker
public registry. Equivalent to `docker.io/library/busybox:1.32.0`.
- `registry.k8s.io/pause:latest` &mdash; Image name with a custom registry and latest tag.
- `registry.k8s.io/pause:3.5` &mdash; Image name with a custom registry and non-latest tag.
- `registry.k8s.io/pause@sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07` &mdash;
Image name with digest.
- `registry.k8s.io/pause:3.5@sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07` &mdash;
Image name with tag and digest. Only the digest will be used for pulling.
- `busybox` - Image name only, no tag or digest. Kubernetes will use Docker public registry and latest tag. (Same as `docker.io/library/busybox:latest`)
- `busybox:1.32.0` - Image name with tag. Kubernetes will use Docker public registry. (Same as `docker.io/library/busybox:1.32.0`)
- `registry.k8s.io/pause:latest` - Image name with a custom registry and latest tag.
- `registry.k8s.io/pause:3.5` - Image name with a custom registry and non-latest tag.
- `registry.k8s.io/pause@sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07` - Image name with digest.
- `registry.k8s.io/pause:3.5@sha256:1ff6c18fbef2045af6b9c16bf034cc421a29027b800e4f9b68ae9b1cb3e9ae07` - Image name with tag and digest. Only digest will be used for pulling.
## Updating images
When you first create a {{< glossary_tooltip text="Deployment" term_id="deployment" >}},
{{< glossary_tooltip text="StatefulSet" term_id="statefulset" >}}, Pod, or other
object that includes a PodTemplate, and a pull policy was not explicitly specified,
then by default the pull policy of all containers in that Pod will be set to
`IfNotPresent`. This policy causes the
object that includes a Pod template, then by default the pull policy of all
containers in that pod will be set to `IfNotPresent` if it is not explicitly
specified. This policy causes the
{{< glossary_tooltip text="kubelet" term_id="kubelet" >}} to skip pulling an
image if it already exists.
### Image pull policy
The `imagePullPolicy` for a container and the tag of the image both affect _when_ the
[kubelet](/docs/reference/command-line-tools-reference/kubelet/) attempts to pull
(download) the specified image.
The `imagePullPolicy` for a container and the tag of the image affect when the
[kubelet](/docs/reference/command-line-tools-reference/kubelet/) attempts to pull (download) the specified image.
Here's a list of the values you can set for `imagePullPolicy` and the effects
these values have:
@ -124,12 +119,12 @@ When using image tags, if the image registry were to change the code that the ta
represents, you might end up with a mix of Pods running the old and new code. An image digest
uniquely identifies a specific version of the image, so Kubernetes runs the same code every time
it starts a container with that image name and digest specified. Specifying an image by digest
pins the code that you run so that a change at the registry cannot lead to that mix of versions.
fixes the code that you run so that a change at the registry cannot lead to that mix of versions.
There are third-party [admission controllers](/docs/reference/access-authn-authz/admission-controllers/)
that mutate Pods (and PodTemplates) when they are created, so that the
that mutate Pods (and pod templates) when they are created, so that the
running workload is defined based on an image digest rather than a tag.
That might be useful if you want to make sure that your entire workload is
That might be useful if you want to make sure that all your workload is
running the same code no matter what tag changes happen at the registry.
#### Default image pull policy {#imagepullpolicy-defaulting}
@ -140,11 +135,11 @@ When you (or a controller) submit a new Pod to the API server, your cluster sets
- if you omit the `imagePullPolicy` field, and you specify the digest for the
container image, the `imagePullPolicy` is automatically set to `IfNotPresent`.
- if you omit the `imagePullPolicy` field, and the tag for the container image is
`:latest`, `imagePullPolicy` is automatically set to `Always`.
`:latest`, `imagePullPolicy` is automatically set to `Always`;
- if you omit the `imagePullPolicy` field, and you don't specify the tag for the
container image, `imagePullPolicy` is automatically set to `Always`.
- if you omit the `imagePullPolicy` field, and you specify a tag for the container
image that isn't `:latest`, the `imagePullPolicy` is automatically set to
container image, `imagePullPolicy` is automatically set to `Always`;
- if you omit the `imagePullPolicy` field, and you specify the tag for the
container image that isn't `:latest`, the `imagePullPolicy` is automatically set to
`IfNotPresent`.
{{< note >}}
@ -189,18 +184,16 @@ which is 300 seconds (5 minutes).
Kubernetes includes alpha support for performing image pulls based on the RuntimeClass of a Pod.
If you enable the `RuntimeClassInImageCriApi` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/),
the kubelet references container images by a tuple of image name and runtime handler
rather than just the image name or digest. Your
{{< glossary_tooltip text="container runtime" term_id="container-runtime" >}}
the kubelet references container images by a tuple of (image name, runtime handler) rather than just the
image name or digest. Your {{< glossary_tooltip text="container runtime" term_id="container-runtime" >}}
may adapt its behavior based on the selected runtime handler.
Pulling images based on runtime class is useful for VM-based containers, such as
Windows Hyper-V containers.
Pulling images based on runtime class will be helpful for VM based containers like windows hyperV containers.
## Serial and parallel image pulls
By default, the kubelet pulls images serially. In other words, the kubelet sends
only one image pull request to the image service at a time. Other image pull
requests have to wait until the one being processed is complete.
By default, kubelet pulls images serially. In other words, kubelet sends only
one image pull request to the image service at a time. Other image pull requests
have to wait until the one being processed is complete.
Nodes make image pull decisions in isolation. Even when you use serialized image
pulls, two different nodes can pull the same image in parallel.
@ -210,91 +203,72 @@ If you would like to enable parallel image pulls, you can set the field
With `serializeImagePulls` set to false, image pull requests will be sent to the image service immediately,
and multiple images will be pulled at the same time.
When enabling parallel image pulls, ensure that the image service of your container
runtime can handle parallel image pulls.
When enabling parallel image pulls, please make sure the image service of your
container runtime can handle parallel image pulls.
The kubelet never pulls multiple images in parallel on behalf of one Pod. For example,
if you have a Pod that has an init container and an application container, the image
pulls for the two containers will not be parallelized. However, if you have two
Pods that use different images, and the parallel image pull feature is enabled,
the kubelet will pull the images in parallel on behalf of the two different Pods.
Pods that use different images, the kubelet pulls the images in parallel on
behalf of the two different Pods, when parallel image pulls is enabled.
### Maximum parallel image pulls
{{< feature-state for_k8s_version="v1.32" state="beta" >}}
When `serializeImagePulls` is set to false, the kubelet defaults to no limit on
the maximum number of images being pulled at the same time. If you would like to
When `serializeImagePulls` is set to false, the kubelet defaults to no limit on the
maximum number of images being pulled at the same time. If you would like to
limit the number of parallel image pulls, you can set the field `maxParallelImagePulls`
in the kubelet configuration. With `maxParallelImagePulls` set to _n_, only _n_
images can be pulled at the same time, and any image pull beyond _n_ will have to
wait until at least one ongoing image pull is complete.
in kubelet configuration. With `maxParallelImagePulls` set to _n_, only _n_ images
can be pulled at the same time, and any image pull beyond _n_ will have to wait
until at least one ongoing image pull is complete.
Limiting the number of parallel image pulls prevents image pulling from consuming
Limiting the number parallel image pulls would prevent image pulling from consuming
too much network bandwidth or disk I/O, when parallel image pulling is enabled.
You can set `maxParallelImagePulls` to a positive number that is greater than or
equal to 1. If you set `maxParallelImagePulls` to be greater than or equal to 2,
you must set `serializeImagePulls` to false. The kubelet will fail to start
with an invalid `maxParallelImagePulls` setting.
equal to 1. If you set `maxParallelImagePulls` to be greater than or equal to 2, you
must set the `serializeImagePulls` to false. The kubelet will fail to start with invalid
`maxParallelImagePulls` settings.
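A minimal sketch of such a configuration is shown below; the limit of 5 is an arbitrary example value:

```yaml
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
# Allow parallel image pulls, but no more than 5 at a time.
serializeImagePulls: false
maxParallelImagePulls: 5
```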
## Multi-architecture images with image indexes
As well as providing binary images, a container registry can also serve a
[container image index](https://github.com/opencontainers/image-spec/blob/master/image-index.md).
An image index can point to multiple [image manifests](https://github.com/opencontainers/image-spec/blob/master/manifest.md)
for architecture-specific versions of a container. The idea is that you can have
a name for an image (for example: `pause`, `example/mycontainer`, `kube-apiserver`)
and allow different systems to fetch the right binary image for the machine
architecture they are using.
for architecture-specific versions of a container. The idea is that you can have a name for an image
(for example: `pause`, `example/mycontainer`, `kube-apiserver`) and allow different systems to
fetch the right binary image for the machine architecture they are using.
The Kubernetes project typically creates container images for its releases with
names that include the suffix `-$(ARCH)`. For backward compatibility, generate
older images with suffixes. For instance, an image named as `pause` would be a
multi-architecture image containing manifests for all supported architectures,
while `pause-amd64` would be a backward-compatible version for older configurations,
or for YAML files with hardcoded image names containing suffixes.
Kubernetes itself typically names container images with a suffix `-$(ARCH)`. For backward
compatibility, please generate the older images with suffixes. The idea is to generate say `pause`
image which has the manifest for all the arch(es) and say `pause-amd64` which is backwards
compatible for older configurations or YAML files which may have hard coded the images with
suffixes.
## Using a private registry
Private registries may require authentication to be able to discover and/or pull
images from them.
Private registries may require keys to read images from them.
Credentials can be provided in several ways:
- [Specifying `imagePullSecrets` when you define a Pod](#specifying-imagepullsecrets-on-a-pod)
Only Pods which provide their own keys can access the private registry.
- [Configuring Nodes to Authenticate to a Private Registry](#configuring-nodes-to-authenticate-to-a-private-registry)
- All Pods can read any configured private registries.
- Requires node configuration by cluster administrator.
- Using a _kubelet credential provider_ plugin to [dynamically fetch credentials for private registries](#kubelet-credential-provider)
The kubelet can be configured to use credential provider exec plugin for the
respective private registry.
- [Pre-pulled Images](#pre-pulled-images)
- All Pods can use any images cached on a node.
- Requires root access to all nodes to set up.
- Configuring Nodes to Authenticate to a Private Registry
- all pods can read any configured private registries
- requires node configuration by cluster administrator
- Kubelet Credential Provider to dynamically fetch credentials for private registries
- kubelet can be configured to use credential provider exec plugin
for the respective private registry.
- Pre-pulled Images
- all pods can use any images cached on a node
- requires root access to all nodes to set up
- Specifying ImagePullSecrets on a Pod
- only pods which provide their own keys can access the private registry
- Vendor-specific or local extensions
If you're using a custom node configuration, you (or your cloud provider) can
implement your mechanism for authenticating the node to the container registry.
- if you're using a custom node configuration, you (or your cloud
provider) can implement your mechanism for authenticating the node
to the container registry.
These options are explained in more detail below.
### Specifying `imagePullSecrets` on a Pod
{{< note >}}
This is the recommended approach to run containers based on images
in private registries.
{{< /note >}}
Kubernetes supports specifying container image registry keys on a Pod.
All `imagePullSecrets` must be Secrets that exist in the same
{{< glossary_tooltip term_id="namespace" >}} as the
Pod. These Secrets must be of type `kubernetes.io/dockercfg` or `kubernetes.io/dockerconfigjson`.
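A minimal sketch of a Pod that references such a Secret follows; all names are illustrative:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: private-image-pod
spec:
  containers:
  - name: app
    image: my-registry.example/images/my-image:1.0
  imagePullSecrets:
  # Must name a Secret of type kubernetes.io/dockercfg or
  # kubernetes.io/dockerconfigjson in the same namespace as the Pod.
  - name: my-registry-credentials
```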
### Configuring nodes to authenticate to a private registry
Specific instructions for setting credentials depends on the container runtime and registry you
@ -306,17 +280,13 @@ task. That example uses a private registry in Docker Hub.
### Kubelet credential provider for authenticated image pulls {#kubelet-credential-provider}
You can configure the kubelet to invoke a plugin binary to dynamically fetch
registry credentials for a container image. This is the most robust and versatile
way to fetch credentials for private registries, but also requires kubelet-level
configuration to enable.
{{< note >}}
This approach is especially suitable when kubelet needs to fetch registry credentials dynamically.
Most commonly used for registries provided by cloud providers where auth tokens are short-lived.
{{< /note >}}
This technique can be especially useful for running {{< glossary_tooltip term_id="static-pod" text="static Pods" >}}
that require container images hosted in a private registry.
Using a {{< glossary_tooltip term_id="service-account" >}} or a
{{< glossary_tooltip term_id="secret" >}} to provide private registry credentials
is not possible in the specification of a static Pod, because it _cannot_
have references to other API resources in its specification.
You can configure the kubelet to invoke a plugin binary to dynamically fetch registry credentials for a container image.
This is the most robust and versatile way to fetch credentials for private registries, but also requires kubelet-level configuration to enable.
See [Configure a kubelet image credential provider](/docs/tasks/administer-cluster/kubelet-credential-provider/) for more details.
@ -329,57 +299,55 @@ prefix-matched paths. The only limitation is that glob patterns (`*`) have to
include the dot (`.`) for each subdomain. The amount of matched subdomains has
to be equal to the amount of glob patterns (`*.`), for example:
- `*.kubernetes.io` will *not* match `kubernetes.io`, but will match
`abc.kubernetes.io`.
- `*.*.kubernetes.io` will *not* match `abc.kubernetes.io`, but will match
`abc.def.kubernetes.io`.
- `prefix.*.io` will match `prefix.kubernetes.io`.
- `*-good.kubernetes.io` will match `prefix-good.kubernetes.io`.
- `*.kubernetes.io` will *not* match `kubernetes.io`, but `abc.kubernetes.io`
- `*.*.kubernetes.io` will *not* match `abc.kubernetes.io`, but `abc.def.kubernetes.io`
- `prefix.*.io` will match `prefix.kubernetes.io`
- `*-good.kubernetes.io` will match `prefix-good.kubernetes.io`
This means that a `config.json` like this is valid:
```json
{
"auths": {
"my-registry.example/images": { "auth": "…" },
"*.my-registry.example/images": { "auth": "…" }
"my-registry.io/images": { "auth": "…" },
"*.my-registry.io/images": { "auth": "…" }
}
}
```
Image pull operations pass the credentials to the CRI container runtime for every
valid pattern. For example, the following container image names would match
successfully:
Image pull operations would now pass the credentials to the CRI container
runtime for every valid pattern. For example the following container image names
would match successfully:
- `my-registry.example/images`
- `my-registry.example/images/my-image`
- `my-registry.example/images/another-image`
- `sub.my-registry.example/images/my-image`
- `my-registry.io/images`
- `my-registry.io/images/my-image`
- `my-registry.io/images/another-image`
- `sub.my-registry.io/images/my-image`
However, these container image names would *not* match:
But not:
- `a.sub.my-registry.example/images/my-image`
- `a.b.sub.my-registry.example/images/my-image`
- `a.sub.my-registry.io/images/my-image`
- `a.b.sub.my-registry.io/images/my-image`
The kubelet performs image pulls sequentially for every found credential. This
means that multiple entries in `config.json` for different paths are possible, too:
means, that multiple entries in `config.json` for different paths are possible, too:
```json
{
"auths": {
"my-registry.example/images": {
"my-registry.io/images": {
"auth": "…"
},
"my-registry.example/images/subpath": {
"my-registry.io/images/subpath": {
"auth": "…"
}
}
}
```
If now a container specifies an image `my-registry.example/images/subpath/my-image`
to be pulled, then the kubelet will try to download it using both authentication
sources if one of them fails.
If now a container specifies an image `my-registry.io/images/subpath/my-image`
to be pulled, then the kubelet will try to download them from both
authentication sources if one of them fails.
### Pre-pulled images
@ -396,34 +364,40 @@ then a local image is used (preferentially or exclusively, respectively).
If you want to rely on pre-pulled images as a substitute for registry authentication,
you must ensure all nodes in the cluster have the same pre-pulled images.
This can be used to preload certain images for speed or as an alternative to
authenticating to a private registry.
Similar to the usage of the [kubelet credential provider](#kubelet-credential-provider),
pre-pulled images are also suitable for launching
{{< glossary_tooltip text="static Pods" term_id="static-pod" >}} that depend
on images hosted in a private registry.
This can be used to preload certain images for speed or as an alternative to authenticating to a
private registry.
{{< note >}}
{{< feature-state feature_gate_name="KubeletEnsureSecretPulledImages" >}}
Access to pre-pulled images may be authorized according to [image pull credential verification](#ensureimagepullcredentialverification).
Access to pre-pulled images may be authorized according to [image pull credential verification](#ensureimagepullcredentialverification)
{{< /note >}}
#### Ensure image pull credential verification {#ensureimagepullcredentialverification}
### Specifying imagePullSecrets on a Pod
{{< note >}}
This is the recommended approach to run containers based on images
in private registries.
{{< /note >}}
Kubernetes supports specifying container image registry keys on a Pod.
`imagePullSecrets` must all be in the same namespace as the Pod. The referenced
Secrets must be of type `kubernetes.io/dockercfg` or `kubernetes.io/dockerconfigjson`.
#### Ensure Image Pull Credential Verification {#ensureimagepullcredentialverification}
{{< feature-state feature_gate_name="KubeletEnsureSecretPulledImages" >}}
If the `KubeletEnsureSecretPulledImages` feature gate is enabled for your cluster,
Kubernetes will validate image credentials for every image that requires credentials
to be pulled, even if that image is already present on the node. This validation
ensures that images in a Pod request which have not been successfully pulled
If the `KubeletEnsureSecretPulledImages` feature gate is enabled, Kubernetes will validate
image credentials for every image that requires credentials to be pulled,
even if that image is already present on the node.
This validation ensures that images in a pod request which have not been successfully pulled
with the provided credentials must re-pull the images from the registry.
Additionally, image pulls that re-use the same credentials
which previously resulted in a successful image pull will not need to re-pull from
the registry and are instead validated locally without accessing the registry
which previously resulted in a successful image pull will not need to re-pull from the registry
and are instead validated locally without accessing the registry
(provided the image is available locally).
This is controlled by the`imagePullCredentialsVerificationPolicy` field in the
[Kubelet configuration](/docs/reference/config-api/kubelet-config.v1beta1/#kubelet-config-k8s-io-v1beta1-ImagePullCredentialsVerificationPolicy).
[Kubelet configuration](/docs/reference/config-api/kubelet-config.v1beta1#ImagePullCredentialsVerificationPolicy).
This configuration controls when image pull credentials must be verified if the
image is already present on the node:
@ -432,13 +406,13 @@ image is already present on the node:
If the image is present locally, image pull credentials are not verified.
* `NeverVerifyPreloadedImages`: Images pulled outside the kubelet are not verified,
but all other images will have their credentials verified. This is the default behavior.
* `NeverVerifyAllowListedImages`: Images pulled outside the kubelet and mentioned within the
`preloadedImagesVerificationAllowlist` specified in the kubelet config are not verified.
* `AlwaysVerify`: All images will have their credentials verified
before they can be used.
This verification applies to [pre-pulled images](#pre-pulled-images),
images pulled using node-wide secrets, and images pulled using Pod-level secrets.
images pulled using node-wide secrets, and images pulled using pod-level secrets.
{{< note >}}
In the case of credential rotation, the credentials previously used to pull the image
@ -450,19 +424,19 @@ will require the image to be re-pulled from the registry.
You need to know the username, registry password and client email address for authenticating
to the registry, as well as its hostname.
Run the following command, substituting placeholders with the appropriate values:
Run the following command, substituting the appropriate uppercase values:
```shell
kubectl create secret docker-registry <name> \
--docker-server=<docker-registry-server> \
--docker-username=<docker-user> \
--docker-password=<docker-password> \
--docker-email=<docker-email>
--docker-server=DOCKER_REGISTRY_SERVER \
--docker-username=DOCKER_USER \
--docker-password=DOCKER_PASSWORD \
--docker-email=DOCKER_EMAIL
```
If you already have a Docker credentials file then, rather than using the above
command, you can import the credentials file as a Kubernetes
{{< glossary_tooltip text="Secret" term_id="secret" >}}.
{{< glossary_tooltip text="Secrets" term_id="secret" >}}.
[Create a Secret based on existing Docker credentials](/docs/tasks/configure-pod-container/pull-image-private-registry/#registry-secret-existing-credentials)
explains how to set this up.
@ -475,11 +449,11 @@ Pods can only reference image pull secrets in their own namespace,
so this process needs to be done one time per namespace.
{{< /note >}}
#### Referring to `imagePullSecrets` on a Pod
#### Referring to an imagePullSecrets on a Pod
Now, you can create pods which reference that secret by adding the `imagePullSecrets`
Now, you can create pods which reference that secret by adding an `imagePullSecrets`
section to a Pod definition. Each item in the `imagePullSecrets` array can only
reference one Secret in the same namespace.
reference a Secret in the same namespace.
For example:
@ -504,14 +478,15 @@ resources:
EOF
```
This needs to be done for each Pod that is using a private registry.
This needs to be done for each pod that is using a private registry.
However, you can automate this process by specifying the `imagePullSecrets` section
in a [ServiceAccount](/docs/tasks/configure-pod-container/configure-service-account/)
resource. See [Add ImagePullSecrets to a Service Account](/docs/tasks/configure-pod-container/configure-service-account/#add-imagepullsecrets-to-a-service-account)
However, setting of this field can be automated by setting the imagePullSecrets
in a [ServiceAccount](/docs/tasks/configure-pod-container/configure-service-account/) resource.
Check [Add ImagePullSecrets to a Service Account](/docs/tasks/configure-pod-container/configure-service-account/#add-imagepullsecrets-to-a-service-account)
for detailed instructions.
You can use this in conjunction with a per-node `.docker/config.json`. The credentials
will be merged.
## Use cases
@ -527,7 +502,7 @@ common use cases and suggested solutions.
1. Cluster running some proprietary images which should be hidden to those outside the company, but
visible to all cluster users.
- Use a hosted private registry
- Manual configuration may be required on the nodes that need to access to private registry.
- Manual configuration may be required on the nodes that need to access to private registry
- Or, run an internal private registry behind your firewall with open read access.
- No Kubernetes configuration is required.
- Use a hosted container image registry service that controls image access
@ -536,34 +511,33 @@ common use cases and suggested solutions.
1. Cluster with proprietary images, a few of which require stricter access control.
- Ensure [AlwaysPullImages admission controller](/docs/reference/access-authn-authz/admission-controllers/#alwayspullimages)
is active. Otherwise, all Pods potentially have access to all images.
- Move sensitive data into a Secret resource, instead of packaging it in an image.
- Move sensitive data into a "Secret" resource, instead of packaging it in an image.
1. A multi-tenant cluster where each tenant needs own private registry.
- Ensure [AlwaysPullImages admission controller](/docs/reference/access-authn-authz/admission-controllers/#alwayspullimages)
is active. Otherwise, all Pods of all tenants potentially have access to all images.
- Run a private registry with authorization required.
- Generate registry credentials for each tenant, store into a Secret, and propagate
the Secret to every tenant namespace.
- The tenant then adds that Secret to `imagePullSecrets` of each namespace.
- Generate registry credential for each tenant, put into secret, and populate secret to each
tenant namespace.
- The tenant adds that secret to imagePullSecrets of each namespace.
If you need access to multiple registries, you can create one Secret per registry.
If you need access to multiple registries, you can create one secret for each registry.
## Legacy built-in kubelet credential provider
In older versions of Kubernetes, the kubelet had a direct integration with cloud
provider credentials. This provided the ability to dynamically fetch credentials
for image registries.
In older versions of Kubernetes, the kubelet had a direct integration with cloud provider credentials.
This gave it the ability to dynamically fetch credentials for image registries.
There were three built-in implementations of the kubelet credential provider
integration: ACR (Azure Container Registry), ECR (Elastic Container Registry),
and GCR (Google Container Registry).
There were three built-in implementations of the kubelet credential provider integration:
ACR (Azure Container Registry), ECR (Elastic Container Registry), and GCR (Google Container Registry).
Starting with version 1.26 of Kubernetes, the legacy mechanism has been removed,
so you would need to either:
- configure a kubelet image credential provider on each node; or
- specify image pull credentials using `imagePullSecrets` and at least one Secret.
For more information on the legacy mechanism, read the documentation for the version of Kubernetes that you
are using. Kubernetes v1.26 through to v{{< skew latestVersion >}} do not include the legacy mechanism, so
you would need to either:
- configure a kubelet image credential provider on each node
- specify image pull credentials using `imagePullSecrets` and at least one Secret
## {{% heading "whatsnext" %}}
* Read the [OCI Image Manifest Specification](https://github.com/opencontainers/image-spec/blob/main/manifest.md).
* Read the [OCI Image Manifest Specification](https://github.com/opencontainers/image-spec/blob/master/manifest.md).
* Learn about [container image garbage collection](/docs/concepts/architecture/garbage-collection/#container-image-garbage-collection).
* Learn more about [pulling an Image from a Private Registry](/docs/tasks/configure-pod-container/pull-image-private-registry).

View File

@ -30,10 +30,10 @@ Here's a brief overview of the main components:
Manage the overall state of the cluster:
[kube-apiserver](/docs/concepts/architecture/#kube-apiserver)
: The core component server that exposes the Kubernetes HTTP API.
: The core component server that exposes the Kubernetes HTTP API
[etcd](/docs/concepts/architecture/#etcd)
: Consistent and highly-available key value store for all API server data.
: Consistent and highly-available key value store for all API server data
[kube-scheduler](/docs/concepts/architecture/#kube-scheduler)
: Looks for Pods not yet bound to a node, and assigns each Pod to a suitable node.
@ -68,16 +68,16 @@ run [systemd](https://systemd.io/) on a Linux node to supervise local components
Addons extend the functionality of Kubernetes. A few important examples include:
[DNS](/docs/concepts/architecture/#dns)
: For cluster-wide DNS resolution.
: For cluster-wide DNS resolution
[Web UI](/docs/concepts/architecture/#web-ui-dashboard) (Dashboard)
: For cluster management via a web interface.
: For cluster management via a web interface
[Container Resource Monitoring](/docs/concepts/architecture/#container-resource-monitoring)
: For collecting and storing container metrics.
: For collecting and storing container metrics
[Cluster-level Logging](/docs/concepts/architecture/#cluster-level-logging)
: For saving container logs to a central log store.
: For saving container logs to a central log store
## Flexibility in Architecture

View File

@ -55,11 +55,6 @@ object once it is set.
* After the deletion is requested, you can not resurrect this object. The only way is to delete it and make a new similar object.
{{</note>}}
{{<note>}}
Custom finalizer names **must** be publicly qualified finalizer names, such as `example.com/finalizer-name`.
Kubernetes enforces this format; the API server rejects writes to objects where the change does not use qualified finalizer names for any custom finalizer.
{{</note>}}
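For example, an object carrying the custom finalizer from the note above would include it in its metadata like this:

```yaml
metadata:
  finalizers:
  - example.com/finalizer-name
```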
## Owner references, labels, and finalizers {#owners-labels-finalizers}
Like {{<glossary_tooltip text="labels" term_id="label">}},

View File

@ -14,57 +14,36 @@ weight: 20
When several users or teams share a cluster with a fixed number of nodes,
there is a concern that one team could use more than its fair share of resources.
_Resource quotas_ are a tool for administrators to address this concern.
A resource quota, defined by a ResourceQuota object, provides constraints that limit
aggregate resource consumption per {{< glossary_tooltip text="namespace" term_id="namespace" >}}. A ResourceQuota can also
limit the [quantity of objects that can be created in a namespace](#object-count-quota) by API kind, as well as the total
amount of {{< glossary_tooltip text="infrastructure resources" term_id="infrastructure-resource" >}} that may be consumed by
API objects found in that namespace.
{{< caution >}}
Neither contention nor changes to quota will affect already created resources.
{{< /caution >}}
Resource quotas are a tool for administrators to address this concern.
<!-- body -->
## How Kubernetes ResourceQuotas work
A resource quota, defined by a `ResourceQuota` object, provides constraints that limit
aggregate resource consumption per namespace. It can limit the quantity of objects that can
be created in a namespace by type, as well as the total amount of compute resources that may
be consumed by resources in that namespace.
ResourceQuotas work like this:
Resource quotas work like this:
- Different teams work in different namespaces. This separation can be enforced with
[RBAC](/docs/reference/access-authn-authz/rbac/) or any other [authorization](/docs/reference/access-authn-authz/authorization/)
mechanism.
- Different teams work in different namespaces. This can be enforced with
[RBAC](/docs/reference/access-authn-authz/rbac/).
- A cluster administrator creates at least one ResourceQuota for each namespace.
- To make sure the enforcement stays enforced, the cluster administrator should also restrict access to delete or update
that ResourceQuota; for example, by defining a [ValidatingAdmissionPolicy](/docs/reference/access-authn-authz/validating-admission-policy/).
- The administrator creates one ResourceQuota for each namespace.
- Users create resources (pods, services, etc.) in the namespace, and the quota system
tracks usage to ensure it does not exceed hard resource limits defined in a ResourceQuota.
You can apply a [scope](#quota-scopes) to a ResourceQuota to limit where it applies,
- If creating or updating a resource violates a quota constraint, the request will fail with HTTP
status code `403 FORBIDDEN` with a message explaining the constraint that would have been violated.
- If creating or updating a resource violates a quota constraint, the control plane rejects that request with HTTP
status code `403 Forbidden`. The error includes a message explaining the constraint that would have been violated.
- If quotas are enabled in a namespace for compute resources like `cpu` and `memory`, users must specify
requests or limits for those values; otherwise, the quota system may reject pod creation. Hint: Use
the `LimitRanger` admission controller to force defaults for pods that make no compute resource requirements.
- If quotas are enabled in a namespace for {{< glossary_tooltip text="resource" term_id="infrastructure-resource" >}}
such as `cpu` and `memory`, users must specify requests or limits for those values when they define a Pod; otherwise,
the quota system may reject pod creation.
The resource quota [walkthrough](/docs/tasks/administer-cluster/manage-resources/quota-memory-cpu-namespace/)
shows an example of how to avoid this problem.
See the [walkthrough](/docs/tasks/administer-cluster/manage-resources/quota-memory-cpu-namespace/)
for an example of how to avoid this problem.
{{< note >}}
* You can define a [LimitRange](/docs/concepts/policy/limit-range/)
to force defaults on pods that make no compute resource requirements (so that users don't have to remember to do that).
{{< /note >}}
You often do not create Pods directly; for example, you more usually create a [workload management](/docs/concepts/workloads/controllers/)
object such as a {{< glossary_tooltip term_id="deployment" >}}. If you create a Deployment that tries to use more
resources than are available, the creation of the Deployment (or other workload management object) **succeeds**, but
the Deployment may not be able to get all of the Pods it manages to exist. In that case you can check the status of
the Deployment, for example with `kubectl describe`, to see what has happened.
- For `cpu` and `memory` resources, ResourceQuotas enforce that **every**
(new) pod in that namespace sets a limit for that resource.
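As an illustration (the namespace, name, and amounts below are arbitrary examples),
a ResourceQuota that caps aggregate compute usage and the number of Pods in a namespace could look like:

```yaml
apiVersion: v1
kind: ResourceQuota
metadata:
  name: compute-quota
  namespace: team-a
spec:
  hard:
    requests.cpu: "4"
    requests.memory: 8Gi
    limits.cpu: "8"
    limits.memory: 16Gi
    pods: "20"
```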
@ -80,6 +59,8 @@ the Deployment, for example with `kubectl describe`, to see what has happened.
You can use a [LimitRange](/docs/concepts/policy/limit-range/) to automatically set
a default request for these resources.
{{< /note >}}
The name of a ResourceQuota object must be a valid
[DNS subdomain name](/docs/concepts/overview/working-with-objects/names#dns-subdomain-names).
@ -93,6 +74,7 @@ Examples of policies that could be created using namespaces and quotas are:
In the case where the total capacity of the cluster is less than the sum of the quotas of the namespaces,
there may be contention for resources. This is handled on a first-come-first-served basis.
Neither contention nor changes to quota will affect already created resources.
## Enabling Resource Quota
@ -934,9 +916,8 @@ and it is to be created in a namespace other than `kube-system`.
## {{% heading "whatsnext" %}}
- See a [detailed example for how to use resource quota](/docs/tasks/administer-cluster/quota-api-object/).
- Read the ResourceQuota [API reference](/docs/reference/kubernetes-api/policy-resources/resource-quota-v1/)
- Learn about [LimitRanges](/docs/concepts/policy/limit-range/)
- You can read the historical [ResourceQuota design document](https://git.k8s.io/design-proposals-archive/resource-management/admission_control_resource_quota.md)
- See [ResourceQuota design document](https://git.k8s.io/design-proposals-archive/resource-management/admission_control_resource_quota.md)
for more information.
- You can also read the [Quota support for priority class design document](https://git.k8s.io/design-proposals-archive/scheduling/pod-priority-resourcequota.md).
- See a [detailed example for how to use resource quota](/docs/tasks/administer-cluster/quota-api-object/).
- Read [Quota support for priority class design document](https://git.k8s.io/design-proposals-archive/scheduling/pod-priority-resourcequota.md).
- See [LimitedResources](https://github.com/kubernetes/kubernetes/pull/36765).

View File

@ -1,10 +1,8 @@
---
reviewers:
- davidopp
- dom4ha
- kevin-wangzefeng
- macsko
- sanposhiho
- alculquicondor
title: Assigning Pods to Nodes
content_type: concept
weight: 20
@ -83,7 +81,7 @@ information.
## Affinity and anti-affinity
`nodeSelector` is the simplest way to constrain Pods to nodes with specific
labels. Affinity and anti-affinity expand the types of constraints you can
labels. Affinity and anti-affinity expands the types of constraints you can
define. Some of the benefits of affinity and anti-affinity include:
- The affinity/anti-affinity language is more expressive. `nodeSelector` only
@ -236,12 +234,10 @@ Pods, the default Kubernetes scheduler places those Pods and honors any
### Inter-pod affinity and anti-affinity
Inter-pod affinity and anti-affinity allow you to constrain which nodes your
Pods can be scheduled on based on the labels of Pods already running on that
Pods can be scheduled on based on the labels of **Pods** already running on that
node, instead of the node labels.
#### Types of Inter-pod Affinity and Anti-affinity
Inter-pod affinity and anti-affinity take the form "this
Inter-pod affinity and anti-affinity rules take the form "this
Pod should (or, in the case of anti-affinity, should not) run in an X if that X
is already running one or more Pods that meet rule Y", where X is a topology
domain like node, rack, cloud provider zone or region, or similar and Y is the
@ -261,14 +257,16 @@ the node label that the system uses to denote the domain. For examples, see
Inter-pod affinity and anti-affinity require substantial amounts of
processing which can slow down scheduling in large clusters significantly. We do
not recommend using them in clusters larger than several hundred nodes.
{{</note>}}
{{< /note >}}
{{< note >}}
Pod anti-affinity requires nodes to be consistently labeled, in other words,
every node in the cluster must have an appropriate label matching `topologyKey`.
If some or all nodes are missing the specified `topologyKey` label, it can lead
to unintended behavior.
{{</note>}}
{{< /note >}}
#### Types of inter-pod affinity and anti-affinity
Similar to [node affinity](#node-affinity) are two types of Pod affinity and
anti-affinity as follows:
@ -287,34 +285,16 @@ To use inter-pod affinity, use the `affinity.podAffinity` field in the Pod spec.
For inter-pod anti-affinity, use the `affinity.podAntiAffinity` field in the Pod
spec.
#### Scheduling Behavior
When scheduling a new Pod, the Kubernetes scheduler evaluates the Pod's affinity/anti-affinity rules in the context of the current cluster state:
1. Hard Constraints (Node Filtering):
- `podAffinity.requiredDuringSchedulingIgnoredDuringExecution` and `podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecution`:
- The scheduler ensures the new Pod is assigned to nodes that satisfy these required affinity and anti-affinity rules based on existing Pods.
2. Soft Constraints (Scoring):
- `podAffinity.preferredDuringSchedulingIgnoredDuringExecution` and `podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecution`:
- The scheduler scores nodes based on how well they meet these preferred affinity and anti-affinity rules to optimize Pod placement.
3. Ignored Fields:
- Existing Pods' `podAffinity.preferredDuringSchedulingIgnoredDuringExecution`:
- These preferred affinity rules are not considered during the scheduling decision for new Pods.
- Existing Pods' `podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecution`:
- Similarly, preferred anti-affinity rules of existing Pods are ignored during scheduling.
#### Scheduling a Group of Pods with Inter-pod Affinity to Themselves
#### Scheduling a group of pods with inter-pod affinity to themselves
If the current Pod being scheduled is the first in a series that have affinity to themselves,
it is allowed to be scheduled if it passes all other affinity checks. This is determined by
verifying that no other Pod in the cluster matches the namespace and selector of this Pod,
that the Pod matches its own terms, and the chosen node matches all requested topologies.
This ensures that there will not be a deadlock even if all the Pods have inter-pod affinity
verifying that no other pod in the cluster matches the namespace and selector of this pod,
that the pod matches its own terms, and the chosen node matches all requested topologies.
This ensures that there will not be a deadlock even if all the pods have inter-pod affinity
specified.
#### Pod Affinity Example {#an-example-of-a-pod-that-uses-pod-affinity}
#### Pod affinity example {#an-example-of-a-pod-that-uses-pod-affinity}
Consider the following Pod spec:
@ -369,7 +349,7 @@ of namespaces which the `labelSelector` should match against using the
If omitted or empty, `namespaces` defaults to the namespace of the Pod where the
affinity/anti-affinity definition appears.
#### Namespace Selector
#### Namespace selector
{{< feature-state for_k8s_version="v1.24" state="stable" >}}
@ -391,12 +371,12 @@ When you want to disable it, you have to disable it explicitly via the
{{< /note >}}
Kubernetes includes an optional `matchLabelKeys` field for Pod affinity
or anti-affinity. The field specifies keys for the labels that should match with the incoming Pod's labels,
when satisfying the Pod (anti)affinity.
The keys are used to look up values from the Pod labels; those key-value labels are combined
The keys are used to look up values from the pod labels; those key-value labels are combined
(using `AND`) with the match restrictions defined using the `labelSelector` field. The combined
filtering selects the set of existing Pods that will be taken into Pod (anti)affinity calculation.
filtering selects the set of existing pods that will be taken into Pod (anti)affinity calculation.
{{< caution >}}
It's not recommended to use `matchLabelKeys` with labels that might be updated directly on pods.
@ -448,7 +428,7 @@ When you want to disable it, you have to disable it explicitly via the
{{< /note >}}
Kubernetes includes an optional `mismatchLabelKeys` field for Pod affinity
or anti-affinity. The field specifies keys for the labels that should not match with the incoming Pod's labels,
or anti-affinity. The field specifies keys for the labels that should **not** match with the incoming Pod's labels,
when satisfying the Pod (anti)affinity.
{{< caution >}}
@ -472,20 +452,20 @@ spec:
affinity:
podAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
# ensure that Pods associated with this tenant land on the correct node pool
# ensure that pods associated with this tenant land on the correct node pool
- matchLabelKeys:
- tenant
topologyKey: node-pool
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
# ensure that Pods associated with this tenant can't schedule to nodes used for another tenant
# ensure that pods associated with this tenant can't schedule to nodes used for another tenant
- mismatchLabelKeys:
- tenant # whatever the value of the "tenant" label for this Pod, prevent
# scheduling to nodes in any pool where any Pod from a different
# tenant is running.
labelSelector:
# We have to have the labelSelector which selects only Pods with the tenant label,
# otherwise this Pod would have anti-affinity against Pods from daemonsets as well, for example,
# otherwise this Pod would have Pods from daemonsets as well, for example,
# which aren't supposed to have the tenant label.
matchExpressions:
- key: tenant
@ -669,10 +649,10 @@ The following operators can only be used with `nodeAffinity`.
| `Gt` | The field value will be parsed as an integer, and that integer is less than the integer that results from parsing the value of a label named by this selector |
| `Lt` | The field value will be parsed as an integer, and that integer is greater than the integer that results from parsing the value of a label named by this selector |
{{<note>}}
`Gt` and `Lt` operators will not work with non-integer values. If the given value
doesn't parse as an integer, the Pod will fail to get scheduled. Also, `Gt` and `Lt`
doesn't parse as an integer, the pod will fail to get scheduled. Also, `Gt` and `Lt`
are not available for `podAffinity`.
{{</note>}}
@ -684,4 +664,4 @@ are not available for `podAffinity`.
- Learn about how the [topology manager](/docs/tasks/administer-cluster/topology-manager/) takes part in node-level
resource allocation decisions.
- Learn how to use [nodeSelector](/docs/tasks/configure-pod-container/assign-pods-nodes/).
- Learn how to use [affinity and anti-affinity](/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/).

View File

@ -30,319 +30,165 @@ api_metadata:
{{< feature-state feature_gate_name="DynamicResourceAllocation" >}}
This page describes _dynamic resource allocation (DRA)_ in Kubernetes.
Dynamic resource allocation is an API for requesting and sharing resources
between pods and containers inside a pod. It is a generalization of the
persistent volumes API for generic resources. Typically those resources
are devices like GPUs.
Third-party resource drivers are
responsible for tracking and preparing resources, with allocation of
resources handled by Kubernetes via _structured parameters_ (introduced in Kubernetes 1.30).
Different kinds of resources support arbitrary parameters for defining requirements and
initialization.
Kubernetes v1.26 through to 1.31 included an (alpha) implementation of _classic DRA_,
which is no longer supported. This documentation, which is for Kubernetes
v{{< skew currentVersion >}}, explains the current approach to dynamic resource
allocation within Kubernetes.
## {{% heading "prerequisites" %}}
Kubernetes v{{< skew currentVersion >}} includes cluster-level API support for
dynamic resource allocation, but it [needs to be enabled](#enabling-dynamic-resource-allocation)
explicitly. You also must install a resource driver for specific resources that
are meant to be managed using this API. If you are not running Kubernetes
v{{< skew currentVersion>}}, check the documentation for that version of Kubernetes.
<!-- body -->
## About DRA {#about-dra}
## API
{{< glossary_definition prepend="DRA is" term_id="dra" length="all" >}}
Allocating resources with DRA is a similar experience to
[dynamic volume provisioning](/docs/concepts/storage/dynamic-provisioning/), in
which you use PersistentVolumeClaims to claim storage capacity from storage
classes and request the claimed capacity in your Pods.
### Benefits of DRA {#dra-benefits}
DRA provides a flexible way to categorize, request, and use devices in your
cluster. Using DRA provides benefits like the following:
* **Flexible device filtering**: use common expression language (CEL) to perform
fine-grained filtering for specific device attributes.
* **Device sharing**: share the same resource with multiple containers or Pods
by referencing the corresponding resource claim.
* **Centralized device categorization**: device drivers and cluster admins can
use device classes to provide app operators with hardware categories that are
optimized for various use cases. For example, you can create a cost-optimized
device class for general-purpose workloads, and a high-performance device
class for critical jobs.
* **Simplified Pod requests**: with DRA, app operators don't need to specify
device quantities in Pod resource requests. Instead, the Pod references a
resource claim, and the device configuration in that claim applies to the Pod.
These benefits provide significant improvements in the device allocation
workflow when compared to
[device plugins](/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/),
which require per-container device requests, don't support device sharing, and
don't support expression-based device filtering.
### Types of DRA users {#dra-user-types}
The workflow of using DRA to allocate devices involves the following types of
users:
* **Device owner**: responsible for devices. Device owners might be commercial
vendors, the cluster operator, or another entity. To use DRA, devices must
have DRA-compatible drivers that do the following:
* Create ResourceSlices that provide Kubernetes with information about
nodes and resources.
* Update ResourceSlices when resource capacity in the cluster changes.
* Optionally, create DeviceClasses that workload operators can use to
claim devices.
* **Cluster admin**: responsible for configuring clusters and nodes,
attaching devices, installing drivers, and similar tasks. To use DRA,
cluster admins do the following:
* Attach devices to nodes.
* Install device drivers that support DRA.
* Optionally, create DeviceClasses that workload operators can use to claim
devices.
* **Workload operator**: responsible for deploying and managing workloads in the
cluster. To use DRA to allocate devices to Pods, workload operators do the
following:
* Create ResourceClaims or ResourceClaimTemplates to request specific
configurations within DeviceClasses.
* Deploy workloads that use specific ResourceClaims or ResourceClaimTemplates.
## DRA terminology {#terminology}
DRA uses the following Kubernetes API kinds to provide the core allocation
functionality. All of these API kinds are included in the
`resource.k8s.io/v1beta1`
{{< glossary_tooltip text="API group" term_id="api-group" >}}.
DeviceClass
: Defines a category of devices that can be claimed and how to select specific
device attributes in claims. The DeviceClass parameters can match zero or
more devices in ResourceSlices. To claim devices from a DeviceClass,
ResourceClaims select specific device attributes.
The `resource.k8s.io/v1beta1` and `resource.k8s.io/v1beta2`
{{< glossary_tooltip text="API groups" term_id="api-group" >}} provide these types:
ResourceClaim
: Describes a request for access to attached resources, such as
devices, in the cluster. ResourceClaims provide Pods with access to
a specific resource. ResourceClaims can be created by workload operators
or generated by Kubernetes based on a ResourceClaimTemplate.
: Describes a request for access to resources in the cluster,
for use by workloads. For example, if a workload needs an accelerator device
with specific properties, this is how that request is expressed. The status
stanza tracks whether this claim has been satisfied and what specific
resources have been allocated.
ResourceClaimTemplate
: Defines a template that Kubernetes uses to create per-Pod
ResourceClaims for a workload. ResourceClaimTemplates provide Pods with
access to separate, similar resources. Each ResourceClaim that Kubernetes
generates from the template is bound to a specific Pod. When the Pod
terminates, Kubernetes deletes the corresponding ResourceClaim.
: Defines the spec and some metadata for creating
ResourceClaims. Created by a user when deploying a workload.
The per-Pod ResourceClaims are then created and removed by Kubernetes
automatically.
DeviceClass
: Contains pre-defined selection criteria for certain devices and
configuration for them. DeviceClasses are created by a cluster administrator
when installing a resource driver. Each request to allocate a device
in a ResourceClaim must reference exactly one DeviceClass.
ResourceSlice
: Represents one or more resources that are attached to nodes, such as devices.
Drivers create and manage ResourceSlices in the cluster. When a ResourceClaim
is created and used in a Pod, Kubernetes uses ResourceSlices to find nodes
that have access to the claimed resources. Kubernetes allocates resources to
the ResourceClaim and schedules the Pod onto a node that can access the
resources.
: Used by DRA drivers to publish information about resources (typically devices)
that are available in the cluster.
### DeviceClass {#deviceclass}
DeviceTaintRule
: Used by admins or control plane components to add device taints
to the devices described in ResourceSlices.
A DeviceClass lets cluster admins or device drivers define categories of devices
in the cluster. DeviceClasses tell operators what devices they can request and
how they can request those devices. You can use
[common expression language (CEL)](https://cel.dev) to select devices based on
specific attributes. A ResourceClaim that references the DeviceClass can then
request specific configurations within the DeviceClass.
All parameters that select devices are defined in the ResourceClaim and
DeviceClass with in-tree types. Configuration parameters can be embedded there.
Which configuration parameters are valid depends on the DRA driver -- Kubernetes
only passes them through without interpreting them.
To create a DeviceClass, see
[Set Up DRA in a Cluster](/docs/tasks/configure-pod-container/assign-resources/set-up-dra-cluster).
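As a minimal sketch, a DeviceClass that selects all devices published by a hypothetical
`gpu.example.com` driver might look like this (the driver name and class name are assumptions,
not a real driver):

```yaml
apiVersion: resource.k8s.io/v1beta2
kind: DeviceClass
metadata:
  name: example-gpu-class   # hypothetical class name
spec:
  selectors:
  - cel:
      expression: device.driver == "gpu.example.com"   # hypothetical DRA driver name
```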
The `core/v1` `PodSpec` defines ResourceClaims that are needed for a Pod in a
`resourceClaims` field. Entries in that list reference either a ResourceClaim
or a ResourceClaimTemplate. When referencing a ResourceClaim, all Pods using
this PodSpec (for example, inside a Deployment or StatefulSet) share the same
ResourceClaim instance. When referencing a ResourceClaimTemplate, each Pod gets
its own instance.
### ResourceClaims and ResourceClaimTemplates {#resourceclaims-templates}
The `resources.claims` list for container resources defines whether a container gets
access to these resource instances, which makes it possible to share resources
between one or more containers.
A ResourceClaim defines the resources that a workload needs. Every ResourceClaim
has _requests_ that reference a DeviceClass and select devices from that
DeviceClass. ResourceClaims can also use _selectors_ to filter for devices that
meet specific requirements, and can use _constraints_ to limit the devices that
can satisfy a request. ResourceClaims can be created by workload operators or
can be generated by Kubernetes based on a ResourceClaimTemplate. A
ResourceClaimTemplate defines a template that Kubernetes can use to
auto-generate ResourceClaims for Pods.
#### Use cases for ResourceClaims and ResourceClaimTemplates {#when-to-use-rc-rct}
The method that you use depends on your requirements, as follows:
* **ResourceClaim**: you want multiple Pods to share access to specific
devices. You manually manage the lifecycle of ResourceClaims that you create.
* **ResourceClaimTemplate**: you want Pods to have independent access to
separate, similarly-configured devices. Kubernetes generates ResourceClaims
from the specification in the ResourceClaimTemplate. The lifetime of each
generated ResourceClaim is bound to the lifetime of the corresponding Pod.
When you define a workload, you can use
{{< glossary_tooltip term_id="cel" text="Common Expression Language (CEL)" >}}
to filter for specific device attributes or capacity. The available parameters
for filtering depend on the device and the drivers.
If you directly reference a specific ResourceClaim in a Pod, that ResourceClaim
must already exist in the same namespace as the Pod. If the ResourceClaim
doesn't exist in the namespace, the Pod won't schedule. This behavior is similar
to how a PersistentVolumeClaim must exist in the same namespace as a Pod that
references it.
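As a hypothetical sketch of this direct-reference case, the Pod below names a pre-created
ResourceClaim (`shared-gpu-claim`, an assumed name) through the `resourceClaimName` field:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: pod-with-shared-claim
spec:
  resourceClaims:
  - name: gpu                            # local name used by the containers below
    resourceClaimName: shared-gpu-claim  # hypothetical ResourceClaim; must already exist in this namespace
  containers:
  - name: app
    image: registry.k8s.io/pause:3.9
    resources:
      claims:
      - name: gpu                        # grants this container access to the claimed device
```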
You can reference an auto-generated ResourceClaim in a Pod, but this isn't
recommended because auto-generated ResourceClaims are bound to the lifetime of
the Pod that triggered the generation.
To learn how to claim resources using one of these methods, see
[Allocate Devices to Workloads with DRA](/docs/tasks/configure-pod-container/assign-resources/allocate-devices-dra/).
### ResourceSlice {#resourceslice}
Each ResourceSlice represents one or more
{{< glossary_tooltip term_id="device" text="devices" >}} in a pool. The pool is
managed by a device driver, which creates and manages ResourceSlices. The
resources in a pool might be represented by a single ResourceSlice or span
multiple ResourceSlices.
ResourceSlices provide useful information to device users and to the scheduler,
and are crucial for dynamic resource allocation. Every ResourceSlice must include
the following information:
* **Resource pool**: a group of one or more resources that the driver manages.
The pool can span more than one ResourceSlice. Changes to the resources in a
pool must be propagated across all of the ResourceSlices in that pool. The
device driver that manages the pool is responsible for ensuring that this
propagation happens.
* **Devices**: devices in the managed pool. A ResourceSlice can list every
device in a pool or a subset of the devices in a pool. The ResourceSlice
defines device information like attributes, versions, and capacity. Device
users can select devices for allocation by filtering for device information
in ResourceClaims or in DeviceClasses.
* **Nodes**: the nodes that can access the resources. Drivers can choose which
nodes can access the resources, whether that's all of the nodes in the
cluster, a single named node, or nodes that have specific node labels.
Drivers use a {{< glossary_tooltip text="controller" term_id="controller" >}} to
reconcile ResourceSlices in the cluster with the information that the driver has
to publish. This controller overwrites any manual changes, such as cluster users
creating or modifying ResourceSlices.
Consider the following example ResourceSlice:
Here is an example for a fictional resource driver. Two ResourceClaim objects
will get created for this Pod and each container gets access to one of them.
```yaml
apiVersion: resource.k8s.io/v1beta1
kind: ResourceSlice
apiVersion: resource.k8s.io/v1beta2
kind: DeviceClass
metadata:
name: cat-slice
name: resource.example.com
spec:
driver: "resource-driver.example.com"
pool:
generation: 1
name: "black-cat-pool"
resourceSliceCount: 1
# The allNodes field defines whether any node in the cluster can access the device.
allNodes: true
devices:
- name: "large-black-cat"
basic:
attributes:
color:
string: "black"
size:
string: "large"
cat:
boolean: true
selectors:
- cel:
expression: device.driver == "resource-driver.example.com"
---
apiVersion: resource.k8s.io/v1beta2
kind: ResourceClaimTemplate
metadata:
name: large-black-cat-claim-template
spec:
spec:
devices:
requests:
- name: req-0
exactly:
deviceClassName: resource.example.com
selectors:
- cel:
expression: |-
device.attributes["resource-driver.example.com"].color == "black" &&
device.attributes["resource-driver.example.com"].size == "large"
---
apiVersion: v1
kind: Pod
metadata:
name: pod-with-cats
spec:
containers:
- name: container0
image: ubuntu:20.04
command: ["sleep", "9999"]
resources:
claims:
- name: cat-0
- name: container1
image: ubuntu:20.04
command: ["sleep", "9999"]
resources:
claims:
- name: cat-1
resourceClaims:
- name: cat-0
resourceClaimTemplateName: large-black-cat-claim-template
- name: cat-1
resourceClaimTemplateName: large-black-cat-claim-template
```
This ResourceSlice is managed by the `resource-driver.example.com` driver in the
`black-cat-pool` pool. The `allNodes: true` field indicates that any node in the
cluster can access the devices. There's one device in the ResourceSlice, named
`large-black-cat`, with the following attributes:
* `color`: `black`
* `size`: `large`
* `cat`: `true`
## Scheduling
A DeviceClass could select this ResourceSlice by using these attributes, and a
ResourceClaim could filter for specific devices in that DeviceClass.
The scheduler is responsible for allocating resources to a ResourceClaim whenever a pod needs
them. It does so by retrieving the full list of available resources from
ResourceSlice objects, tracking which of those resources have already been
allocated to existing ResourceClaims, and then selecting from those resources
that remain.
## How resource allocation with DRA works {#how-it-works}
The only kind of supported resources at the moment are devices. A device
instance has a name and several attributes and capacities. Devices get selected
through CEL expressions which check those attributes and capacities. In
addition, the set of selected devices also can be restricted to sets which meet
certain constraints.
The following sections describe the workflow for the various
[types of DRA users](#dra-user-types) and for the Kubernetes system during
dynamic resource allocation.
The chosen resource is recorded in the ResourceClaim status together with any
vendor-specific configuration, so when a pod is about to start on a node, the
resource driver on the node has all the information it needs to prepare the
resource.
### Workflow for users {#user-workflow}
By using structured parameters, the scheduler is able to reach a decision
without communicating with any DRA resource drivers. It is also able to
schedule multiple pods quickly by keeping information about ResourceClaim
allocations in memory and writing this information to the ResourceClaim objects
in the background while concurrently binding the pod to a node.
1. **Driver creation**: device owners or third-party entities create drivers
that can create and manage ResourceSlices in the cluster. These drivers
optionally also create DeviceClasses that define a category of devices and
how to request them.
1. **Cluster configuration**: cluster admins create clusters, attach devices to
nodes, and install the DRA device drivers. Cluster admins optionally create
DeviceClasses that define categories of devices and how to request them.
1. **Resource claims**: workload operators create ResourceClaimTemplates or
ResourceClaims that request specific device configurations within a
DeviceClass. In the same step, workload operators modify their Kubernetes
manifests to request those ResourceClaimTemplates or ResourceClaims.
## Monitoring resources
### Workflow for Kubernetes {#kubernetes-workflow}
1. **ResourceSlice creation**: drivers in the cluster create ResourceSlices that
represent one or more devices in a managed pool of similar devices.
1. **Workload creation**: the cluster control plane checks new workloads for
references to ResourceClaimTemplates or to specific ResourceClaims.
* If the workload uses a ResourceClaimTemplate, a controller named the
`resourceclaim-controller` generates ResourceClaims for every Pod in the
workload.
* If the workload uses a specific ResourceClaim, Kubernetes checks whether
that ResourceClaim exists in the cluster. If the ResourceClaim doesn't
exist, the Pods won't deploy.
1. **ResourceSlice filtering**: for every Pod, Kubernetes checks the
ResourceSlices in the cluster to find a device that satisfies all of the
following criteria:
* The nodes that can access the resources are eligible to run the Pod.
* The ResourceSlice has unallocated resources that match the requirements of
the Pod's ResourceClaim.
1. **Resource allocation**: after finding an eligible ResourceSlice for a
Pod's ResourceClaim, the Kubernetes scheduler updates the ResourceClaim
with the allocation details.
1. **Pod scheduling**: when resource allocation is complete, the scheduler
places the Pod on a node that can access the allocated resource. The device
driver and the kubelet on that node configure the device and the Pod's access
to the device.
## Observability of dynamic resources {#observability-dynamic-resources}
You can check the status of dynamically allocated resources by using any of the
following methods:
* [kubelet device metrics](#monitoring-resources)
* [ResourceClaim status](#resourceclaim-device-status)
### kubelet device metrics {#monitoring-resources}
The `PodResourcesLister` kubelet gRPC service lets you monitor in-use devices.
The `DynamicResource` message provides information that's specific to dynamic
resource allocation, such as the device name and the claim name. For details,
see
[Monitoring device plugin resources](/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/#monitoring-device-plugin-resources).
### ResourceClaim device status {#resourceclaim-device-status}
{{< feature-state feature_gate_name="DRAResourceClaimDeviceStatus" >}}
DRA drivers can report driver-specific
[device status](/docs/concepts/overview/working-with-objects/#object-spec-and-status)
data for each allocated device in the `status.devices` field of a ResourceClaim.
For example, the driver might list the IP addresses that are assigned to a
network interface device.
The accuracy of the information that a driver adds to a ResourceClaim
`status.devices` field depends on the driver. Evaluate drivers to decide whether
you can rely on this field as the only source of device information.
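For example, assuming a ResourceClaim named `example-claim` exists in the current namespace, you
could inspect its reported device status like this:

```shell
# "example-claim" is a hypothetical name; replace it with one of your ResourceClaims.
kubectl get resourceclaim example-claim -o jsonpath='{.status.devices}'
```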
If you disable the `DRAResourceClaimDeviceStatus`
[feature gate](/docs/reference/command-line-tools-reference/feature-gates/), the
`status.devices` field automatically gets cleared when storing the ResourceClaim.
ResourceClaim device status is supported when the DRA driver is able to update an existing
ResourceClaim in which the `status.devices` field is set.
For details about the `status.devices` field, see the
{{< api-reference page="workload-resources/resource-claim-v1beta1" anchor="ResourceClaimStatus" text="ResourceClaim" >}} API reference.
The kubelet provides a gRPC service to enable discovery of dynamic resources of
running Pods. For more information on the gRPC endpoints, see the
[resource allocation reporting](/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/#monitoring-device-plugin-resources).
## Pre-scheduled Pods
@ -379,17 +225,7 @@ spec:
You may also be able to mutate the incoming Pod, at admission time, to unset
the `.spec.nodeName` field and to use a node selector instead.
## DRA alpha features {#alpha-features}
The following sections describe DRA features that are available in the Alpha
[feature stage](/docs/reference/command-line-tools-reference/feature-gates/#feature-stages).
To use any of these features, you must also set up DRA in your clusters by
enabling the DynamicResourceAllocation feature gate and the DRA
{{< glossary_tooltip text="API groups" term_id="api-group" >}}. For more
information, see
[Set up DRA in the cluster](/docs/tasks/configure-pod-container/assign-resources/set-up-dra-cluster/).
### Admin access {#admin-access}
## Admin access
{{< feature-state feature_gate_name="DRAAdminAccess" >}}
@ -422,9 +258,26 @@ multi-tenant clusters. Starting with Kubernetes v1.33, only users authorized to
create ResourceClaim or ResourceClaimTemplate objects in namespaces labeled with
`resource.k8s.io/admin-access: "true"` (case-sensitive) can use the
`adminAccess` field. This ensures that non-admin users cannot misuse the
feature.
feature.
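For example, a cluster admin could mark a hypothetical `dra-admin` namespace as allowed to use the
`adminAccess` field like this:

```shell
# "dra-admin" is a hypothetical namespace reserved for cluster administrators.
kubectl label namespace dra-admin resource.k8s.io/admin-access="true"
```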
### Prioritized list {#prioritized-list}
## ResourceClaim Device Status
{{< feature-state feature_gate_name="DRAResourceClaimDeviceStatus" >}}
The drivers can report driver-specific device status data for each allocated device
in a resource claim. For example, IPs assigned to a network interface device can be
reported in the ResourceClaim status.
Because the drivers set the status, the accuracy of the information depends on the implementation
of those DRA drivers. Therefore, the reported status of the device may not always reflect
real-time changes to the state of the device.
When the feature is disabled, that field automatically gets cleared when storing the ResourceClaim.
A ResourceClaim device status is supported when it is possible, from a DRA driver, to update an
existing ResourceClaim where the `status.devices` field is set.
## Prioritized List
{{< feature-state feature_gate_name="DRAPrioritizedList" >}}
@ -468,11 +321,7 @@ spec:
count: 2
```
Prioritized lists is an *alpha feature* and only enabled when the
`DRAPrioritizedList` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
is enabled in the kube-apiserver and kube-scheduler.
### Partitionable devices {#partitionable-devices}
## Partitionable Devices
{{< feature-state feature_gate_name="DRAPartitionableDevices" >}}
@ -525,12 +374,7 @@ spec:
value: 6Gi
```
Partitionable devices is an *alpha feature* and only enabled when the
`DRAPartitionableDevices`
[feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
is enabled in the kube-apiserver and kube-scheduler.
### Device taints and tolerations {#device-taints-and-tolerations}
## Device taints and tolerations
{{< feature-state feature_gate_name="DRADeviceTaints" >}}
@ -564,22 +408,15 @@ Allocating a device with admin access (described [above](#admin-access))
is not exempt either. An admin using that mode must explicitly tolerate all taints
to access tainted devices.
Device taints and tolerations is an *alpha feature* and only enabled when the
`DRADeviceTaints` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
is enabled in the kube-apiserver, kube-controller-manager and kube-scheduler.
To use DeviceTaintRules, the `resource.k8s.io/v1alpha3` API version must be
enabled.
Taints can be added to devices in two different ways:
You can add taints to devices in the following ways, by using the
DeviceTaintRule API kind.
#### Taints set by the driver
### Taints set by the driver
A DRA driver can add taints to the device information that it publishes in ResourceSlices.
Consult the documentation of a DRA driver to learn whether the driver uses taints and what
their keys and values are.
#### Taints set by an admin
### Taints set by an admin
An admin or a control plane component can taint devices without having to tell
the DRA driver to include taints in its device information in ResourceSlices. They do that by
@ -626,10 +463,84 @@ spec:
effect: NoExecute
```
## Enabling dynamic resource allocation
Dynamic resource allocation is a *beta feature* which is off by default and only enabled when the
`DynamicResourceAllocation` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
and the `resource.k8s.io/v1beta1` and `resource.k8s.io/v1beta2` {{< glossary_tooltip text="API groups" term_id="api-group" >}}
are enabled. For details on that, see the `--feature-gates` and `--runtime-config`
[kube-apiserver parameters](/docs/reference/command-line-tools-reference/kube-apiserver/).
kube-scheduler, kube-controller-manager and kubelet also need the feature gate.
If a resource driver reports the status of the devices, the
`DRAResourceClaimDeviceStatus` feature gate has to be enabled in addition to
`DynamicResourceAllocation`.
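As a rough sketch (the exact mechanism depends on how your control plane is deployed), enabling the
feature could amount to passing flags like the following to the respective components:

```shell
# Sketch of the relevant flags only; adapt to however you run your control plane
# (static Pods, systemd units, a managed service, and so on).
kube-apiserver \
  --feature-gates=DynamicResourceAllocation=true \
  --runtime-config=resource.k8s.io/v1beta1=true,resource.k8s.io/v1beta2=true

kube-scheduler --feature-gates=DynamicResourceAllocation=true
kube-controller-manager --feature-gates=DynamicResourceAllocation=true
kubelet --feature-gates=DynamicResourceAllocation=true
```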
A quick check whether a Kubernetes cluster supports the feature is to list
DeviceClass objects with:
```shell
kubectl get deviceclasses
```
If your cluster supports dynamic resource allocation, the response is either a
list of DeviceClass objects or:
```
No resources found
```
If not supported, this error is printed instead:
```
error: the server doesn't have a resource type "deviceclasses"
```
The default configuration of kube-scheduler enables the "DynamicResources"
plugin if and only if the feature gate is enabled and when using
the v1 configuration API. Custom configurations may have to be modified to
include it.
In addition to enabling the feature in the cluster, a resource driver also has to
be installed. Please refer to the driver's documentation for details.
### Enabling admin access
[Admin access](#admin-access) is an *alpha feature* and only enabled when the
`DRAAdminAccess` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
is enabled in the kube-apiserver and kube-scheduler.
### Enabling Device Status
[ResourceClaim Device Status](#resourceclaim-device-status) is an *alpha feature*
and only enabled when the `DRAResourceClaimDeviceStatus`
[feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
is enabled in the kube-apiserver.
### Enabling Prioritized List
[Prioritized List](#prioritized-list) is an *alpha feature* and only enabled when the
`DRAPrioritizedList` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
is enabled in the kube-apiserver and kube-scheduler. It also requires that the
`DynamicResourceAllocation` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
is enabled.
### Enabling Partitionable Devices
[Partitionable Devices](#partitionable-devices) is an *alpha feature*
and only enabled when the `DRAPartitionableDevices`
[feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
is enabled in the kube-apiserver and kube-scheduler.
### Enabling device taints and tolerations
[Device taints and tolerations](#device-taints-and-tolerations) is an *alpha feature* and only enabled when the
`DRADeviceTaints` [feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
is enabled in the kube-apiserver, kube-controller-manager and kube-scheduler. To use DeviceTaintRules, the
`resource.k8s.io/v1alpha3` API version must be enabled.
## {{% heading "whatsnext" %}}
- [Set Up DRA in a Cluster](/docs/tasks/configure-pod-container/assign-resources/set-up-dra-cluster/)
- [Allocate devices to workloads using DRA](/docs/tasks/configure-pod-container/assign-resources/allocate-devices-dra/)
- For more information on the design, see the
[Dynamic Resource Allocation with Structured Parameters](https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/4381-dra-structured-parameters)
KEP.
KEP.

View File

@ -6,9 +6,9 @@ weight: 100
{{<glossary_definition term_id="node-pressure-eviction" length="short">}}</br>
{{< feature-state feature_gate_name="KubeletSeparateDiskGC" >}}
{{<note>}}
{{< feature-state feature_gate_name="KubeletSeparateDiskGC" >}}
The _split image filesystem_ feature, which enables support for the `containerfs`
filesystem, adds several new eviction signals, thresholds and metrics. To use
`containerfs`, the Kubernetes release v{{< skew currentVersion >}} requires the

View File

@ -1,96 +0,0 @@
---
title: "Hardening Guide - Scheduler Configuration"
description: >
Information about how to make the Kubernetes scheduler more secure.
content_type: concept
weight: 90
---
<!-- overview -->
The Kubernetes {{< glossary_tooltip text="scheduler" term_id="kube-scheduler" >}} is
one of the critical components of the
{{< glossary_tooltip text="control plane" term_id="control-plane" >}}.
This document covers how to improve the security posture of the Scheduler.
A misconfigured scheduler can have security implications.
Such a scheduler can target specific nodes and evict the workloads or applications that are sharing the node and its resources.
This can aid an attacker with a [Yo-Yo attack](https://arxiv.org/abs/2105.00542): an attack on a vulnerable autoscaler.
<!-- body -->
## kube-scheduler configuration
### Scheduler authentication & authorization command line options
When setting up authentication, make sure that the kube-scheduler's authentication remains consistent with the kube-apiserver's authentication.
If any request has missing authentication headers,
the [authentication should happen through the kube-apiserver, keeping all authentication consistent in the cluster](/docs/tasks/extend-kubernetes/configure-aggregation-layer/#original-request-username-and-group).
- `authentication-kubeconfig`: Make sure to provide a proper kubeconfig so that the scheduler can retrieve authentication configuration options from the API Server. This kubeconfig file should be protected with strict file permissions.
- `authentication-tolerate-lookup-failure`: Set this to `false` to make sure the scheduler _always_ looks up its authentication configuration from the API server.
- `authentication-skip-lookup`: Set this to `false` to make sure the scheduler _always_ looks up its authentication configuration from the API server.
- `authorization-always-allow-paths`: These paths should respond with data that is appropriate for anonymous authorization. Defaults to `/healthz,/readyz,/livez`.
- `profiling`: Set to `false` to disable the profiling endpoints, which provide debugging information but should not be enabled on production clusters because they present a risk of denial of service or information leakage. The `--profiling` argument is deprecated and can now be provided through the [KubeScheduler DebuggingConfiguration](https://kubernetes.io/docs/reference/config-api/kube-scheduler-config.v1/#DebuggingConfiguration). Profiling can be disabled through the kube-scheduler config by setting `enableProfiling` to `false`.
- `requestheader-client-ca-file`: Avoid passing this argument.
### Scheduler networking command line options
- `bind-address`: In most cases, the kube-scheduler does not need to be externally accessible. Setting the bind address to `localhost` is a secure practice.
- `permit-address-sharing`: Set this to `false` to disable connection sharing through `SO_REUSEADDR`. `SO_REUSEADDR` can lead to reuse of terminated connections that are in `TIME_WAIT` state.
- `permit-port-sharing`: Default `false`. Use the default unless you are confident you understand the security implications.
### Scheduler TLS command line options
- `tls-cipher-suites`: Always provide a list of preferred cipher suites. This ensures encryption never happens with insecure cipher suites.
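Putting the recommendations above together, a hardened kube-scheduler invocation might include
flags like the following; the kubeconfig path shown is the conventional kubeadm location and is an
assumption for your cluster:

```shell
# Sketch of the security-relevant flags only; combine with the rest of your kube-scheduler configuration.
kube-scheduler \
  --authentication-kubeconfig=/etc/kubernetes/scheduler.conf \
  --authorization-kubeconfig=/etc/kubernetes/scheduler.conf \
  --authentication-tolerate-lookup-failure=false \
  --bind-address=127.0.0.1 \
  --permit-address-sharing=false \
  --permit-port-sharing=false \
  --tls-cipher-suites=TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
```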
## Scheduling configurations for custom schedulers
When using custom schedulers based on the Kubernetes scheduling code, cluster administrators need to be careful with
plugins that use the `queueSort`, `prefilter`, `filter`, or `permit` [extension points](/docs/reference/scheduling/config/#extension-points).
These extension points control various stages of a scheduling process, and the wrong configuration can impact the kube-scheduler's behavior in your cluster.
### Key considerations
- Exactly one plugin that uses the `queueSort` extension point can be enabled at a time. Any plugins that use `queueSort` should be scrutinized.
- Plugins that implement the `prefilter` or `filter` extension point can potentially mark all nodes as unschedulable. This can bring scheduling of new pods to a halt.
- Plugins that implement the `permit` extension point can prevent or delay the binding of a Pod. Such plugins should be thoroughly reviewed by the cluster administrator.
When using a plugin that is not one of the [default plugins](/docs/reference/scheduling/config/#scheduling-plugins), consider disabling the `queueSort`, `filter` and `permit` extension points as follows:
```yaml
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
profiles:
- schedulerName: my-scheduler
plugins:
# Disable specific plugins for different extension points
# You can disable all plugins for an extension point using "*"
queueSort:
disabled:
- name: "*" # Disable all queueSort plugins
# - name: "PrioritySort" # Disable specific queueSort plugin
filter:
disabled:
- name: "*" # Disable all filter plugins
# - name: "NodeResourcesFit" # Disable specific filter plugin
permit:
disabled:
- name: "*" # Disables all permit plugins
# - name: "TaintToleration" # Disable specific permit plugin
```
This creates a scheduler profile named `my-scheduler`, matching the `schedulerName` in the configuration above.
Whenever the `.spec` of a Pod does not have a value for `.spec.schedulerName`, the kube-scheduler runs for that Pod,
using its main configuration, and default plugins.
If you define a Pod with `.spec.schedulerName` set to `my-scheduler`, the kube-scheduler runs with this custom configuration; in that custom configuration,
the `queueSort`, `filter` and `permit` extension points are disabled.
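For illustration, a Pod that explicitly selects this profile might look like the following (the
container image is just a placeholder):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: scheduled-by-custom-profile
spec:
  schedulerName: my-scheduler   # matches the profile defined in the configuration above
  containers:
  - name: app
    image: registry.k8s.io/pause:3.9
```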
If you use this KubeSchedulerConfiguration, don't run any custom scheduler,
and then define a Pod with `.spec.schedulerName` set to `nonexistent-scheduler`
(or any other scheduler name that doesn't exist in your cluster), the Pod remains unscheduled and no events are generated for it.
## Disallow labeling nodes
A cluster administrator should ensure that cluster users cannot label the nodes.
A malicious actor can use `nodeSelector` to schedule workloads on nodes where those workloads should not be present.

View File

@ -1,29 +0,0 @@
---
reviewers:
- lmktfy
title: Security For Linux Nodes
content_type: concept
weight: 40
---
<!-- overview -->
This page describes security considerations and best practices specific to the Linux operating system.
<!-- body -->
## Protection for Secret data on nodes
On Linux nodes, memory-backed volumes (such as [`secret`](/docs/concepts/configuration/secret/)
volume mounts, or [`emptyDir`](/docs/concepts/storage/volumes/#emptydir) with `medium: Memory`)
are implemented with a `tmpfs` filesystem.
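For example, the following Pod sketch mounts a memory-backed `emptyDir` volume; the `sizeLimit`
shown is an arbitrary value:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: memory-backed-volume-example
spec:
  containers:
  - name: app
    image: registry.k8s.io/pause:3.9
    volumeMounts:
    - name: scratch
      mountPath: /scratch
  volumes:
  - name: scratch
    emptyDir:
      medium: Memory   # backed by tmpfs on Linux nodes
      sizeLimit: 64Mi  # arbitrary example limit
```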
If you have swap configured and use an older Linux kernel (or a current kernel and an unsupported configuration of Kubernetes),
**memory** backed volumes can have data written to persistent storage.
The Linux kernel officially supports the `noswap` option from version 6.3.
If swap is enabled on the node, it is therefore recommended to use kernel version 6.3 or later,
or a kernel that supports the `noswap` option via a backport.
Read [swap memory management](/docs/concepts/cluster-administration/swap-memory-management/#memory-backed-volumes)
for more info.

View File

@ -262,6 +262,8 @@ to ensure that a PersistentVolume cannot be reused across different namespaces.
### Sandboxing containers
{{% thirdparty-content %}}
Kubernetes pods are composed of one or more containers that execute on worker nodes.
Containers utilize OS-level virtualization and hence offer a weaker isolation boundary than
virtual machines that utilize hardware-based virtualization.
@ -284,7 +286,14 @@ workloads running in a shared cluster. Running workloads in a sandbox environmen
insulate the host from container escapes, where an attacker exploits a vulnerability to gain
access to the host system and all the processes/files running on that host.
Virtual machines and userspace kernels are two popular approaches to sandboxing.
Virtual machines and userspace kernels are two popular approaches to sandboxing. The following
sandboxing implementations are available:
* [gVisor](https://gvisor.dev/) intercepts syscalls from containers and runs them through a
userspace kernel, written in Go, with limited access to the underlying host.
* [Kata Containers](https://katacontainers.io/) provide a secure container runtime that allows you to run
containers in a VM. The hardware virtualization available in Kata offers an added layer of
security for containers running untrusted code.
### Node Isolation
@ -311,7 +320,8 @@ corresponding toleration can run on them. A mutating webhook could then be used
add tolerations and node affinities to pods deployed into tenant namespaces so that they run on a
specific set of nodes designated for that tenant.
Node isolation can be implemented using [pod node selectors](/docs/concepts/scheduling-eviction/assign-pod-node/).
Node isolation can be implemented using an [pod node selectors](/docs/concepts/scheduling-eviction/assign-pod-node/)
or a [Virtual Kubelet](https://github.com/virtual-kubelet).
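As a hypothetical sketch, the Pod spec fields that such a webhook might inject for a tenant could
look like this (the `tenant` label key, taint, and namespace are assumptions):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: tenant-a-workload
  namespace: tenant-a            # hypothetical per-tenant namespace
spec:
  nodeSelector:
    tenant: tenant-a             # hypothetical label on the nodes dedicated to this tenant
  tolerations:
  - key: "tenant"                # hypothetical taint applied to those nodes
    operator: "Equal"
    value: "tenant-a"
    effect: "NoSchedule"
  containers:
  - name: app
    image: registry.k8s.io/pause:3.9
```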
## Additional Considerations
@ -401,6 +411,8 @@ Specifically, the Operator should:
## Implementations
{{% thirdparty-content %}}
There are two primary ways to share a Kubernetes cluster for multi-tenancy: using Namespaces
(that is, a Namespace per tenant) or by virtualizing the control plane (that is, virtual control
plane per tenant).
@ -444,6 +456,27 @@ resources between them. This could include managing namespace labels, namespace
delegated access, and shared resource quotas across related namespaces. These capabilities can
be useful in both multi-team and multi-customer scenarios.
Some third-party projects that provide capabilities like this and aid in managing namespaced resources are
listed below.
{{% thirdparty-content %}}
#### Multi-team tenancy
* [Capsule](https://github.com/clastix/capsule)
* [Multi Tenant Operator](https://docs.stakater.com/mto/)
#### Multi-customer tenancy
* [Kubeplus](https://github.com/cloud-ark/kubeplus)
#### Policy engines
Policy engines provide features to validate and generate tenant configurations:
* [Kyverno](https://kyverno.io/)
* [OPA/Gatekeeper](https://github.com/open-policy-agent/gatekeeper)
### Virtual control plane per tenant
Another form of control-plane isolation is to use Kubernetes extensions to provide each tenant a
@ -475,3 +508,11 @@ The improved isolation comes at the cost of running and maintaining an individu
plane per tenant. In addition, per-tenant control planes do not solve isolation problems in the
data plane, such as node-level noisy neighbors or security threats. These must still be addressed
separately.
The Kubernetes [Cluster API - Nested (CAPN)](https://github.com/kubernetes-sigs/cluster-api-provider-nested/tree/main/virtualcluster)
project provides an implementation of virtual control planes.
#### Other implementations
* [Kamaji](https://github.com/clastix/kamaji)
* [vcluster](https://github.com/loft-sh/vcluster)

View File

@ -87,11 +87,6 @@ the data.
For a list of supported providers, refer to
[Providers for the Secret Store CSI Driver](https://secrets-store-csi-driver.sigs.k8s.io/concepts.html#provider-for-the-secrets-store-csi-driver).
## Good practices for using swap memory
For best practices for setting swap memory for Linux nodes, please refer to
[swap memory management](/docs/concepts/cluster-administration/swap-memory-management/#good-practice-for-using-swap-in-a-kubernetes-cluster).
## Developers
This section provides good practices for developers to use to improve the

View File

@ -975,7 +975,7 @@ spec:
## Resources
The storage medium (such as Disk or SSD) of an `emptyDir` volume is determined by the
The storage media (such as Disk or SSD) of an `emptyDir` volume is determined by the
medium of the filesystem holding the kubelet root dir (typically
`/var/lib/kubelet`). There is no limit on how much space an `emptyDir` or
`hostPath` volume can consume, and no isolation between containers or

View File

@ -324,10 +324,10 @@ kernel patch.
### Mirantis Container Runtime {#mcr}
[Mirantis Container Runtime](https://docs.mirantis.com/mcr/25.0/overview.html) (MCR)
[Mirantis Container Runtime](https://docs.mirantis.com/mcr/20.10/overview.html) (MCR)
is available as a container runtime for all Windows Server 2019 and later versions.
See [Install MCR on Windows Servers](https://docs.mirantis.com/mcr/25.0/install/mcr-windows.html) for more information.
See [Install MCR on Windows Servers](https://docs.mirantis.com/mcr/20.10/install/mcr-windows.html) for more information.
## Windows OS version compatibility {#windows-os-version-support}

View File

@ -1183,7 +1183,7 @@ replacement will be created immediately (even if the old Pod is still in a Termi
#### Rolling Update Deployment
The Deployment updates Pods in a rolling update
fashion (gradually scale down the old ReplicaSets and scale up the new one) when `.spec.strategy.type==RollingUpdate`. You can specify `maxUnavailable` and `maxSurge` to control
fashion when `.spec.strategy.type==RollingUpdate`. You can specify `maxUnavailable` and `maxSurge` to control
the rolling update process.
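For example, a Deployment sketch that sets both fields explicitly might look like this (names and
image are placeholders):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: rolling-update-example
spec:
  replicas: 3
  selector:
    matchLabels:
      app: web
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%   # at most 25% of desired Pods may be unavailable during the update
      maxSurge: 25%         # at most 25% extra Pods may be created above the desired count
  template:
    metadata:
      labels:
        app: web
    spec:
      containers:
      - name: web
        image: nginx:1.25
```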
##### Max Unavailable
@ -1202,7 +1202,7 @@ at all times during the update is at least 70% of the desired Pods.
`.spec.strategy.rollingUpdate.maxSurge` is an optional field that specifies the maximum number of Pods
that can be created over the desired number of Pods. The value can be an absolute number (for example, 5) or a
percentage of desired Pods (for example, 10%). The value cannot be 0 if `maxUnavailable` is 0. The absolute number
percentage of desired Pods (for example, 10%). The value cannot be 0 if `MaxUnavailable` is 0. The absolute number
is calculated from the percentage by rounding up. The default value is 25%.
For example, when this value is set to 30%, the new ReplicaSet can be scaled up immediately when the
@ -1331,7 +1331,7 @@ a Pod is considered ready, see [Container Probes](/docs/concepts/workloads/pods/
{{< feature-state feature_gate_name="DeploymentReplicaSetTerminatingReplicas" >}}
You can enable this feature by setting the `DeploymentReplicaSetTerminatingReplicas`
You can enable this feature it by setting the `DeploymentReplicaSetTerminatingReplicas`
[feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
on the [API server](/docs/reference/command-line-tools-reference/kube-apiserver/)
and on the [kube-controller-manager](/docs/reference/command-line-tools-reference/kube-controller-manager/)

View File

@ -1,7 +1,7 @@
---
reviewers:
- alculquicondor
- erictune
- mimowo
- soltysh
title: Jobs
api_metadata:

View File

@ -1,10 +1,4 @@
---
# NOTE TO LOCALIZATION TEAMS
#
# If updating front matter for your localization because there is still
# a "feature" key in this page, then you also need to update
# content/??/docs/concepts/architecture/self-healing.md (which is where
# it moved to)
reviewers:
- Kashomon
- bprashanth
@ -13,6 +7,13 @@ title: ReplicaSet
api_metadata:
- apiVersion: "apps/v1"
kind: "ReplicaSet"
feature:
title: Self-healing
anchor: How a ReplicaSet works
description: >
Restarts containers that fail, replaces and reschedules containers when nodes die,
kills containers that don't respond to your user-defined health check,
and doesn't advertise them to clients until they are ready to serve.
content_type: concept
description: >-
A ReplicaSet's purpose is to maintain a stable set of replica Pods running at any given time.
@ -323,7 +324,7 @@ ReplicaSets do not support a rolling update directly.
{{< feature-state feature_gate_name="DeploymentReplicaSetTerminatingReplicas" >}}
You can enable this feature by setting the `DeploymentReplicaSetTerminatingReplicas`
You can enable this feature it by setting the `DeploymentReplicaSetTerminatingReplicas`
[feature gate](/docs/reference/command-line-tools-reference/feature-gates/)
on the [API server](/docs/reference/command-line-tools-reference/kube-apiserver/)
and on the [kube-controller-manager](/docs/reference/command-line-tools-reference/kube-controller-manager/)

Some files were not shown because too many files have changed in this diff