From 18958f82567f74c7573b40647a3df059f831bc53 Mon Sep 17 00:00:00 2001 From: Garrett Rodrigues Date: Wed, 6 Sep 2017 15:00:19 -0700 Subject: [PATCH] addressing Brians comments --- ...emplate.md => Design_Proposal_TEMPLATE.md} | 0 .../{auth => api-machinery}/bulk_watch.md | 0 .../client-package-structure.md | 0 .../{apps => api-machinery}/controller-ref.md | 0 .../{ => api-machinery}/metadata-policy.md | 0 .../{ => api-machinery}/protobuf.md | 0 .../OBSOLETE_templates.md} | 2 + .../selector-generation.md | 0 .../architecture/architecture.dia | Bin 6523 -> 0 bytes .../{ => architecture}/identifiers.md | 0 .../{ => architecture}/namespaces.md | 0 .../principles.md | 0 .../{ => auth}/image-provenance.md | 0 .../design-proposals/{ => auth}/secrets.md | 0 .../{ => autoscaling}/initial-resources.md | 0 .../{ => cluster-lifecycle}/ha_master.md | 0 .../high-availability.md | 0 .../kubelet-tls-bootstrap.md | 0 .../local-cluster-ux.md | 0 .../runtimeconfig.md | 0 .../containerized-mounter.md~ | 43 --- contributors/design-proposals/dir_struct.txt | 332 +++++++++--------- .../control-plane-resilience.md | 0 .../core-metrics-pipeline.md | 0 .../{ => network}/selinux-enhancements.md | 0 .../{ => network}/service-discovery.md | 0 .../{storage => node}/all-in-one-volume.md | 0 .../annotations-downward-api.md | 0 .../{api-machinery => node}/configmap.md | 0 .../{api-machinery => node}/container-init.md | 0 .../{ => node}/cpu-manager.md | 0 .../{ => node}/cri-dockershim-checkpoint.md | 0 .../downward_api_resources_limits_requests.md | 0 .../envvar-configmap.md | 0 .../{sig-cli => node}/expansion.md | 0 .../optional-configmap.md | 0 .../design-proposals/{ => node}/pod-cache.png | Bin .../pod-lifecycle-event-generator.md | 0 .../{ => node}/pod-pid-namespace.md | 0 .../{storage => node}/propagation.md | 0 .../design-proposals/{ => node}/seccomp.md | 0 ...secret-configmap-downwardapi-file-mode.md} | 0 .../design-proposals/{ => node}/selinux.md | 0 .../design-proposals/{ => 
node}/sysctl.md | 0 .../{ => release}/versioning.md | 0 .../admission_control_limit_range.md | 0 .../admission_control_resource_quota.md | 0 .../resource-management/hugepages.md | 308 ++++++++++++++++ .../resource-quota-scoping.md | 0 .../{ => scheduling}/images/.gitignore | 0 .../{ => scheduling}/images/OWNERS | 0 .../{ => scheduling}/images/preemption_1.png | Bin .../{ => scheduling}/images/preemption_2.png | Bin .../{ => scheduling}/images/preemption_3.png | Bin .../{ => scheduling}/images/preemption_4.png | Bin .../{ => scheduling}/resources.md | 0 .../pod-preset.md | 0 .../{gcp => storage}/containerized-mounter.md | 0 .../{ => storage}/grow-volume-size.md | 0 .../{api-machinery => storage}/pod-safety.md | 0 .../{ => testing}/flakiness-sla.md | 0 61 files changed, 478 insertions(+), 207 deletions(-) rename contributors/design-proposals/{sig-cli/template.md => Design_Proposal_TEMPLATE.md} (100%) rename contributors/design-proposals/{auth => api-machinery}/bulk_watch.md (100%) rename contributors/design-proposals/{ => api-machinery}/client-package-structure.md (100%) rename contributors/design-proposals/{apps => api-machinery}/controller-ref.md (100%) rename contributors/design-proposals/{ => api-machinery}/metadata-policy.md (100%) rename contributors/design-proposals/{ => api-machinery}/protobuf.md (100%) rename contributors/design-proposals/{templates.md => apps/OBSOLETE_templates.md} (99%) rename contributors/design-proposals/{api-machinery => apps}/selector-generation.md (100%) delete mode 100644 contributors/design-proposals/architecture/architecture.dia rename contributors/design-proposals/{ => architecture}/identifiers.md (100%) rename contributors/design-proposals/{ => architecture}/namespaces.md (100%) rename contributors/design-proposals/{api-machinery => architecture}/principles.md (100%) rename contributors/design-proposals/{ => auth}/image-provenance.md (100%) rename contributors/design-proposals/{ => auth}/secrets.md (100%) rename 
contributors/design-proposals/{ => autoscaling}/initial-resources.md (100%) rename contributors/design-proposals/{ => cluster-lifecycle}/ha_master.md (100%) rename contributors/design-proposals/{ => cluster-lifecycle}/high-availability.md (100%) rename contributors/design-proposals/{node => cluster-lifecycle}/kubelet-tls-bootstrap.md (100%) rename contributors/design-proposals/{ => cluster-lifecycle}/local-cluster-ux.md (100%) rename contributors/design-proposals/{node => cluster-lifecycle}/runtimeconfig.md (100%) delete mode 100644 contributors/design-proposals/containerized-mounter.md~ rename contributors/design-proposals/{ => federation}/control-plane-resilience.md (100%) rename contributors/design-proposals/{ => instrumentation}/core-metrics-pipeline.md (100%) rename contributors/design-proposals/{ => network}/selinux-enhancements.md (100%) rename contributors/design-proposals/{ => network}/service-discovery.md (100%) rename contributors/design-proposals/{storage => node}/all-in-one-volume.md (100%) rename contributors/design-proposals/{apps => node}/annotations-downward-api.md (100%) rename contributors/design-proposals/{api-machinery => node}/configmap.md (100%) rename contributors/design-proposals/{api-machinery => node}/container-init.md (100%) rename contributors/design-proposals/{ => node}/cpu-manager.md (100%) rename contributors/design-proposals/{ => node}/cri-dockershim-checkpoint.md (100%) rename contributors/design-proposals/{ => node}/downward_api_resources_limits_requests.md (100%) rename contributors/design-proposals/{api-machinery => node}/envvar-configmap.md (100%) rename contributors/design-proposals/{sig-cli => node}/expansion.md (100%) rename contributors/design-proposals/{api-machinery => node}/optional-configmap.md (100%) rename contributors/design-proposals/{ => node}/pod-cache.png (100%) rename contributors/design-proposals/{ => node}/pod-lifecycle-event-generator.md (100%) rename contributors/design-proposals/{ => 
node}/pod-pid-namespace.md (100%) rename contributors/design-proposals/{storage => node}/propagation.md (100%) rename contributors/design-proposals/{ => node}/seccomp.md (100%) rename contributors/design-proposals/{secret-configmap-downwarapi-file-mode.md => node/secret-configmap-downwardapi-file-mode.md} (100%) rename contributors/design-proposals/{ => node}/selinux.md (100%) rename contributors/design-proposals/{ => node}/sysctl.md (100%) rename contributors/design-proposals/{ => release}/versioning.md (100%) rename contributors/design-proposals/{api-machinery => resource-management}/admission_control_limit_range.md (100%) rename contributors/design-proposals/{api-machinery => resource-management}/admission_control_resource_quota.md (100%) create mode 100644 contributors/design-proposals/resource-management/hugepages.md rename contributors/design-proposals/{api-machinery => resource-management}/resource-quota-scoping.md (100%) rename contributors/design-proposals/{ => scheduling}/images/.gitignore (100%) rename contributors/design-proposals/{ => scheduling}/images/OWNERS (100%) rename contributors/design-proposals/{ => scheduling}/images/preemption_1.png (100%) rename contributors/design-proposals/{ => scheduling}/images/preemption_2.png (100%) rename contributors/design-proposals/{ => scheduling}/images/preemption_3.png (100%) rename contributors/design-proposals/{ => scheduling}/images/preemption_4.png (100%) rename contributors/design-proposals/{ => scheduling}/resources.md (100%) rename contributors/design-proposals/{api-machinery => service-catalog}/pod-preset.md (100%) rename contributors/design-proposals/{gcp => storage}/containerized-mounter.md (100%) rename contributors/design-proposals/{ => storage}/grow-volume-size.md (100%) rename contributors/design-proposals/{api-machinery => storage}/pod-safety.md (100%) rename contributors/design-proposals/{ => testing}/flakiness-sla.md (100%) diff --git a/contributors/design-proposals/sig-cli/template.md 
b/contributors/design-proposals/Design_Proposal_TEMPLATE.md similarity index 100% rename from contributors/design-proposals/sig-cli/template.md rename to contributors/design-proposals/Design_Proposal_TEMPLATE.md diff --git a/contributors/design-proposals/auth/bulk_watch.md b/contributors/design-proposals/api-machinery/bulk_watch.md similarity index 100% rename from contributors/design-proposals/auth/bulk_watch.md rename to contributors/design-proposals/api-machinery/bulk_watch.md diff --git a/contributors/design-proposals/client-package-structure.md b/contributors/design-proposals/api-machinery/client-package-structure.md similarity index 100% rename from contributors/design-proposals/client-package-structure.md rename to contributors/design-proposals/api-machinery/client-package-structure.md diff --git a/contributors/design-proposals/apps/controller-ref.md b/contributors/design-proposals/api-machinery/controller-ref.md similarity index 100% rename from contributors/design-proposals/apps/controller-ref.md rename to contributors/design-proposals/api-machinery/controller-ref.md diff --git a/contributors/design-proposals/metadata-policy.md b/contributors/design-proposals/api-machinery/metadata-policy.md similarity index 100% rename from contributors/design-proposals/metadata-policy.md rename to contributors/design-proposals/api-machinery/metadata-policy.md diff --git a/contributors/design-proposals/protobuf.md b/contributors/design-proposals/api-machinery/protobuf.md similarity index 100% rename from contributors/design-proposals/protobuf.md rename to contributors/design-proposals/api-machinery/protobuf.md diff --git a/contributors/design-proposals/templates.md b/contributors/design-proposals/apps/OBSOLETE_templates.md similarity index 99% rename from contributors/design-proposals/templates.md rename to contributors/design-proposals/apps/OBSOLETE_templates.md index 507129323..010b31a84 100644 --- a/contributors/design-proposals/templates.md +++ 
b/contributors/design-proposals/apps/OBSOLETE_templates.md @@ -1,3 +1,5 @@ +# OBSOLETE + # Templates+Parameterization: Repeatedly instantiating user-customized application topologies. ## Motivation diff --git a/contributors/design-proposals/api-machinery/selector-generation.md b/contributors/design-proposals/apps/selector-generation.md similarity index 100% rename from contributors/design-proposals/api-machinery/selector-generation.md rename to contributors/design-proposals/apps/selector-generation.md diff --git a/contributors/design-proposals/architecture/architecture.dia b/contributors/design-proposals/architecture/architecture.dia deleted file mode 100644 index 5c87409f266d0af17bbec155c4a43ef8055786a3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6523 zcmZA4Rag{?)_`GB>F%DP8w80VrG^gaX6Wt)0ReFs8YyW;HXz;I-Q6kOB`G@ayPtC| z&hxMDZe6ac_0q>+ApJWi*ypCkZo7lYArDmq+37UWbGGKCykI-e#O#e9Gz?j|LWXDG z-l@Y_n&JabMZ;m^s=_ahdA;&dCA9=$xZW+hn`SVvkH2zhP!OksSriCx-4-ODLpAr zjrpKpz^!cBrR4oHq$YdS7*>TJ)TARmgh1@%-kGWd&HcImd>JwV1CDhWe<;5b&dbxk zhj5~kJ~)Jk_bzC`Bg=D|17ELBSvnR`8v?h8t#i1Ew13SZqFNu?BK4jHT_g@Bb96OqRd3!GV zH(iwYj~*TuHn%-{9{J21R5{(+l-KU@mFy% zkf9p$3gE(@%n>x==mW{VMsxD#-KRCUv?tmITy&YRhcsZsl zeSK%otxZwb6sClh|jii^3k4CS5u#WcYh;TMrdX3mqhOF;cemGzSMMPMbw zux+dkzb?f6(nW$RR|n=L^EqmGL%8}Qp>i4kI2PV?P{1{m!HF{oe>)yMGg|~$ zEOx45z31?>-0<{(0-%7cxX|q2OdfI!fT=WwiXK6&u9qx9v=Y~J#k+o>lutqH4nMy& zBQyWE;!vjmympjZ8dJ2qAP#6@xVjR$2Ct)JQ<{_MxaU;6(2GEGow`Fs?m({_0dDTg z1LB`e*1ijI!Prv)oKc9d`iN#od}zICayUo4|7j z>9UGq#OG17r1}@l>oW3H^EMpcW&+=zfbCh$b9-q&WktWAjiWu#{1E>oP`A@hQRRsC zxxRK5_=9W*k(jXD$!y#c3(**`Gn3z(*hAN2t0r(|8>3GADJJ1P7lyKu<%6@0g{-}#syoz+2m>%ZJT5XHnmoZf$y zga(9bhkXKy1O9eLkt>)~9&R3nL`sw&@7AG3h?ei%-ShD9VfV+Jf}W*<#>?hr}eFXenMm;Votb7#SjkOO+aDI>vKK&=-^3|=ATL}5$nWPJxNhiaA$tkNysS2r$g zEo)Nr;xcf<@q;u#7^;Q_=D*JI{0a3a%1gXVKu|cG(@ZYI1_%QD?T#X+KF*hvh%pdJ zorOR|3#@SzMF}vW_FdLwWT=78PznI2l?n3+{uYRw2LNpYqg_`CdKv^1pl+{hQKAIu82c4G9 
z5s#3EFX9n?eQf@cE$TPH*DXeK6i&a6P7Q^+BuHk3@l^8UX_TlORqW>zvfSL(G&_s2 zn59&gq{e}CwZ`$64K0Oy?;Tau$wx$L{x)mlSU>#QMIm+ioz{PWDth?H&2kdh&A zEwD4w>cX)g>ax7ahQSgqdYN=zol+Z{MYgO;Cd@%3{I?|MtQDTk8(MNL0OzoAd4!=} zB3}xsK8&gTgMGJ~P;LAa9G1`#7U;Yu{IN4})!sDYLz$ zDsB;~@c)pZMqhbJ_Ul%|uO8w%Jr=BJHx}hO7*o>|qdjadla?T?^84|uwx$V7r^`vA za{ATWD23}TaO5KvtpLj$8j+?R$?{cBbdmtKZHZcE12SbxqJGEp3C=fNU8K{14;6jrP*qoF;uJm(AB1YP$XDLuX<9wn!G9 z`mi`^lU3XCp{vUBKpfNJfHUVHa;U)rQsQly!MDC9f__3(hMY>f>^}nJoYFq`4U)wU zB_^9(HF&&m;95=nG`C3R3bi&;XmXd8X4xF1uv>5VWvf~htAt6q)~iBpxS&CW`zMRf z-6Q!UYZ7jI@L^7qWcY17;l4-80Mmj9yOUKEA*v-LnG@a&Q)YDQXf!%Z5L9ZUDu!lc zoe$p42hWtqKftl*71E9AWBu*4CAZ3oYs?n3Z8-zO$TVpUvy7`M<3Yw= z1;!^dH;R$K;I{6`DTSPbI(V~v=rkDTs|S}9@m&i3YgPpnzKhrdN>7DCXO6lH%)B&} zvVeBGGKsfwEyd0?EST?FW$1JeIZ8{-9OmP;hLX)1hsQ2y=D~S(_;tr3cUr44FHTL!v$tU_0diC&vz4+gO6}mmRbcsJj(&bVFf(7!`7XbX>wjd`y>T__OQsV3kQ=&Y8%BXS6P*vk3?ZspS3P^TmFDiaus2{rJcDV7(a z#49)Ap)W^Qe|0Qk&XpeV+dEh|$8Ky}zLlo=hUucSXJV>{6hE#_JgxLBAV*^q92=-Qxy^%HS=1NF>c0rC8Yd zLjwZqltkxkQgn!+R-BUO4w`#tr4`$ss?ZliLd+#S*_VCy>hYf%b2Xgqdn>2`J(xX` z)f-puUNe)`fW{f-6mw>O}l$~Z~5W&J{e9#jjM$%I&Z@SYpvXQ~*s`(9el>U6cBAm3xPkU!_BSDt3u8u|{8p z1nkcoC-Wg6#o8Io!E3a~Tb+w$R;xet0_+wa=#GhZJ2YGrYv^@y&9;l!9=6}r?NUCe z018xTO(8h?&D8bX?3L3{ea}xU(*3)3^=+luP?~E4(NQcSWzB6yx?Fc~2g-(gzjo{R z$#MxUGY(_!rqH9yTO&VVGc4BwDVtFBy7t~>04CXk^x5s;phV^|zy}d1NbR9iLXNF8 zcQT@Of9QjNzK&-Ric%~vj1x1(r%e2@2*=wzTx$NXkq%5BX-WOgOd*Uw1#6$d61Ni)~G#2Ca(u=JTm_Slcy8eT-fok z2wpJ8w!`IWFU>CmfpYPfl#KsX!q5xienz!1IchfEHT&Mvulbi;-ZjCoow(2 zW0jhjhL42JUU#_)!&5!KJ~(YA5IHm>EZ=Km=KFXb>X$|w{h8!Heh*0I+14h6X1RZS zBe{?4QbSbIqca z%poGiVhQ42V_zXs&$Q18W`z}yu+4^eTydspcP%Qaq6dR;glwf zlnSNMby`{2aS#PR-)(3@Ap?@EMdaWa=CY-{j}fw`pWL5)@&r-J*+jIbrD5^di^ihB zRtFDRYCMJgFhhWAYisJ8zrQKw6HFVdOa*)xus%phQ_YyhZ;CgcY$>`HM#d6`p zkC~&+WIK-l=FxPP>q~9Tm;P?{BWtWVOi8X+WO1-uJ*>O; z3YFf@Q6}>7&#OFTdN_)io-vsds6*FpXRZ%-cjFKnBpB~z0E&d?MJ*m5CGt=of-A#d z_rdB6QF!PLqeP^fVfM%UY+*^&EomLo7dT!z?e-mwps)hya_t*U!fjb1Oh9CZ438e; zm4ljbJ>D>178NFgc%`JVMzLOa#+rNKkAh6A^ah_{f`xA0+Z4!xXcw 
zHW&>5MtiVu!Iq89%iIeFUlpLtTQqp8rw#G+Iv(`%Vd6I4z7m@V{ zuhg@U!dUPrL@9H4HLOwe|iY2?mb;PKqbBJbh+G{-dfF<;s4R}|X!SS;Um$!UX z(MJf)s`@UpquGxChsfmrMP$oyRa?+T4a2C*dey^EZb^^ErgVDT|3zd9l!y!oMSG~b z9~GHcAjtUWIx#vLS4A3xyJ?3h*I$H$E$=S#w-bI52bA(~i~kt%vYq{3LpJ}{kjiPS z4B{gjo8kX8B>(>!^5GQke3>@LpWS6!#N!=pwnu>O^`3<~35xu$@XJfuA*`P{TpHF$ z@{v4%H@90cba26Z#<)YpF;Y@m2VNQnOvJY1=NEz~e1NQo1olvLSS1FbkfY!$p;!#} z3w+QwIW!{9zj*sh$2M~Z|8FpTI*4HL8@7iq-(FCafRbUY76g$PnJ zu>Y5kN+^?Ve!Vbv$8Q#DG1UaA&`JYBNLut!Q&tUY`rX-oyBnxrd>KI`E_;4L*a$@o6Nui!mw2OKTV{ebHm%2%N361ZNvePgj5= z42Am|h+Nse#4q;@MsIWSkLvShKzb*(HMO#&8_i8C9DL=PPD&C!&<8CyvY7|}g2lza zA99xDv?1&L<>!I)_rW;RH`nRM;+(R$bou9Bee32e@%gmEMHBg=T8a{Ah_+_*1l{8o(Tzbiwcq~G;GQMXP<^%Wlk9x7*EoTpuIQEO*wJ4vBt|#! zk5Kv*v#`%`TD_MUXuO)0*Q1+~MSUT|G0C?6p=6C(Rcg|di_J_-&58dBVLX5{t#9c} zQr>)dGomNsk`nR?^VypJ85;3%$L?{z^^uI|kFd~Phr(U~$~X+EqEMqkbiw0L9!W)S z0hPUZm)Rb01bT$#nCVm!3&gs$MKJ`WLUhm>KytV}bs&mRS2S>Z z8NQ!KmTnLYO#Um8yzhLsK1IDpi`tN|FB3_J&0K6h;Q!;wb`f87qun>y~l}@V2 z#^{UZclQR8eHlUMi8s2^tmdOXc%-+02A zq)))O#_MBf7SEE@o0iAC`f8y7fia#CUb29|EK`$UsgBUhb$A!q;moh2mIm=rU?|9kXc|ksU?CY!kfsFKjws@SE diff --git a/contributors/design-proposals/identifiers.md b/contributors/design-proposals/architecture/identifiers.md similarity index 100% rename from contributors/design-proposals/identifiers.md rename to contributors/design-proposals/architecture/identifiers.md diff --git a/contributors/design-proposals/namespaces.md b/contributors/design-proposals/architecture/namespaces.md similarity index 100% rename from contributors/design-proposals/namespaces.md rename to contributors/design-proposals/architecture/namespaces.md diff --git a/contributors/design-proposals/api-machinery/principles.md b/contributors/design-proposals/architecture/principles.md similarity index 100% rename from contributors/design-proposals/api-machinery/principles.md rename to contributors/design-proposals/architecture/principles.md diff --git a/contributors/design-proposals/image-provenance.md 
b/contributors/design-proposals/auth/image-provenance.md similarity index 100% rename from contributors/design-proposals/image-provenance.md rename to contributors/design-proposals/auth/image-provenance.md diff --git a/contributors/design-proposals/secrets.md b/contributors/design-proposals/auth/secrets.md similarity index 100% rename from contributors/design-proposals/secrets.md rename to contributors/design-proposals/auth/secrets.md diff --git a/contributors/design-proposals/initial-resources.md b/contributors/design-proposals/autoscaling/initial-resources.md similarity index 100% rename from contributors/design-proposals/initial-resources.md rename to contributors/design-proposals/autoscaling/initial-resources.md diff --git a/contributors/design-proposals/ha_master.md b/contributors/design-proposals/cluster-lifecycle/ha_master.md similarity index 100% rename from contributors/design-proposals/ha_master.md rename to contributors/design-proposals/cluster-lifecycle/ha_master.md diff --git a/contributors/design-proposals/high-availability.md b/contributors/design-proposals/cluster-lifecycle/high-availability.md similarity index 100% rename from contributors/design-proposals/high-availability.md rename to contributors/design-proposals/cluster-lifecycle/high-availability.md diff --git a/contributors/design-proposals/node/kubelet-tls-bootstrap.md b/contributors/design-proposals/cluster-lifecycle/kubelet-tls-bootstrap.md similarity index 100% rename from contributors/design-proposals/node/kubelet-tls-bootstrap.md rename to contributors/design-proposals/cluster-lifecycle/kubelet-tls-bootstrap.md diff --git a/contributors/design-proposals/local-cluster-ux.md b/contributors/design-proposals/cluster-lifecycle/local-cluster-ux.md similarity index 100% rename from contributors/design-proposals/local-cluster-ux.md rename to contributors/design-proposals/cluster-lifecycle/local-cluster-ux.md diff --git a/contributors/design-proposals/node/runtimeconfig.md 
b/contributors/design-proposals/cluster-lifecycle/runtimeconfig.md similarity index 100% rename from contributors/design-proposals/node/runtimeconfig.md rename to contributors/design-proposals/cluster-lifecycle/runtimeconfig.md diff --git a/contributors/design-proposals/containerized-mounter.md~ b/contributors/design-proposals/containerized-mounter.md~ deleted file mode 100644 index b1c8f298f..000000000 --- a/contributors/design-proposals/containerized-mounter.md~ +++ /dev/null @@ -1,43 +0,0 @@ -# Containerized Mounter with Chroot for Container-Optimized OS - -## Goal - -Due security and management overhead, our new Container-Optimized OS used by GKE -does not carry certain storage drivers and tools needed for such as nfs and -glusterfs. This project takes a containerized mount approach to package mount -binaries into a container. Volume plugin will execute mount inside of container -and share the mount with the host. - - -## Design - -1. A docker image has storage tools (nfs and glusterfs) pre-installed and uploaded - to gcs. -2. During GKE cluster configuration, the docker image is pulled and installed on - the cluster node. -3. When nfs or glusterfs type mount is invoked by kubelet, it will run the mount - command inside of a container with the pre-install docker image and the mount - propagation set to “shared. In this way, the mount inside the container will - visible to host node too. -4. A special case for NFSv3, a rpcbind process is issued before running mount - command. - -## Implementation details - -* In the first version of containerized mounter, we use rkt fly to dynamically - start a container during mount. When mount command finishes, the container is - normally exited and will be garbage-collected. However, in case the glusterfs - mount, because a gluster daemon is running after command mount finishes util - glusterfs unmount, the container started for mount will continue to run until - glusterfs client finishes. 
The container cannot be garbage-collected right away - and multiple containers might be running for some time. Due to shared mount - propagation, with more containers running, the number of mounts will increase - significantly and might cause kernel panic. To solve this problem, a chroot - approach is proposed and implemented. -* In the second version, instead of running a container on the host, the docker - container’s file system is exported as a tar archive and pre-installed on host. - Kubelet directory is shared mount between host and inside of the container’s - rootfs. When a gluster/nfs mount is issued, a mounter script will use chroot to - change to the container’s rootfs and run the mount. This approach is very clean - since there is no need to manage a container’s lifecycle and avoid having large - number of mounts. diff --git a/contributors/design-proposals/dir_struct.txt b/contributors/design-proposals/dir_struct.txt index c5ecc717b..c06918c28 100644 --- a/contributors/design-proposals/dir_struct.txt +++ b/contributors/design-proposals/dir_struct.txt @@ -1,240 +1,244 @@ -Uncategorized (Please Help) - high-availability.md - control-plane-resilience.md - downward_api_resources_limits_requests.md - seccomp.md - client-package-structure.md - service-discovery.md - metadata-policy.md - containerized-mounter.md~ - identifiers.md - local-cluster-ux.md - pod-pid-namespace.md - grow-volume-size.md - image-provenance.md - core-metrics-pipeline.md - versioning.md - ha_master.md - secret-configmap-downwarapi-file-mode.md - protobuf.md - flakiness-sla.md - resources.md - initial-resources.md +Uncategorized + admission_control_event_rate_limit.md create_sheet.py - runtime-client-server.md - OWNERS - namespaces.md - cpu-manager.md - selinux-enhancements.md - sysctl.md + create_sheet.py~ + design_proposal_template.md dir_struct.txt - selinux.md - templates.md - pod-cache.png - README.md - multi-platform.md - pod-lifecycle-event-generator.md - secrets.md - 
cri-dockershim-checkpoint.md event_compression.md + multi-platform.md + owners pleg.png + readme.md + runtime-client-server.md + templates.md~ ./sig-cli + get-describe-apiserver-extensions.md + kubectl-create-from-env-file.md kubectl-extension.md + kubectl-login.md kubectl_apply_getsetdiff_last_applied_config.md multi-fields-merge-key.md - template.md - expansion.md - kubectl-login.md - simple-rolling-update.md - OWNERS - get-describe-apiserver-extensions.md + owners preserve-order-in-strategic-merge-patch.md - kubectl-create-from-env-file.md + simple-rolling-update.md ./network - flannel-integration.md - service-external-name.md - networking.md command_execution_port_forwarding.md - network-policy.md external-lb-source-ip-preservation.md + flannel-integration.md + network-policy.md + networking.md + selinux-enhancements.md + service-discovery.md + service-external-name.md ./resource-management + admission_control_limit_range.md + admission_control_resource_quota.md + device-plugin-overview.png device-plugin.md device-plugin.png gpu-support.md - device-plugin-overview.png + hugepages.md + resource-quota-scoping.md +./testing + flakiness-sla.md ./autoscaling - hpa-v2.md - hpa-status-conditions.md horizontal-pod-autoscaler.md + hpa-status-conditions.md + hpa-v2.md + initial-resources.md ./architecture architecture.md - architecture.dia architecture.png architecture.svg -./api-machinery - admission_control_extension.md - csi-client-structure-proposal.md - selector-generation.md - pod-safety.md - container-init.md - resource-quota-scoping.md - thirdpartyresources.md - aggregated-api-servers.md - extending-api.md - envvar-configmap.md - dynamic-admission-control-configuration.md - api-chunking.md - garbage-collection.md - customresources-validation.md - auditing.md - apiserver-watch.md - admission_control_limit_range.md - apiserver-build-in-admission-plugins.md - synchronous-garbage-collection.md - configmap.md - csi-new-client-library-procedure.md - pod-preset.md - 
add-new-patchStrategy-to-clear-fields-not-present-in-patch.md - api-group.md + identifiers.md + namespaces.md principles.md +./api-machinery + add-new-patchstrategy-to-clear-fields-not-present-in-patch.md admission_control.md - optional-configmap.md - server-get.md + admission_control_extension.md + aggregated-api-servers.md + api-chunking.md + api-group.md + apiserver-build-in-admission-plugins.md apiserver-count-fix.md - admission_control_resource_quota.md + apiserver-watch.md + auditing.md + bulk_watch.md + client-package-structure.md + controller-ref.md + csi-client-structure-proposal.md + csi-new-client-library-procedure.md + customresources-validation.md + dynamic-admission-control-configuration.md + extending-api.md + garbage-collection.md + metadata-policy.md + protobuf.md + server-get.md + synchronous-garbage-collection.md + thirdpartyresources.md ./node - pod-resource-management.md - kubelet-tls-bootstrap.md - dynamic-kubelet-configuration.md - kubelet-hypercontainer-runtime.md + all-in-one-volume.md + annotations-downward-api.md + configmap.md + container-init.md container-runtime-interface-v1.md - kubelet-authorizer.md + cpu-manager.md + cri-dockershim-checkpoint.md disk-accounting.md - kubelet-systemd.md - kubelet-cri-logging.md + downward_api_resources_limits_requests.md + dynamic-kubelet-configuration.md + envvar-configmap.md + expansion.md kubelet-auth.md - runtimeconfig.md + kubelet-authorizer.md + kubelet-cri-logging.md + kubelet-eviction.md + kubelet-hypercontainer-runtime.md + kubelet-rkt-runtime.md + kubelet-rootfs-distribution.md + kubelet-systemd.md + node-allocatable.md + optional-configmap.md + pod-cache.png + pod-lifecycle-event-generator.md + pod-pid-namespace.md + pod-resource-management.md + propagation.md resource-qos.md runtime-pod-cache.md - kubelet-rootfs-distribution.md - kubelet-rkt-runtime.md - node-allocatable.md - kubelet-eviction.md + seccomp.md + secret-configmap-downwardapi-file-mode.md + selinux.md + sysctl.md 
+./service-catalog + pod-preset.md ./instrumentation + core-metrics-pipeline.md + custom-metrics-api.md + metrics-server.md monitoring_architecture.md monitoring_architecture.png - custom-metrics-api.md - resource-metrics-api.md performance-related-monitoring.md - metrics-server.md + resource-metrics-api.md volume_stats_pvc_ref.md ./auth - security_context.md - no-new-privs.md access.md - enhance-pluggable-policy.md apparmor.md - security-context-constraints.md + enhance-pluggable-policy.md + image-provenance.md + no-new-privs.md pod-security-context.md - bulk_watch.md + secrets.md + security-context-constraints.md security.md + security_context.md service_accounts.md ./federation - federated-replicasets.md - ubernetes-design.png - ubernetes-cluster-state.png - federation-phase-1.md - federation-clusterselector.md - ubernetes-scheduling.png - federation-lite.md - federation.md - federated-services.md - federation-high-level-arch.png + control-plane-resilience.md federated-api-servers.md - federated-placement-policy.md federated-ingress.md + federated-placement-policy.md + federated-replicasets.md + federated-services.md + federation-clusterselector.md + federation-high-level-arch.png + federation-lite.md + federation-phase-1.md + federation.md + ubernetes-cluster-state.png + ubernetes-design.png + ubernetes-scheduling.png ./scalability - Kubemark_architecture.png - scalability-testing.md kubemark.md + kubemark_architecture.png + scalability-testing.md ./cluster-lifecycle - self-hosted-layers.png - self-hosted-kubernetes.md - dramatically-simplify-cluster-creation.md bootstrap-discovery.md cluster-deployment.md - self-hosted-kubelet.md clustering.md + dramatically-simplify-cluster-creation.md + ha_master.md + high-availability.md + kubelet-tls-bootstrap.md + local-cluster-ux.md + runtimeconfig.md self-hosted-final-cluster.png + self-hosted-kubelet.md + self-hosted-kubernetes.md + self-hosted-layers.png self-hosted-moving-parts.png ./cluster-lifecycle/clustering - 
static.png .gitignore - Dockerfile - static.seqdiag - dynamic.seqdiag - OWNERS - Makefile - README.md + dockerfile dynamic.png + dynamic.seqdiag + makefile + owners + readme.md + static.png + static.seqdiag ./release release-notes.md release-test-signal.md + versioning.md ./scheduling - rescheduling.md - rescheduler.md - nodeaffinity.md - podaffinity.md hugepages.md - taint-toleration-dedicated.md + multiple-schedulers.md + nodeaffinity.md pod-preemption.md pod-priority-api.md - taint-node-by-condition.md - scheduler_extender.md + podaffinity.md + rescheduler.md rescheduling-for-critical-pods.md - multiple-schedulers.md + rescheduling.md + resources.md + scheduler_extender.md + taint-node-by-condition.md + taint-toleration-dedicated.md +./scheduling/images + .gitignore + owners + preemption_1.png + preemption_2.png + preemption_3.png + preemption_4.png ./apps - daemonset-update.md - cronjob.md - annotations-downward-api.md - controller-ref.md - statefulset-update.md - stateful-apps.md - deploy.md - daemon.md controller_history.md - job.md - indexed-job.md + cronjob.md + daemon.md + daemonset-update.md + deploy.md deployment.md + indexed-job.md + job.md + obsolete_templates.md + selector-generation.md + stateful-apps.md + statefulset-update.md ./storage - flex-volumes-drivers-psp.md - local-storage-overview.md - all-in-one-volume.md - volume-selectors.md - persistent-storage.md - volume-metrics.md - flexvolume-deployment.md - volume-snapshotting.png - volume-provisioning.md - propagation.md - volume-ownership-management.md - mount-options.md - volumes.md + containerized-mounter.md default-storage-class.md - volume-snapshotting.md + flex-volumes-drivers-psp.md + flexvolume-deployment.md + grow-volume-size.md + local-storage-overview.md + mount-options.md + persistent-storage.md + pod-safety.md volume-hostpath-qualifiers.md + volume-metrics.md + volume-ownership-management.md + volume-provisioning.md + volume-selectors.md + volume-snapshotting.md + 
volume-snapshotting.png + volumes.md ./aws aws_under_the_hood.md -./images - preemption_1.png - preemption_3.png - .gitignore - OWNERS - preemption_2.png - preemption_4.png ./gcp gce-l4-loadbalancer-healthcheck.md - containerized-mounter.md ./cloud-provider - cloudprovider-storage-metrics.md cloud-provider-refactoring.md + cloudprovider-storage-metrics.md diff --git a/contributors/design-proposals/control-plane-resilience.md b/contributors/design-proposals/federation/control-plane-resilience.md similarity index 100% rename from contributors/design-proposals/control-plane-resilience.md rename to contributors/design-proposals/federation/control-plane-resilience.md diff --git a/contributors/design-proposals/core-metrics-pipeline.md b/contributors/design-proposals/instrumentation/core-metrics-pipeline.md similarity index 100% rename from contributors/design-proposals/core-metrics-pipeline.md rename to contributors/design-proposals/instrumentation/core-metrics-pipeline.md diff --git a/contributors/design-proposals/selinux-enhancements.md b/contributors/design-proposals/network/selinux-enhancements.md similarity index 100% rename from contributors/design-proposals/selinux-enhancements.md rename to contributors/design-proposals/network/selinux-enhancements.md diff --git a/contributors/design-proposals/service-discovery.md b/contributors/design-proposals/network/service-discovery.md similarity index 100% rename from contributors/design-proposals/service-discovery.md rename to contributors/design-proposals/network/service-discovery.md diff --git a/contributors/design-proposals/storage/all-in-one-volume.md b/contributors/design-proposals/node/all-in-one-volume.md similarity index 100% rename from contributors/design-proposals/storage/all-in-one-volume.md rename to contributors/design-proposals/node/all-in-one-volume.md diff --git a/contributors/design-proposals/apps/annotations-downward-api.md b/contributors/design-proposals/node/annotations-downward-api.md similarity index 
100% rename from contributors/design-proposals/apps/annotations-downward-api.md rename to contributors/design-proposals/node/annotations-downward-api.md diff --git a/contributors/design-proposals/api-machinery/configmap.md b/contributors/design-proposals/node/configmap.md similarity index 100% rename from contributors/design-proposals/api-machinery/configmap.md rename to contributors/design-proposals/node/configmap.md diff --git a/contributors/design-proposals/api-machinery/container-init.md b/contributors/design-proposals/node/container-init.md similarity index 100% rename from contributors/design-proposals/api-machinery/container-init.md rename to contributors/design-proposals/node/container-init.md diff --git a/contributors/design-proposals/cpu-manager.md b/contributors/design-proposals/node/cpu-manager.md similarity index 100% rename from contributors/design-proposals/cpu-manager.md rename to contributors/design-proposals/node/cpu-manager.md diff --git a/contributors/design-proposals/cri-dockershim-checkpoint.md b/contributors/design-proposals/node/cri-dockershim-checkpoint.md similarity index 100% rename from contributors/design-proposals/cri-dockershim-checkpoint.md rename to contributors/design-proposals/node/cri-dockershim-checkpoint.md diff --git a/contributors/design-proposals/downward_api_resources_limits_requests.md b/contributors/design-proposals/node/downward_api_resources_limits_requests.md similarity index 100% rename from contributors/design-proposals/downward_api_resources_limits_requests.md rename to contributors/design-proposals/node/downward_api_resources_limits_requests.md diff --git a/contributors/design-proposals/api-machinery/envvar-configmap.md b/contributors/design-proposals/node/envvar-configmap.md similarity index 100% rename from contributors/design-proposals/api-machinery/envvar-configmap.md rename to contributors/design-proposals/node/envvar-configmap.md diff --git a/contributors/design-proposals/sig-cli/expansion.md 
b/contributors/design-proposals/node/expansion.md similarity index 100% rename from contributors/design-proposals/sig-cli/expansion.md rename to contributors/design-proposals/node/expansion.md diff --git a/contributors/design-proposals/api-machinery/optional-configmap.md b/contributors/design-proposals/node/optional-configmap.md similarity index 100% rename from contributors/design-proposals/api-machinery/optional-configmap.md rename to contributors/design-proposals/node/optional-configmap.md diff --git a/contributors/design-proposals/pod-cache.png b/contributors/design-proposals/node/pod-cache.png similarity index 100% rename from contributors/design-proposals/pod-cache.png rename to contributors/design-proposals/node/pod-cache.png diff --git a/contributors/design-proposals/pod-lifecycle-event-generator.md b/contributors/design-proposals/node/pod-lifecycle-event-generator.md similarity index 100% rename from contributors/design-proposals/pod-lifecycle-event-generator.md rename to contributors/design-proposals/node/pod-lifecycle-event-generator.md diff --git a/contributors/design-proposals/pod-pid-namespace.md b/contributors/design-proposals/node/pod-pid-namespace.md similarity index 100% rename from contributors/design-proposals/pod-pid-namespace.md rename to contributors/design-proposals/node/pod-pid-namespace.md diff --git a/contributors/design-proposals/storage/propagation.md b/contributors/design-proposals/node/propagation.md similarity index 100% rename from contributors/design-proposals/storage/propagation.md rename to contributors/design-proposals/node/propagation.md diff --git a/contributors/design-proposals/seccomp.md b/contributors/design-proposals/node/seccomp.md similarity index 100% rename from contributors/design-proposals/seccomp.md rename to contributors/design-proposals/node/seccomp.md diff --git a/contributors/design-proposals/secret-configmap-downwarapi-file-mode.md b/contributors/design-proposals/node/secret-configmap-downwardapi-file-mode.md 
similarity index 100% rename from contributors/design-proposals/secret-configmap-downwarapi-file-mode.md rename to contributors/design-proposals/node/secret-configmap-downwardapi-file-mode.md diff --git a/contributors/design-proposals/selinux.md b/contributors/design-proposals/node/selinux.md similarity index 100% rename from contributors/design-proposals/selinux.md rename to contributors/design-proposals/node/selinux.md diff --git a/contributors/design-proposals/sysctl.md b/contributors/design-proposals/node/sysctl.md similarity index 100% rename from contributors/design-proposals/sysctl.md rename to contributors/design-proposals/node/sysctl.md diff --git a/contributors/design-proposals/versioning.md b/contributors/design-proposals/release/versioning.md similarity index 100% rename from contributors/design-proposals/versioning.md rename to contributors/design-proposals/release/versioning.md diff --git a/contributors/design-proposals/api-machinery/admission_control_limit_range.md b/contributors/design-proposals/resource-management/admission_control_limit_range.md similarity index 100% rename from contributors/design-proposals/api-machinery/admission_control_limit_range.md rename to contributors/design-proposals/resource-management/admission_control_limit_range.md diff --git a/contributors/design-proposals/api-machinery/admission_control_resource_quota.md b/contributors/design-proposals/resource-management/admission_control_resource_quota.md similarity index 100% rename from contributors/design-proposals/api-machinery/admission_control_resource_quota.md rename to contributors/design-proposals/resource-management/admission_control_resource_quota.md diff --git a/contributors/design-proposals/resource-management/hugepages.md b/contributors/design-proposals/resource-management/hugepages.md new file mode 100644 index 000000000..27e5c5af5 --- /dev/null +++ b/contributors/design-proposals/resource-management/hugepages.md @@ -0,0 +1,308 @@ +# HugePages support in Kubernetes 
+ +**Authors** +* Derek Carr (@derekwaynecarr) +* Seth Jennings (@sjenning) +* Piotr Prokop (@PiotrProkop) + +**Status**: In progress + +## Abstract + +A proposal to enable applications running in a Kubernetes cluster to use huge +pages. + +A pod may request a number of huge pages. The `scheduler` is able to place the +pod on a node that can satisfy that request. The `kubelet` advertises an +allocatable number of huge pages to support scheduling decisions. A pod may +consume hugepages via `hugetlbfs` or `shmget`. Huge pages are not +overcommitted. + +## Motivation + +Memory is managed in blocks known as pages. On most systems, a page is 4Ki. 1Mi +of memory is equal to 256 pages; 1Gi of memory is 262,144 pages, etc. CPUs have +a built-in memory management unit that manages a list of these pages in +hardware. The Translation Lookaside Buffer (TLB) is a small hardware cache of +virtual-to-physical page mappings. If the virtual address passed in a hardware +instruction can be found in the TLB, the mapping can be determined quickly. If +not, a TLB miss occurs, and the system falls back to slower, software-based +address translation. This results in performance issues. Since the size of the +TLB is fixed, the only way to reduce the chance of a TLB miss is to increase the +page size. + +A huge page is a memory page that is larger than 4Ki. On x86_64 architectures, +there are two common huge page sizes: 2Mi and 1Gi. Sizes vary on other +architectures, but the idea is the same. In order to use huge pages, +applications must be written to be aware of them. Transparent huge pages (THP) +attempt to automate the management of huge pages without application knowledge, +but they have limitations. In particular, they are limited to 2Mi page sizes. +THP might lead to performance degradation on nodes with high memory utilization +or fragmentation due to defragmenting efforts of THP, which can lock memory +pages. 
For this reason, some applications may be designed to use (or recommend +the use of) pre-allocated huge pages instead of THP. + +Managing memory is hard, and unfortunately, there is no one-size-fits-all +solution for all applications. + +## Scope + +This proposal only includes pre-allocated huge pages configured on the node by +the administrator at boot time or by manual dynamic allocation. It does not +discuss how the cluster could dynamically attempt to allocate huge pages in an +attempt to find a fit for a pod pending scheduling. It is anticipated that +operators may use a variety of strategies to allocate huge pages, but we do not +anticipate the kubelet itself doing the allocation. Allocation of huge pages +ideally happens soon after boot time. + +This proposal defers issues relating to NUMA. + +## Use Cases + +The class of applications that benefit from huge pages typically have +- A large memory working set +- A sensitivity to memory access latency + +Example applications include: +- database management systems (MySQL, PostgreSQL, MongoDB, Oracle, etc.) +- Java applications can back the heap with huge pages using the + `-XX:+UseLargePages` and `-XX:LargePageSizeInBytes` options. +- packet processing systems (DPDK) + +Applications can generally use huge pages by calling +- `mmap()` with `MAP_ANONYMOUS | MAP_HUGETLB` and use it as anonymous memory +- `mmap()` a file backed by `hugetlbfs` +- `shmget()` with `SHM_HUGETLB` and use it as a shared memory segment (see Known + Issues). + +1. A pod can use huge pages with any of the prior described methods. +1. A pod can request huge pages. +1. A scheduler can bind pods to nodes that have available huge pages. +1. A quota may limit usage of huge pages. +1. A limit range may constrain min and max huge page requests. + +## Feature Gate + +The proposal introduces huge pages as an Alpha feature. + +It must be enabled via the `--feature-gates=HugePages=true` flag on pertinent +components pending graduation to Beta. 
+ +## Node Specification + +Huge pages cannot be overcommitted on a node. + +A system may support multiple huge page sizes. It is assumed that most nodes +will be configured to primarily use the default huge page size as returned via +`grep Hugepagesize /proc/meminfo`. This defaults to 2Mi on most Linux systems +unless overridden by `default_hugepagesz=1g` in kernel boot parameters. + +For each supported huge page size, the node will advertise a resource of the +form `hugepages-<hugepagesize>`. On Linux, supported huge page sizes are +determined by parsing the `/sys/kernel/mm/hugepages/hugepages-{size}kB` +directory on the host. Kubernetes will expose a `hugepages-<hugepagesize>` +resource using binary notation form. It will convert `<hugepagesize>` into the +most compact binary notation using integer values. For example, if a node +supports `hugepages-2048kB`, a resource `hugepages-2Mi` will be shown in node +capacity and allocatable values. Operators may set aside pre-allocated huge +pages that are not available for user pods similar to normal memory via the +`--system-reserved` flag. + +There are a variety of huge page sizes supported across different hardware +architectures. It is preferred to have a resource per size in order to better +support quota. For example, 1 huge page with size 2Mi is orders of magnitude +different than 1 huge page with size 1Gi. We assume gigantic pages are even +more precious resources than huge pages. + +Pre-allocated huge pages reduce the amount of allocatable memory on a node. The +node will treat pre-allocated huge pages similar to other system reservations +and reduce the amount of `memory` it reports using the following formula: + +``` +[Allocatable] = [Node Capacity] - + [Kube-Reserved] - + [System-Reserved] - + [Pre-Allocated-HugePages * HugePageSize] - + [Hard-Eviction-Threshold] +``` + +The following represents a machine with 10Gi of memory. 1Gi of memory has been +reserved as 512 pre-allocated huge pages sized 2Mi. 
As you can see, the +allocatable memory has been reduced to account for the amount of huge pages +reserved. + +``` +apiVersion: v1 +kind: Node +metadata: + name: node1 +... +status: + capacity: + memory: 10Gi + hugepages-2Mi: 1Gi + allocatable: + memory: 9Gi + hugepages-2Mi: 1Gi +... +``` + +## Pod Specification + +A pod must make a request to consume pre-allocated huge pages using the resource +`hugepages-<hugepagesize>` whose quantity is a positive amount of memory in +bytes. The specified amount must align with the `<hugepagesize>`; otherwise, +the pod will fail validation. For example, it would be valid to request +`hugepages-2Mi: 4Mi`, but invalid to request `hugepages-2Mi: 3Mi`. + +The request and limit for `hugepages-<hugepagesize>` must match. Similar to +memory, an application that requests `hugepages-<hugepagesize>` resource is at +minimum in the `Burstable` QoS class. + +If a pod consumes huge pages via `shmget`, it must run with a supplemental group +that matches `/proc/sys/vm/hugetlb_shm_group` on the node. Configuration of +this group is outside the scope of this specification. + +Initially, a pod may not consume multiple huge page sizes in a single pod spec. +Attempting to use `hugepages-2Mi` and `hugepages-1Gi` in the same pod spec will +fail validation. We believe it is rare for applications to attempt to use +multiple huge page sizes. This restriction may be lifted in the future with +community presented use cases. Introducing the feature with this restriction +limits the exposure of API changes needed when consuming huge pages via volumes. + +In order to consume huge pages backed by the `hugetlbfs` filesystem inside the +specified container in the pod, it is helpful to understand the set of mount +options used with `hugetlbfs`. 
For more details, see "Using Huge Pages" here: +https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt + +``` +mount -t hugetlbfs \ + -o uid=<value>,gid=<value>,mode=<value>,pagesize=<value>,size=<value>,\ + min_size=<value>,nr_inodes=<value> none /mnt/huge +``` + +The proposal recommends extending the existing `EmptyDirVolumeSource` to satisfy +this use case. A new `medium=HugePages` option would be supported. To write +into this volume, the pod must make a request for huge pages. The `pagesize` +argument is inferred from the `hugepages-<hugepagesize>` from the resource +request. If in the future, multiple huge page sizes are supported in a single +pod spec, we may modify the `EmptyDirVolumeSource` to provide an optional page +size. The existing `sizeLimit` option for `emptyDir` would restrict usage to +the minimum value specified between `sizeLimit` and the sum of huge page limits +of all containers in a pod. This keeps the behavior consistent with memory +backed `emptyDir` volumes whose usage is ultimately constrained by the pod +cgroup sandbox memory settings. The `min_size` option is omitted as it's not +necessary. The `nr_inodes` mount option is omitted at this time in the same +manner it is omitted with `medium=Memory` when using `tmpfs`. + +The following is a sample pod that is limited to 1Gi huge pages of size 2Mi. It +can consume those pages using `shmget()` or via `mmap()` with the specified +volume. + +``` +apiVersion: v1 +kind: Pod +metadata: + name: example +spec: + containers: +... + volumeMounts: + - mountPath: /hugepages + name: hugepage + resources: + requests: + hugepages-2Mi: 1Gi + limits: + hugepages-2Mi: 1Gi + volumes: + - name: hugepage + emptyDir: + medium: HugePages +``` + +## CRI Updates + +The `LinuxContainerResources` message should be extended to support specifying +huge page limits per size. The specification for huge pages should align with +opencontainers/runtime-spec. 
+ +see: +https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#huge-page-limits + +The CRI changes are required before promoting this feature to Beta. + +## Cgroup Enforcement + +To use this feature, the `--cgroups-per-qos` must be enabled. In addition, the +`hugetlb` cgroup must be mounted. + +The `kubepods` cgroup is bounded by the `Allocatable` value. + +The QoS level cgroups are left unbounded across all huge page pool sizes. + +The pod level cgroup sandbox is configured as follows, where `hugepagesize` is +the system supported huge page size(s). If no request is made for huge pages of +a particular size, the limit is set to 0 for all supported types on the node. + +``` +pod<UID>/hugetlb.<hugepagesize>.limit_in_bytes = sum(pod.spec.containers.resources.limits[hugepages-<hugepagesize>]) +``` + +If the container runtime supports specification of huge page limits, the +container cgroup sandbox will be configured with the specified limit. + +The `kubelet` will ensure the `hugetlb` has no usage charged to the pod level +cgroup sandbox prior to deleting the pod to ensure all resources are reclaimed. + +## Limits and Quota + +The `ResourceQuota` resource will be extended to support accounting for +`hugepages-<hugepagesize>` similar to `cpu` and `memory`. The `LimitRange` +resource will be extended to define min and max constraints for `hugepages` +similar to `cpu` and `memory`. + +## Scheduler changes + +The scheduler will need to ensure any huge page request defined in the pod spec +can be fulfilled by a candidate node. + +## cAdvisor changes + +cAdvisor will need to be modified to return the number of pre-allocated huge +pages per page size on the node. It will be used to determine capacity and +calculate allocatable values on the node. + +## Roadmap + +### Version 1.8 + +Initial alpha support for huge pages usage by pods. + +### Version 1.9 + +Resource Quota support. Limit Range support. 
Beta support for huge pages +(pending community feedback) + +## Known Issues + +### Huge pages as shared memory + +For the Java use case, the JVM maps the huge pages as a shared memory segment +and memlocks them to prevent the system from moving or swapping them out. + +There are several issues here: +- The user running the Java app must be a member of the gid set in the + `vm.hugetlb_shm_group` sysctl +- sysctl `kernel.shmmax` must allow the size of the shared memory segment +- The user's memlock ulimits must allow the size of the shared memory segment +- `vm.hugetlb_shm_group` is not namespaced. + +### NUMA + +NUMA is complicated. To support NUMA, the node must support cpu pinning, +devices, and memory locality. Extending that requirement to huge pages is not +much different. It is anticipated that the `kubelet` will provide future NUMA +locality guarantees as a feature of QoS. In particular, pods in the +`Guaranteed` QoS class are expected to have NUMA locality preferences. + diff --git a/contributors/design-proposals/api-machinery/resource-quota-scoping.md b/contributors/design-proposals/resource-management/resource-quota-scoping.md similarity index 100% rename from contributors/design-proposals/api-machinery/resource-quota-scoping.md rename to contributors/design-proposals/resource-management/resource-quota-scoping.md diff --git a/contributors/design-proposals/images/.gitignore b/contributors/design-proposals/scheduling/images/.gitignore similarity index 100% rename from contributors/design-proposals/images/.gitignore rename to contributors/design-proposals/scheduling/images/.gitignore diff --git a/contributors/design-proposals/images/OWNERS b/contributors/design-proposals/scheduling/images/OWNERS similarity index 100% rename from contributors/design-proposals/images/OWNERS rename to contributors/design-proposals/scheduling/images/OWNERS diff --git a/contributors/design-proposals/images/preemption_1.png 
b/contributors/design-proposals/scheduling/images/preemption_1.png similarity index 100% rename from contributors/design-proposals/images/preemption_1.png rename to contributors/design-proposals/scheduling/images/preemption_1.png diff --git a/contributors/design-proposals/images/preemption_2.png b/contributors/design-proposals/scheduling/images/preemption_2.png similarity index 100% rename from contributors/design-proposals/images/preemption_2.png rename to contributors/design-proposals/scheduling/images/preemption_2.png diff --git a/contributors/design-proposals/images/preemption_3.png b/contributors/design-proposals/scheduling/images/preemption_3.png similarity index 100% rename from contributors/design-proposals/images/preemption_3.png rename to contributors/design-proposals/scheduling/images/preemption_3.png diff --git a/contributors/design-proposals/images/preemption_4.png b/contributors/design-proposals/scheduling/images/preemption_4.png similarity index 100% rename from contributors/design-proposals/images/preemption_4.png rename to contributors/design-proposals/scheduling/images/preemption_4.png diff --git a/contributors/design-proposals/resources.md b/contributors/design-proposals/scheduling/resources.md similarity index 100% rename from contributors/design-proposals/resources.md rename to contributors/design-proposals/scheduling/resources.md diff --git a/contributors/design-proposals/api-machinery/pod-preset.md b/contributors/design-proposals/service-catalog/pod-preset.md similarity index 100% rename from contributors/design-proposals/api-machinery/pod-preset.md rename to contributors/design-proposals/service-catalog/pod-preset.md diff --git a/contributors/design-proposals/gcp/containerized-mounter.md b/contributors/design-proposals/storage/containerized-mounter.md similarity index 100% rename from contributors/design-proposals/gcp/containerized-mounter.md rename to contributors/design-proposals/storage/containerized-mounter.md diff --git 
a/contributors/design-proposals/grow-volume-size.md b/contributors/design-proposals/storage/grow-volume-size.md similarity index 100% rename from contributors/design-proposals/grow-volume-size.md rename to contributors/design-proposals/storage/grow-volume-size.md diff --git a/contributors/design-proposals/api-machinery/pod-safety.md b/contributors/design-proposals/storage/pod-safety.md similarity index 100% rename from contributors/design-proposals/api-machinery/pod-safety.md rename to contributors/design-proposals/storage/pod-safety.md diff --git a/contributors/design-proposals/flakiness-sla.md b/contributors/design-proposals/testing/flakiness-sla.md similarity index 100% rename from contributors/design-proposals/flakiness-sla.md rename to contributors/design-proposals/testing/flakiness-sla.md