Mount SSH Secret directly on main container (#416)
Remove the init container for faster startup. Possible by disabling StrictModes in sshd_config.
This commit is contained in:
parent
0bccdb9672
commit
8f5bbd8203
|
|
@ -69,9 +69,13 @@ RUN apt-get install -y --no-install-recommends openssh-client openssh-server &&
|
|||
mkdir -p /var/run/sshd
|
||||
|
||||
# Allow OpenSSH to talk to containers without asking for confirmation
|
||||
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
|
||||
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
|
||||
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
|
||||
# by disabling StrictHostKeyChecking.
|
||||
# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need
|
||||
# to disable UserKnownHostsFile to avoid write permissions.
|
||||
# Disabling StrictModes avoids directory and files read permission checks.
|
||||
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
|
||||
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
|
||||
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
|
||||
|
||||
WORKDIR "/examples"
|
||||
|
||||
|
|
|
|||
|
|
@ -24,5 +24,12 @@ RUN setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd
|
|||
RUN useradd -m mpiuser
|
||||
WORKDIR /home/mpiuser
|
||||
COPY --chown=mpiuser sshd_config .sshd_config
|
||||
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config
|
||||
# Allow OpenSSH to talk to containers without asking for confirmation
|
||||
# by disabling StrictHostKeyChecking.
|
||||
# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need
|
||||
# to disable UserKnownHostsFile to avoid write permissions.
|
||||
# Disabling StrictModes avoids directory and files read permission checks.
|
||||
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
|
||||
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
|
||||
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
|
||||
COPY --from=builder /pi /home/mpiuser/pi
|
||||
|
|
@ -52,6 +52,13 @@ WORKDIR /home/mpiuser
|
|||
COPY intel-entrypoint.sh /entrypoint.sh
|
||||
ENTRYPOINT ["/entrypoint.sh"]
|
||||
COPY --chown=mpiuser sshd_config .sshd_config
|
||||
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config
|
||||
# Allow OpenSSH to talk to containers without asking for confirmation
|
||||
# by disabling StrictHostKeyChecking.
|
||||
# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need
|
||||
# to disable UserKnownHostsFile to avoid write permissions.
|
||||
# Disabling StrictModes avoids directory and files read permission checks.
|
||||
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
|
||||
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
|
||||
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
|
||||
|
||||
COPY --from=builder /pi /home/mpiuser/pi
|
||||
|
|
@ -1,2 +1,3 @@
|
|||
PidFile /home/mpiuser/sshd.pid
|
||||
HostKey /home/mpiuser/.ssh/id_rsa
|
||||
StrictModes no
|
||||
|
|
|
|||
|
|
@ -1,5 +1,11 @@
|
|||
FROM horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.6.0.post0-py3.7-cuda10.1
|
||||
|
||||
# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need
|
||||
# to disable UserKnownHostsFile to avoid write permissions.
|
||||
# Disabling StrictModes avoids directory and files read permission checks.
|
||||
RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
|
||||
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
|
||||
|
||||
RUN mkdir /tensorflow
|
||||
WORKDIR "/tensorflow"
|
||||
RUN git clone https://github.com/tensorflow/benchmarks
|
||||
|
|
|
|||
|
|
@ -33,7 +33,6 @@ type ServerOption struct {
|
|||
LockNamespace string
|
||||
QPS int
|
||||
Burst int
|
||||
ScriptingImage string
|
||||
}
|
||||
|
||||
// NewServerOption creates a new CMServer with a default config.
|
||||
|
|
@ -69,6 +68,4 @@ func (s *ServerOption) AddFlags(fs *flag.FlagSet) {
|
|||
|
||||
fs.IntVar(&s.QPS, "kube-api-qps", 5, "QPS indicates the maximum QPS to the master from this client.")
|
||||
fs.IntVar(&s.Burst, "kube-api-burst", 10, "Maximum burst for throttle.")
|
||||
|
||||
fs.StringVar(&s.ScriptingImage, "scripting-image", "alpine:3.14", "Container image used for scripting, such as in init containers.")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -161,8 +161,7 @@ func Run(opt *options.ServerOption) error {
|
|||
kubeInformerFactory.Core().V1().Pods(),
|
||||
podgroupsInformer,
|
||||
kubeflowInformerFactory.Kubeflow().V2beta1().MPIJobs(),
|
||||
opt.GangSchedulingName,
|
||||
opt.ScriptingImage)
|
||||
opt.GangSchedulingName)
|
||||
|
||||
go kubeInformerFactory.Start(ctx.Done())
|
||||
go kubeflowInformerFactory.Start(ctx.Done())
|
||||
|
|
|
|||
|
|
@ -73,9 +73,7 @@ const (
|
|||
discoverHostsScriptName = "discover_hosts.sh"
|
||||
sshAuthSecretSuffix = "-ssh"
|
||||
sshAuthVolume = "ssh-auth"
|
||||
sshAuthMountPath = "/mnt/ssh"
|
||||
sshHomeInitMountPath = "/mnt/home-ssh"
|
||||
sshHomeVolume = "ssh-home"
|
||||
rootSSHPath = "/root/.ssh"
|
||||
launcher = "launcher"
|
||||
worker = "worker"
|
||||
launcherSuffix = "-launcher"
|
||||
|
|
@ -242,8 +240,6 @@ type MPIJobController struct {
|
|||
recorder record.EventRecorder
|
||||
// Gang scheduler name to use
|
||||
gangSchedulerName string
|
||||
// Container image used for scripting.
|
||||
scriptingImage string
|
||||
|
||||
// To allow injection of updateStatus for testing.
|
||||
updateStatusHandler func(mpijob *kubeflow.MPIJob) error
|
||||
|
|
@ -261,7 +257,7 @@ func NewMPIJobController(
|
|||
podInformer coreinformers.PodInformer,
|
||||
podgroupsInformer podgroupsinformer.PodGroupInformer,
|
||||
mpiJobInformer informers.MPIJobInformer,
|
||||
gangSchedulerName, scriptingImage string) *MPIJobController {
|
||||
gangSchedulerName string) *MPIJobController {
|
||||
|
||||
// Create event broadcaster.
|
||||
klog.V(4).Info("Creating event broadcaster")
|
||||
|
|
@ -298,7 +294,6 @@ func NewMPIJobController(
|
|||
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "MPIJobs"),
|
||||
recorder: recorder,
|
||||
gangSchedulerName: gangSchedulerName,
|
||||
scriptingImage: scriptingImage,
|
||||
}
|
||||
|
||||
controller.updateStatusHandler = controller.doUpdateJobStatus
|
||||
|
|
@ -1516,57 +1511,28 @@ func workerReplicas(job *kubeflow.MPIJob) int32 {
|
|||
}
|
||||
|
||||
func (c *MPIJobController) setupSSHOnPod(podSpec *corev1.PodSpec, job *kubeflow.MPIJob) {
|
||||
var mode *int32
|
||||
if job.Spec.SSHAuthMountPath == rootSSHPath {
|
||||
mode = newInt32(0600)
|
||||
}
|
||||
mainContainer := &podSpec.Containers[0]
|
||||
podSpec.Volumes = append(podSpec.Volumes,
|
||||
corev1.Volume{
|
||||
Name: sshAuthVolume,
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
Secret: &corev1.SecretVolumeSource{
|
||||
SecretName: job.Name + sshAuthSecretSuffix,
|
||||
Items: sshVolumeItems,
|
||||
DefaultMode: mode,
|
||||
SecretName: job.Name + sshAuthSecretSuffix,
|
||||
Items: sshVolumeItems,
|
||||
},
|
||||
},
|
||||
},
|
||||
corev1.Volume{
|
||||
Name: sshHomeVolume,
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
EmptyDir: &corev1.EmptyDirVolumeSource{},
|
||||
},
|
||||
})
|
||||
|
||||
mainContainer := &podSpec.Containers[0]
|
||||
mainContainer.VolumeMounts = append(mainContainer.VolumeMounts,
|
||||
corev1.VolumeMount{
|
||||
Name: sshHomeVolume,
|
||||
Name: sshAuthVolume,
|
||||
MountPath: job.Spec.SSHAuthMountPath,
|
||||
})
|
||||
|
||||
// The init script sets the permissions of the ssh folder in the user's home
|
||||
// directory. The ownership is set based on the security context of the
|
||||
// launcher's first container.
|
||||
launcherSecurityCtx := job.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers[0].SecurityContext
|
||||
initScript := "" +
|
||||
"cp -RL /mnt/ssh/* /mnt/home-ssh && " +
|
||||
"chmod 700 /mnt/home-ssh && " +
|
||||
"chmod 600 /mnt/home-ssh/*"
|
||||
if launcherSecurityCtx != nil && launcherSecurityCtx.RunAsUser != nil {
|
||||
initScript += fmt.Sprintf(" && chown %d -R /mnt/home-ssh", *launcherSecurityCtx.RunAsUser)
|
||||
}
|
||||
podSpec.InitContainers = append(podSpec.InitContainers, corev1.Container{
|
||||
Name: "init-ssh",
|
||||
Image: c.scriptingImage,
|
||||
VolumeMounts: []corev1.VolumeMount{
|
||||
{
|
||||
Name: sshAuthVolume,
|
||||
MountPath: sshAuthMountPath,
|
||||
},
|
||||
{
|
||||
Name: sshHomeVolume,
|
||||
MountPath: sshHomeInitMountPath,
|
||||
},
|
||||
},
|
||||
Command: []string{"/bin/sh"},
|
||||
Args: []string{"-c", initScript},
|
||||
})
|
||||
}
|
||||
|
||||
func ownerReferenceAndGVK(object metav1.Object) (*metav1.OwnerReference, schema.GroupVersionKind, error) {
|
||||
|
|
|
|||
|
|
@ -54,10 +54,6 @@ var (
|
|||
ignoreSecretEntries = cmpopts.IgnoreMapEntries(func(k string, v []uint8) bool { return true })
|
||||
)
|
||||
|
||||
const (
|
||||
scriptingImage = "alpine"
|
||||
)
|
||||
|
||||
type fixture struct {
|
||||
t *testing.T
|
||||
|
||||
|
|
@ -171,7 +167,6 @@ func (f *fixture) newController(gangSchedulerName string) (*MPIJobController, in
|
|||
podgroupsInformer,
|
||||
i.Kubeflow().V2beta1().MPIJobs(),
|
||||
gangSchedulerName,
|
||||
scriptingImage,
|
||||
)
|
||||
|
||||
c.configMapSynced = alwaysReady
|
||||
|
|
@ -1015,42 +1010,22 @@ func TestNewLauncherAndWorker(t *testing.T) {
|
|||
corev1.EnvVar{Name: openMPISlotsEnv, Value: "1"},
|
||||
nvidiaDisableEnvVars),
|
||||
VolumeMounts: []corev1.VolumeMount{
|
||||
{Name: "ssh-home", MountPath: "/root/.ssh"},
|
||||
{Name: "ssh-auth", MountPath: "/root/.ssh"},
|
||||
{Name: "mpi-job-config", MountPath: "/etc/mpi"},
|
||||
},
|
||||
},
|
||||
},
|
||||
InitContainers: []corev1.Container{
|
||||
{
|
||||
Name: "init-ssh",
|
||||
Image: scriptingImage,
|
||||
Command: []string{"/bin/sh"},
|
||||
Args: []string{
|
||||
"-c",
|
||||
"cp -RL /mnt/ssh/* /mnt/home-ssh && chmod 700 /mnt/home-ssh && chmod 600 /mnt/home-ssh/*",
|
||||
},
|
||||
VolumeMounts: []corev1.VolumeMount{
|
||||
{Name: "ssh-auth", MountPath: "/mnt/ssh"},
|
||||
{Name: "ssh-home", MountPath: "/mnt/home-ssh"},
|
||||
},
|
||||
},
|
||||
},
|
||||
Volumes: []corev1.Volume{
|
||||
{
|
||||
Name: "ssh-auth",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
Secret: &corev1.SecretVolumeSource{
|
||||
SecretName: "foo-ssh",
|
||||
Items: sshVolumeItems,
|
||||
DefaultMode: newInt32(0600),
|
||||
SecretName: "foo-ssh",
|
||||
Items: sshVolumeItems,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "ssh-home",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
EmptyDir: &corev1.EmptyDirVolumeSource{},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "mpi-job-config",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
|
|
@ -1086,42 +1061,22 @@ func TestNewLauncherAndWorker(t *testing.T) {
|
|||
{
|
||||
Command: []string{"/usr/sbin/sshd", "-De"},
|
||||
VolumeMounts: []corev1.VolumeMount{
|
||||
{Name: "ssh-home", MountPath: "/root/.ssh"},
|
||||
{Name: "ssh-auth", MountPath: "/root/.ssh"},
|
||||
},
|
||||
Env: workerEnvVars,
|
||||
},
|
||||
},
|
||||
InitContainers: []corev1.Container{
|
||||
{
|
||||
Name: "init-ssh",
|
||||
Image: scriptingImage,
|
||||
Command: []string{"/bin/sh"},
|
||||
Args: []string{
|
||||
"-c",
|
||||
"cp -RL /mnt/ssh/* /mnt/home-ssh && chmod 700 /mnt/home-ssh && chmod 600 /mnt/home-ssh/*",
|
||||
},
|
||||
VolumeMounts: []corev1.VolumeMount{
|
||||
{Name: "ssh-auth", MountPath: "/mnt/ssh"},
|
||||
{Name: "ssh-home", MountPath: "/mnt/home-ssh"},
|
||||
},
|
||||
},
|
||||
},
|
||||
Volumes: []corev1.Volume{
|
||||
{
|
||||
Name: "ssh-auth",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
Secret: &corev1.SecretVolumeSource{
|
||||
SecretName: "foo-ssh",
|
||||
Items: sshVolumeItems,
|
||||
DefaultMode: newInt32(0600),
|
||||
SecretName: "foo-ssh",
|
||||
Items: sshVolumeItems,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "ssh-home",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
EmptyDir: &corev1.EmptyDirVolumeSource{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
|
@ -1225,27 +1180,12 @@ func TestNewLauncherAndWorker(t *testing.T) {
|
|||
nvidiaDisableEnvVars),
|
||||
VolumeMounts: []corev1.VolumeMount{
|
||||
{Name: "fool-vol", MountPath: "/mnt/foo"},
|
||||
{Name: "ssh-home", MountPath: "/home/mpiuser/.ssh"},
|
||||
{Name: "ssh-auth", MountPath: "/home/mpiuser/.ssh"},
|
||||
{Name: "mpi-job-config", MountPath: "/etc/mpi"},
|
||||
},
|
||||
},
|
||||
{},
|
||||
},
|
||||
InitContainers: []corev1.Container{
|
||||
{
|
||||
Name: "init-ssh",
|
||||
Image: scriptingImage,
|
||||
Command: []string{"/bin/sh"},
|
||||
Args: []string{
|
||||
"-c",
|
||||
"cp -RL /mnt/ssh/* /mnt/home-ssh && chmod 700 /mnt/home-ssh && chmod 600 /mnt/home-ssh/* && chown 1000 -R /mnt/home-ssh",
|
||||
},
|
||||
VolumeMounts: []corev1.VolumeMount{
|
||||
{Name: "ssh-auth", MountPath: "/mnt/ssh"},
|
||||
{Name: "ssh-home", MountPath: "/mnt/home-ssh"},
|
||||
},
|
||||
},
|
||||
},
|
||||
Volumes: []corev1.Volume{
|
||||
{Name: "foo-vol"},
|
||||
{
|
||||
|
|
@ -1257,12 +1197,6 @@ func TestNewLauncherAndWorker(t *testing.T) {
|
|||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "ssh-home",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
EmptyDir: &corev1.EmptyDirVolumeSource{},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "mpi-job-config",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
|
|
@ -1298,26 +1232,11 @@ func TestNewLauncherAndWorker(t *testing.T) {
|
|||
{
|
||||
Command: []string{"/entrypoint.sh"},
|
||||
VolumeMounts: []corev1.VolumeMount{
|
||||
{Name: "ssh-home", MountPath: "/home/mpiuser/.ssh"},
|
||||
{Name: "ssh-auth", MountPath: "/home/mpiuser/.ssh"},
|
||||
},
|
||||
Env: joinEnvVars(corev1.EnvVar{Name: "FOO", Value: "bar"}, workerEnvVars),
|
||||
},
|
||||
},
|
||||
InitContainers: []corev1.Container{
|
||||
{
|
||||
Name: "init-ssh",
|
||||
Image: scriptingImage,
|
||||
Command: []string{"/bin/sh"},
|
||||
Args: []string{
|
||||
"-c",
|
||||
"cp -RL /mnt/ssh/* /mnt/home-ssh && chmod 700 /mnt/home-ssh && chmod 600 /mnt/home-ssh/* && chown 1000 -R /mnt/home-ssh",
|
||||
},
|
||||
VolumeMounts: []corev1.VolumeMount{
|
||||
{Name: "ssh-auth", MountPath: "/mnt/ssh"},
|
||||
{Name: "ssh-home", MountPath: "/mnt/home-ssh"},
|
||||
},
|
||||
},
|
||||
},
|
||||
Volumes: []corev1.Volume{
|
||||
{
|
||||
Name: "ssh-auth",
|
||||
|
|
@ -1328,12 +1247,6 @@ func TestNewLauncherAndWorker(t *testing.T) {
|
|||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "ssh-home",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
EmptyDir: &corev1.EmptyDirVolumeSource{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
|
@ -1344,9 +1257,7 @@ func TestNewLauncherAndWorker(t *testing.T) {
|
|||
t.Run(name, func(t *testing.T) {
|
||||
job := tc.job.DeepCopy()
|
||||
scheme.Scheme.Default(job)
|
||||
ctrl := &MPIJobController{
|
||||
scriptingImage: scriptingImage,
|
||||
}
|
||||
ctrl := &MPIJobController{}
|
||||
launcher := ctrl.newLauncherJob(job)
|
||||
if !metav1.IsControlledBy(launcher, job) {
|
||||
t.Errorf("Created launcher Pod is not controlled by Job")
|
||||
|
|
@ -1407,8 +1318,7 @@ func (f *fixture) newFakeMPIJobController() *MPIJobController {
|
|||
|
||||
k8sI := kubeinformers.NewSharedInformerFactory(kubeClient, noResyncPeriodFunc())
|
||||
return &MPIJobController{
|
||||
recorder: &record.FakeRecorder{},
|
||||
podLister: k8sI.Core().V1().Pods().Lister(),
|
||||
scriptingImage: scriptingImage,
|
||||
recorder: &record.FakeRecorder{},
|
||||
podLister: k8sI.Core().V1().Pods().Lister(),
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ const (
|
|||
envTestMPIOperatorImage = "TEST_MPI_OPERATOR_IMAGE"
|
||||
envTestKindImage = "TEST_KIND_IMAGE"
|
||||
|
||||
defaultMPIOperatorImage = "kubeflow/mpi-operator:local"
|
||||
defaultMPIOperatorImage = "mpioperator/mpi-operator:local"
|
||||
defaultKindImage = "kindest/node:v1.21.2"
|
||||
openMPIImage = "mpioperator/mpi-pi:openmpi"
|
||||
intelMPIImage = "mpioperator/mpi-pi:intel"
|
||||
|
|
|
|||
|
|
@ -40,8 +40,7 @@ import (
|
|||
)
|
||||
|
||||
const (
|
||||
waitInterval = 100 * time.Millisecond
|
||||
scriptingImage = "alpine"
|
||||
waitInterval = 100 * time.Millisecond
|
||||
)
|
||||
|
||||
func TestMPIJobSuccess(t *testing.T) {
|
||||
|
|
@ -308,8 +307,7 @@ func startController(ctx context.Context, kClient kubernetes.Interface, mpiClien
|
|||
kubeInformerFactory.Core().V1().Pods(),
|
||||
nil,
|
||||
mpiInformerFactory.Kubeflow().V2beta1().MPIJobs(),
|
||||
"",
|
||||
scriptingImage)
|
||||
"")
|
||||
|
||||
go kubeInformerFactory.Start(ctx.Done())
|
||||
go mpiInformerFactory.Start(ctx.Done())
|
||||
|
|
|
|||
Loading…
Reference in New Issue