add support for using Intel MPI (2019.7) and MVAPICH2 (#283)
* Support IntelMPI and MPICH; local minikube tests pass; add new spec field "mpiDistribution" @ 2020/7/27
* Fix ineffectual assignment
* Change email address
* Update variable name
* Fix some spelling and naming problems
* Add more notes
* Automatically filter prefix parameters
* Fix a spelling problem
* Update notes about hostfile generation
* Split hostfiles in MPICH format
* Generate hosts file so Hydra can resolve hostnames
* Update notes
* Fix sh script; move hosts sending and merging here
* Use a dedicated type instead of string
* Check the return value
* Update options' names
* Add a unit test for generateHosts
* Fix lint-reported errors
This commit is contained in:
parent
0edc69b150
commit
07bbb45de9
@@ -108,6 +108,10 @@ func Run(opt *options.ServerOption) error {
			continue
		}
		lines := strings.SplitN(string(line), " ", 2)
		// When using Intel MPI or MPICH, the hostfile format is hostname:slots, so the line needs to be split by a colon.
		if strings.Contains(lines[0], ":") {
			lines = strings.SplitN(string(line), ":", 2)
		}
		if !strings.HasSuffix(lines[0], launcherPodSuffix) {
			pods = append(pods, lines[0])
		}
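For illustration, both hostfile conventions resolve to the same pod name in the snippet above. Given a hypothetical job named mpijob, either of these lines yields the pod name mpijob-worker-0:

mpijob-worker-0 slots=2   # Open MPI format, split on the first space
mpijob-worker-0:2         # Intel MPI / MPICH format, split on the first colon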
@@ -72,6 +72,11 @@ type MPIJobSpec struct {
	// active. The policies specified in `RunPolicy` take precedence over
	// the following fields: `BackoffLimit` and `ActiveDeadlineSeconds`.
	RunPolicy *common.RunPolicy `json:"runPolicy,omitempty"`

	// MPIDistribution specifies the name of the MPI framework that is used.
	// Defaults to "OpenMPI".
	// Options include "OpenMPI", "IntelMPI" and "MPICH".
	MPIDistribution *MPIDistributionType `json:"mpiDistribution,omitempty"`
}

// MPIReplicaType is the type for MPIReplica.
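A minimal Go sketch of how a client could set the new field; the kubeflow import alias and the surrounding object are assumptions for illustration:

	// Select Intel MPI; leaving MPIDistribution nil falls back to "OpenMPI".
	dist := kubeflow.MPIDistributionTypeIntelMPI
	mpiJob := &kubeflow.MPIJob{
		Spec: kubeflow.MPIJobSpec{
			MPIDistribution: &dist,
		},
	}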
@@ -84,3 +89,17 @@ const (
	// MPIReplicaTypeWorker is the type for worker replicas.
	MPIReplicaTypeWorker MPIReplicaType = "Worker"
)

// MPIDistributionType is the type for MPIDistribution.
type MPIDistributionType string

const (
	// MPIDistributionTypeOpenMPI is the type for Open MPI.
	MPIDistributionTypeOpenMPI MPIDistributionType = "OpenMPI"

	// MPIDistributionTypeIntelMPI is the type for Intel MPI.
	MPIDistributionTypeIntelMPI MPIDistributionType = "IntelMPI"

	// MPIDistributionTypeMPICH is the type for MPICH.
	MPIDistributionTypeMPICH MPIDistributionType = "MPICH"
)
@@ -15,7 +15,9 @@
package kubectl_delivery

import (
	"bufio"
	"fmt"
	"os"
	"sync"
	"time"

@@ -114,7 +116,10 @@ func (c *KubectlDeliveryController) Run(threadiness int, stopCh <-chan struct{})
	if ok := cache.WaitForCacheSync(stopCh, c.podSynced); !ok {
		return fmt.Errorf("failed to wait for caches to sync")
	}
	// Copy the list of watched pods so their IP addresses can be looked up later.
	var workerPods []string
	for name := range c.watchedPods {
		workerPods = append(workerPods, name)
		pod, err := c.podLister.Pods(c.namespace).Get(name)
		if err != nil {
			continue
@@ -139,6 +144,9 @@ func (c *KubectlDeliveryController) Run(threadiness int, stopCh <-chan struct{})
			return nil
		case <-ticker.C:
			if len(c.watchedPods) == 0 {
				if err := c.generateHosts("/etc/hosts", "/opt/kube/hosts", workerPods); err != nil {
					return fmt.Errorf("error generating hosts file: %v", err)
				}
				klog.Info("Shutting down workers")
				return nil
			}
@@ -147,6 +155,43 @@ func (c *KubectlDeliveryController) Run(threadiness int, stopCh <-chan struct{})
		}
	}

// generateHosts gets and records all workers' IP addresses in a hosts-format
// file, which is sent to each worker pod before launching the remote
// process manager. It creates and writes the file at filePath and uses the
// pod lister to look up IP addresses, so the cache must be synced before calling it.
func (c *KubectlDeliveryController) generateHosts(localHostsPath string, filePath string, workerPods []string) error {
	var hosts string
	// First, open the local hosts file to read the launcher pod's IP.
	fd, err := os.Open(localHostsPath)
	if err != nil {
		return fmt.Errorf("can't open file[%s]: %v", localHostsPath, err)
	}
	defer fd.Close()
	// Read the last line of the hosts file -- the IP address of localhost.
	scanner := bufio.NewScanner(fd)
	for scanner.Scan() {
		hosts = scanner.Text()
	}
	// Use the pod lister to look up the IP address of each worker pod.
	for index := range workerPods {
		pod, err := c.podLister.Pods(c.namespace).Get(workerPods[index])
		if err != nil {
			return fmt.Errorf("can't get IP address of node[%s]", workerPods[index])
		}
		hosts = fmt.Sprintf("%s\n%s\t%s", hosts, pod.Status.PodIP, pod.Name)
	}
	// Write the hosts-format IP records to the volume; the file is sent to the workers later.
	fp, err := os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		return fmt.Errorf("can't create file[%s]: %v", filePath, err)
	}
	defer fp.Close()
	if _, err := fp.WriteString(hosts); err != nil {
		return fmt.Errorf("can't write file[%s]: %v", filePath, err)
	}
	return nil
}

// runWorker is a long-running function that will continually call the
// processNextWorkItem function in order to read and process a message on the
// work queue.
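For illustration, if the last line of the launcher's /etc/hosts is its own entry and there are two worker pods, generateHosts writes something like the following (names and IPs are hypothetical):

10.234.56.78	mpijob-launcher.fullname.test mpijob-launcher
10.96.0.11	mpijob-worker-0
10.96.0.12	mpijob-worker-1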
@@ -15,6 +15,10 @@
package kubectl_delivery

import (
	"io/ioutil"
	"os"
	"path/filepath"
	"strconv"
	"testing"

	corev1 "k8s.io/api/core/v1"
@@ -38,3 +42,57 @@ func TestWaitingPodRunning(t *testing.T) {
	f.setUpPods(fakePod)
	f.run(namespace, podName)
}

// Test the hosts file generating function.
func TestGeneratingHostsFile(t *testing.T) {
	namespace := "default"
	podNames := []string{"test", "tester-2", "worker_3", "pod4"}
	// Content of the fake local hosts file.
	content := []byte(`# this line is a comment
127.0.0.1 localhost
::1 localhost
234.98.76.54 uselesshost
10.234.56.78 launcher.fullname.test launcher`)

	// Expected output content.
	expectedHosts := make(map[string]string)
	expectedHosts["launcher"] = "10.234.56.78"
	expectedHosts["launcher.fullname.test"] = "10.234.56.78"
	f := newFixture(t)
	// Add fake worker pods.
	fakePod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: namespace,
		},
	}
	fakePod.Status.Phase = corev1.PodRunning
	for index := range podNames {
		fakePod.Status.PodIP = "123.123.123." + strconv.Itoa(index)
		fakePod.ObjectMeta.Name = podNames[index]
		expectedHosts[fakePod.ObjectMeta.Name] = fakePod.Status.PodIP
		f.setUpPods(fakePod.DeepCopy())
	}
	// Set up a temp directory for the file-related tests.
	p, tmphf := f.setUpTmpDir("hostsGenerating", content)
	defer os.RemoveAll(p) // clean up the temp directory
	tmpof := filepath.Join(p, "output")
	c, _ := f.newController(namespace, podNames)
	// Generate the new hosts file.
	err := c.generateHosts(tmphf, tmpof, podNames)
	if err != nil {
		t.Errorf("Error, cannot generate hosts of worker pods, errs: %v", err)
	}
	// Get the output file content.
	outputContent, err := ioutil.ReadFile(tmpof)
	if err != nil {
		t.Fatal(err)
	}
	// Parse the output content to avoid space/tab interference.
	generatedHosts := f.getResolvedHosts(outputContent)
	// Check the output.
	for hostname, expectedIP := range expectedHosts {
		if resolvedIP, ok := generatedHosts[hostname]; !ok || resolvedIP != expectedIP {
			t.Errorf("Error, generated hosts file incorrect. Host: %s, expected: %s, resolved: %s", hostname, expectedIP, resolvedIP)
		}
	}
}
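The test can be run in isolation; the package path below is an assumption based on this PR's file layout:

go test ./pkg/controllers/kubectl_delivery/ -run TestGeneratingHostsFile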
@@ -15,8 +15,13 @@
package kubectl_delivery

import (
	"bufio"
	"fmt"
	"io/ioutil"
	"os"
	"path"
	"path/filepath"
	"strings"
	"testing"
	"time"

@@ -107,3 +112,44 @@ func (f *fixture) setUpPods(p *corev1.Pod) {
	f.podLister = append(f.podLister, p)
	f.kubeObjects = append(f.kubeObjects, p)
}

// getResolvedHosts resolves the hosts file content into a map,
// with the hostname as key and IP address as value.
func (f *fixture) getResolvedHosts(contentBytes []byte) map[string]string {
	// Create a scanner to read the content line by line.
	hostRecords := make(map[string]string)
	contentStrReader := strings.NewReader(string(contentBytes))
	scanner := bufio.NewScanner(contentStrReader)
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) == 0 { // skip empty lines before indexing into them
			continue
		}
		if line[0] == '#' { // skip comment lines
			continue
		}
		lines := strings.Fields(line)
		if len(lines) == 0 { // skip whitespace-only lines
			continue
		}
		if len(lines) == 1 { // the line is malformed
			f.t.Error("Error, generated hosts file has wrong format.")
			continue
		}
		for i := 1; i < len(lines); i++ {
			hostRecords[lines[i]] = lines[0] // use the map to record hosts
		}
	}
	return hostRecords
}

// setUpTmpDir creates a temp directory containing a temp hosts file
// with the provided content, and returns the paths of the directory
// and of the hosts file.
func (f *fixture) setUpTmpDir(dirName string, content []byte) (string, string) {
	p, err := ioutil.TempDir(os.TempDir(), dirName)
	if err != nil {
		f.t.Fatal(err)
	}
	tmphf := filepath.Join(p, "hosts")
	if err := ioutil.WriteFile(tmphf, content, 0644); err != nil {
		f.t.Fatal(err)
	}
	return p, tmphf
}
@@ -958,15 +958,27 @@ func (c *MPIJobController) doUpdateJobStatus(mpiJob *kubeflow.MPIJob) error {
 // resource. It also sets the appropriate OwnerReferences on the resource so
 // handleObject can discover the MPIJob resource that 'owns' it.
 func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigMap {
+	// This part is closely related to specific ssh commands, and is
+	// very likely to break when the MPI framework version changes.
+	// Attempt to automatically filter prefix parameters by detecting leading "-" matches.
+	// To let Intel MPI and MVAPICH2 resolve pod names, the init container generates
+	// a hosts file covering all workers based on the pod list; kubectl is used to
+	// send it to each worker and append it to the end of the original hosts file.
 	kubexec := fmt.Sprintf(`#!/bin/sh
 set -x
 POD_NAME=$1
+while [ ${POD_NAME%%${POD_NAME#?}} = "-" ]
+do
 shift
-%s/kubectl exec ${POD_NAME}`, kubectlMountPath)
+POD_NAME=$1
+done
+shift
+%s/kubectl cp %s/hosts ${POD_NAME}:/etc/hosts_of_nodes
+%s/kubectl exec ${POD_NAME}`, kubectlMountPath, kubectlMountPath, kubectlMountPath)
 	if len(mpiJob.Spec.MainContainer) > 0 {
 		kubexec = fmt.Sprintf("%s --container %s", kubexec, mpiJob.Spec.MainContainer)
 	}
-	kubexec = fmt.Sprintf("%s -- /bin/sh -c \"$*\"", kubexec)
+	kubexec = fmt.Sprintf("%s -- /bin/sh -c \"cat /etc/hosts_of_nodes >> /etc/hosts && $*\"", kubexec)

 	// If no processing unit is specified, default to 1 slot.
 	slots := 1
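A note on the prefix filtering above: ${POD_NAME%%${POD_NAME#?}} expands to the first character of $1, so the while loop shifts away any flag-style arguments the process manager prepends before the target host. With a hypothetical Hydra invocation such as

kubexec.sh -x -q mpijob-worker-0 hydra_pmi_proxy --control-port 12345

POD_NAME resolves to mpijob-worker-0, the final shift drops it, and "$*" runs the remaining command on that pod after the hosts file has been copied and merged.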
@@ -974,9 +986,17 @@ shift
		slots = int(*mpiJob.Spec.SlotsPerWorker)
	}
	var buffer bytes.Buffer
	// The hostfile format differs between MPI frameworks.
	// Intel MPI and MVAPICH2 use ":" syntax to indicate how many slots a node offers,
	// while Open MPI uses the "slots=" syntax for the same purpose.
	for i := 0; i < int(workerReplicas); i++ {
		mpiDistribution := mpiJob.Spec.MPIDistribution
		if mpiDistribution != nil && (*mpiDistribution == kubeflow.MPIDistributionTypeIntelMPI || *mpiDistribution == kubeflow.MPIDistributionTypeMPICH) {
			buffer.WriteString(fmt.Sprintf("%s%s-%d:%d\n", mpiJob.Name, workerSuffix, i, slots))
		} else {
			buffer.WriteString(fmt.Sprintf("%s%s-%d slots=%d\n", mpiJob.Name, workerSuffix, i, slots))
		}
	}

	return &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
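For a hypothetical job named mpijob with two workers and slots = 2 (and assuming workerSuffix resolves to "-worker"), the loop above emits one of these hostfiles:

mpijob-worker-0:2            # IntelMPI / MPICH
mpijob-worker-1:2

mpijob-worker-0 slots=2      # OpenMPI (the default)
mpijob-worker-1 slots=2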
@@ -1278,13 +1298,28 @@ func (c *MPIJobController) newLauncher(mpiJob *kubeflow.MPIJob, kubectlDeliveryI
 		return nil
 	}
 	container := podSpec.Spec.Containers[0]
+	// Different MPI frameworks use different environment variables
+	// to specify the paths of the remote task launcher and the hostfile.
+	mpiRshExecPathEnvName := "OMPI_MCA_plm_rsh_agent"
+	mpiHostfilePathEnvName := "OMPI_MCA_orte_default_hostfile"
+	// If MPIDistribution is not specified as "IntelMPI" or "MPICH",
+	// assume that the default "OpenMPI" is used.
+	if mpiJob.Spec.MPIDistribution != nil {
+		if *mpiJob.Spec.MPIDistribution == kubeflow.MPIDistributionTypeIntelMPI {
+			mpiRshExecPathEnvName = "I_MPI_HYDRA_BOOTSTRAP_EXEC"
+			mpiHostfilePathEnvName = "I_MPI_HYDRA_HOST_FILE"
+		} else if *mpiJob.Spec.MPIDistribution == kubeflow.MPIDistributionTypeMPICH {
+			mpiRshExecPathEnvName = "HYDRA_LAUNCHER_EXEC"
+			mpiHostfilePathEnvName = "HYDRA_HOST_FILE"
+		}
+	}
 	container.Env = append(container.Env,
 		corev1.EnvVar{
-			Name:  "OMPI_MCA_plm_rsh_agent",
+			Name:  mpiRshExecPathEnvName,
 			Value: fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
 		},
 		corev1.EnvVar{
-			Name:  "OMPI_MCA_orte_default_hostfile",
+			Name:  mpiHostfilePathEnvName,
 			Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName),
 		},
 		// We overwrite these environment variables so that users will not
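As a sketch of the net effect for an IntelMPI job, and assuming configMountPath, kubexecScriptName and hostfileName resolve to /etc/mpi, kubexec.sh and hostfile respectively, the launcher container ends up with:

I_MPI_HYDRA_BOOTSTRAP_EXEC=/etc/mpi/kubexec.sh
I_MPI_HYDRA_HOST_FILE=/etc/mpi/hostfile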