From 9994bd02db6fb6dff76b09513d34e6794d3bd74c Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 30 May 2018 17:02:15 -0700 Subject: [PATCH] Initial code push --- .dockerignore | 1 + .gitignore | 2 + Gopkg.lock | 471 +++++++++ Gopkg.toml | 13 + LICENSE | 202 ++++ README.md | 54 +- cmd/kubectl-delivery/Dockerfile | 17 + cmd/kubectl-delivery/deliver_kubectl.sh | 24 + cmd/mpi-operator/Dockerfile | 24 + cmd/mpi-operator/main.go | 89 ++ deploy/0-crd.yaml | 15 + deploy/1-namespace.yaml | 4 + deploy/2-rbac.yaml | 93 ++ deploy/3-mpi-operator.yaml | 27 + examples/tensorflow-benchmarks.yaml | 12 + examples/tensorflow-benchmarks/Dockerfile | 12 + hack/custom-boilerplate.go.txt | 13 + hack/update-codegen.sh | 26 + hack/verify-codegen.sh | 48 + pkg/apis/kubeflow/v1alpha1/doc.go | 17 + pkg/apis/kubeflow/v1alpha1/register.go | 49 + pkg/apis/kubeflow/v1alpha1/types.go | 69 ++ .../v1alpha1/zz_generated.deepcopy.go | 126 +++ pkg/client/clientset/versioned/clientset.go | 96 ++ pkg/client/clientset/versioned/doc.go | 18 + .../versioned/fake/clientset_generated.go | 80 ++ pkg/client/clientset/versioned/fake/doc.go | 18 + .../clientset/versioned/fake/register.go | 52 + pkg/client/clientset/versioned/scheme/doc.go | 18 + .../clientset/versioned/scheme/register.go | 52 + .../versioned/typed/kubeflow/v1alpha1/doc.go | 18 + .../typed/kubeflow/v1alpha1/fake/doc.go | 18 + .../v1alpha1/fake/fake_kubeflow_client.go | 38 + .../kubeflow/v1alpha1/fake/fake_mpijob.go | 138 +++ .../kubeflow/v1alpha1/generated_expansion.go | 19 + .../kubeflow/v1alpha1/kubeflow_client.go | 88 ++ .../typed/kubeflow/v1alpha1/mpijob.go | 172 +++ .../informers/externalversions/factory.go | 178 ++++ .../informers/externalversions/generic.go | 60 ++ .../internalinterfaces/factory_interfaces.go | 36 + .../externalversions/kubeflow/interface.go | 44 + .../kubeflow/v1alpha1/interface.go | 43 + .../kubeflow/v1alpha1/mpijob.go | 87 ++ .../kubeflow/v1alpha1/expansion_generated.go | 25 + .../listers/kubeflow/v1alpha1/mpijob.go | 92 
++ pkg/controllers/mpi_job_controller.go | 997 ++++++++++++++++++ pkg/controllers/mpi_job_controller_test.go | 648 ++++++++++++ 47 files changed, 4441 insertions(+), 2 deletions(-) create mode 120000 .dockerignore create mode 100644 .gitignore create mode 100644 Gopkg.lock create mode 100644 Gopkg.toml create mode 100644 LICENSE create mode 100644 cmd/kubectl-delivery/Dockerfile create mode 100755 cmd/kubectl-delivery/deliver_kubectl.sh create mode 100644 cmd/mpi-operator/Dockerfile create mode 100644 cmd/mpi-operator/main.go create mode 100644 deploy/0-crd.yaml create mode 100644 deploy/1-namespace.yaml create mode 100644 deploy/2-rbac.yaml create mode 100644 deploy/3-mpi-operator.yaml create mode 100644 examples/tensorflow-benchmarks.yaml create mode 100644 examples/tensorflow-benchmarks/Dockerfile create mode 100644 hack/custom-boilerplate.go.txt create mode 100755 hack/update-codegen.sh create mode 100755 hack/verify-codegen.sh create mode 100644 pkg/apis/kubeflow/v1alpha1/doc.go create mode 100644 pkg/apis/kubeflow/v1alpha1/register.go create mode 100644 pkg/apis/kubeflow/v1alpha1/types.go create mode 100644 pkg/apis/kubeflow/v1alpha1/zz_generated.deepcopy.go create mode 100644 pkg/client/clientset/versioned/clientset.go create mode 100644 pkg/client/clientset/versioned/doc.go create mode 100644 pkg/client/clientset/versioned/fake/clientset_generated.go create mode 100644 pkg/client/clientset/versioned/fake/doc.go create mode 100644 pkg/client/clientset/versioned/fake/register.go create mode 100644 pkg/client/clientset/versioned/scheme/doc.go create mode 100644 pkg/client/clientset/versioned/scheme/register.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go create mode 100644 
pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_mpijob.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go create mode 100644 pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/mpijob.go create mode 100644 pkg/client/informers/externalversions/factory.go create mode 100644 pkg/client/informers/externalversions/generic.go create mode 100644 pkg/client/informers/externalversions/internalinterfaces/factory_interfaces.go create mode 100644 pkg/client/informers/externalversions/kubeflow/interface.go create mode 100644 pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go create mode 100644 pkg/client/informers/externalversions/kubeflow/v1alpha1/mpijob.go create mode 100644 pkg/client/listers/kubeflow/v1alpha1/expansion_generated.go create mode 100644 pkg/client/listers/kubeflow/v1alpha1/mpijob.go create mode 100644 pkg/controllers/mpi_job_controller.go create mode 100644 pkg/controllers/mpi_job_controller_test.go diff --git a/.dockerignore b/.dockerignore new file mode 120000 index 0000000..3e4e48b --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.gitignore \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c36e38 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea/ +vendor/ diff --git a/Gopkg.lock b/Gopkg.lock new file mode 100644 index 0000000..a7642fb --- /dev/null +++ b/Gopkg.lock @@ -0,0 +1,471 @@ +# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. 
+ + +[[projects]] + name = "github.com/davecgh/go-spew" + packages = ["spew"] + revision = "346938d642f2ec3594ed81d874461961cd0faa76" + version = "v1.1.0" + +[[projects]] + name = "github.com/ghodss/yaml" + packages = ["."] + revision = "0ca9ea5df5451ffdf184b4428c902747c2c11cd7" + version = "v1.0.0" + +[[projects]] + name = "github.com/gogo/protobuf" + packages = [ + "proto", + "sortkeys" + ] + revision = "1adfc126b41513cc696b209667c8656ea7aac67c" + version = "v1.0.0" + +[[projects]] + branch = "master" + name = "github.com/golang/glog" + packages = ["."] + revision = "23def4e6c14b4da8ac2ed8007337bc5eb5007998" + +[[projects]] + branch = "master" + name = "github.com/golang/groupcache" + packages = ["lru"] + revision = "24b0969c4cb722950103eed87108c8d291a8df00" + +[[projects]] + name = "github.com/golang/protobuf" + packages = [ + "proto", + "ptypes", + "ptypes/any", + "ptypes/duration", + "ptypes/timestamp" + ] + revision = "b4deda0973fb4c70b50d226b1af49f3da59f5265" + version = "v1.1.0" + +[[projects]] + branch = "master" + name = "github.com/google/gofuzz" + packages = ["."] + revision = "24818f796faf91cd76ec7bddd72458fbced7a6c1" + +[[projects]] + name = "github.com/googleapis/gnostic" + packages = [ + "OpenAPIv2", + "compiler", + "extensions" + ] + revision = "7c663266750e7d82587642f65e60bc4083f1f84e" + version = "v0.2.0" + +[[projects]] + branch = "master" + name = "github.com/hashicorp/golang-lru" + packages = [ + ".", + "simplelru" + ] + revision = "0fb14efe8c47ae851c0034ed7a448854d3d34cf3" + +[[projects]] + branch = "master" + name = "github.com/howeyc/gopass" + packages = ["."] + revision = "bf9dde6d0d2c004a008c27aaee91170c786f6db8" + +[[projects]] + name = "github.com/imdario/mergo" + packages = ["."] + revision = "9d5f1277e9a8ed20c3684bda8fde67c05628518c" + version = "v0.3.4" + +[[projects]] + name = "github.com/json-iterator/go" + packages = ["."] + revision = "ca39e5af3ece67bbcda3d0f4f56a8e24d9f2dad4" + version = "1.1.3" + +[[projects]] + name = 
"github.com/modern-go/concurrent" + packages = ["."] + revision = "bacd9c7ef1dd9b15be4a9909b8ac7a4e313eec94" + version = "1.0.3" + +[[projects]] + name = "github.com/modern-go/reflect2" + packages = ["."] + revision = "1df9eeb2bb81f327b96228865c5687bc2194af3f" + version = "1.0.0" + +[[projects]] + name = "github.com/spf13/pflag" + packages = ["."] + revision = "583c0c0531f06d5278b7d917446061adc344b5cd" + version = "v1.0.1" + +[[projects]] + branch = "master" + name = "golang.org/x/crypto" + packages = ["ssh/terminal"] + revision = "5ba7f63082460102a45837dbd1827e10f9479ac0" + +[[projects]] + branch = "master" + name = "golang.org/x/net" + packages = [ + "context", + "http/httpguts", + "http2", + "http2/hpack", + "idna" + ] + revision = "1e491301e022f8f977054da4c2d852decd59571f" + +[[projects]] + branch = "master" + name = "golang.org/x/sys" + packages = [ + "unix", + "windows" + ] + revision = "c11f84a56e43e20a78cee75a7c034031ecf57d1f" + +[[projects]] + name = "golang.org/x/text" + packages = [ + "collate", + "collate/build", + "internal/colltab", + "internal/gen", + "internal/tag", + "internal/triegen", + "internal/ucd", + "language", + "secure/bidirule", + "transform", + "unicode/bidi", + "unicode/cldr", + "unicode/norm", + "unicode/rangetable" + ] + revision = "f21a4dfb5e38f5895301dc265a8def02365cc3d0" + version = "v0.3.0" + +[[projects]] + branch = "master" + name = "golang.org/x/time" + packages = ["rate"] + revision = "fbb02b2291d28baffd63558aa44b4b56f178d650" + +[[projects]] + branch = "master" + name = "golang.org/x/tools" + packages = [ + "go/ast/astutil", + "imports", + "internal/fastwalk" + ] + revision = "a5b4c53f6e8bdcafa95a94671bf2d1203365858b" + +[[projects]] + name = "gopkg.in/inf.v0" + packages = ["."] + revision = "d2d2541c53f18d2a059457998ce2876cc8e67cbf" + version = "v0.9.1" + +[[projects]] + name = "gopkg.in/yaml.v2" + packages = ["."] + revision = "5420a8b6744d3b0345ab293f6fcba19c978f1183" + version = "v2.2.1" + +[[projects]] + name = 
"k8s.io/api" + packages = [ + "admissionregistration/v1alpha1", + "admissionregistration/v1beta1", + "apps/v1", + "apps/v1beta1", + "apps/v1beta2", + "authentication/v1", + "authentication/v1beta1", + "authorization/v1", + "authorization/v1beta1", + "autoscaling/v1", + "autoscaling/v2beta1", + "batch/v1", + "batch/v1beta1", + "batch/v2alpha1", + "certificates/v1beta1", + "core/v1", + "events/v1beta1", + "extensions/v1beta1", + "networking/v1", + "policy/v1beta1", + "rbac/v1", + "rbac/v1alpha1", + "rbac/v1beta1", + "scheduling/v1alpha1", + "settings/v1alpha1", + "storage/v1", + "storage/v1alpha1", + "storage/v1beta1" + ] + revision = "feb48db456a5912850dcccbd42a3535382ba76de" + version = "kubernetes-1.10.3" + +[[projects]] + branch = "release-1.10" + name = "k8s.io/apimachinery" + packages = [ + "pkg/api/errors", + "pkg/api/meta", + "pkg/api/resource", + "pkg/apis/meta/internalversion", + "pkg/apis/meta/v1", + "pkg/apis/meta/v1/unstructured", + "pkg/apis/meta/v1beta1", + "pkg/conversion", + "pkg/conversion/queryparams", + "pkg/fields", + "pkg/labels", + "pkg/runtime", + "pkg/runtime/schema", + "pkg/runtime/serializer", + "pkg/runtime/serializer/json", + "pkg/runtime/serializer/protobuf", + "pkg/runtime/serializer/recognizer", + "pkg/runtime/serializer/streaming", + "pkg/runtime/serializer/versioning", + "pkg/selection", + "pkg/types", + "pkg/util/cache", + "pkg/util/clock", + "pkg/util/diff", + "pkg/util/errors", + "pkg/util/framer", + "pkg/util/intstr", + "pkg/util/json", + "pkg/util/mergepatch", + "pkg/util/net", + "pkg/util/runtime", + "pkg/util/sets", + "pkg/util/strategicpatch", + "pkg/util/validation", + "pkg/util/validation/field", + "pkg/util/wait", + "pkg/util/yaml", + "pkg/version", + "pkg/watch", + "third_party/forked/golang/json", + "third_party/forked/golang/reflect" + ] + revision = "31dade610c053669d8054bfd847da657251e8c1a" + +[[projects]] + name = "k8s.io/client-go" + packages = [ + "discovery", + "discovery/fake", + "informers", + 
"informers/admissionregistration", + "informers/admissionregistration/v1alpha1", + "informers/admissionregistration/v1beta1", + "informers/apps", + "informers/apps/v1", + "informers/apps/v1beta1", + "informers/apps/v1beta2", + "informers/autoscaling", + "informers/autoscaling/v1", + "informers/autoscaling/v2beta1", + "informers/batch", + "informers/batch/v1", + "informers/batch/v1beta1", + "informers/batch/v2alpha1", + "informers/certificates", + "informers/certificates/v1beta1", + "informers/core", + "informers/core/v1", + "informers/events", + "informers/events/v1beta1", + "informers/extensions", + "informers/extensions/v1beta1", + "informers/internalinterfaces", + "informers/networking", + "informers/networking/v1", + "informers/policy", + "informers/policy/v1beta1", + "informers/rbac", + "informers/rbac/v1", + "informers/rbac/v1alpha1", + "informers/rbac/v1beta1", + "informers/scheduling", + "informers/scheduling/v1alpha1", + "informers/settings", + "informers/settings/v1alpha1", + "informers/storage", + "informers/storage/v1", + "informers/storage/v1alpha1", + "informers/storage/v1beta1", + "kubernetes", + "kubernetes/fake", + "kubernetes/scheme", + "kubernetes/typed/admissionregistration/v1alpha1", + "kubernetes/typed/admissionregistration/v1alpha1/fake", + "kubernetes/typed/admissionregistration/v1beta1", + "kubernetes/typed/admissionregistration/v1beta1/fake", + "kubernetes/typed/apps/v1", + "kubernetes/typed/apps/v1/fake", + "kubernetes/typed/apps/v1beta1", + "kubernetes/typed/apps/v1beta1/fake", + "kubernetes/typed/apps/v1beta2", + "kubernetes/typed/apps/v1beta2/fake", + "kubernetes/typed/authentication/v1", + "kubernetes/typed/authentication/v1/fake", + "kubernetes/typed/authentication/v1beta1", + "kubernetes/typed/authentication/v1beta1/fake", + "kubernetes/typed/authorization/v1", + "kubernetes/typed/authorization/v1/fake", + "kubernetes/typed/authorization/v1beta1", + "kubernetes/typed/authorization/v1beta1/fake", + "kubernetes/typed/autoscaling/v1", 
+ "kubernetes/typed/autoscaling/v1/fake", + "kubernetes/typed/autoscaling/v2beta1", + "kubernetes/typed/autoscaling/v2beta1/fake", + "kubernetes/typed/batch/v1", + "kubernetes/typed/batch/v1/fake", + "kubernetes/typed/batch/v1beta1", + "kubernetes/typed/batch/v1beta1/fake", + "kubernetes/typed/batch/v2alpha1", + "kubernetes/typed/batch/v2alpha1/fake", + "kubernetes/typed/certificates/v1beta1", + "kubernetes/typed/certificates/v1beta1/fake", + "kubernetes/typed/core/v1", + "kubernetes/typed/core/v1/fake", + "kubernetes/typed/events/v1beta1", + "kubernetes/typed/events/v1beta1/fake", + "kubernetes/typed/extensions/v1beta1", + "kubernetes/typed/extensions/v1beta1/fake", + "kubernetes/typed/networking/v1", + "kubernetes/typed/networking/v1/fake", + "kubernetes/typed/policy/v1beta1", + "kubernetes/typed/policy/v1beta1/fake", + "kubernetes/typed/rbac/v1", + "kubernetes/typed/rbac/v1/fake", + "kubernetes/typed/rbac/v1alpha1", + "kubernetes/typed/rbac/v1alpha1/fake", + "kubernetes/typed/rbac/v1beta1", + "kubernetes/typed/rbac/v1beta1/fake", + "kubernetes/typed/scheduling/v1alpha1", + "kubernetes/typed/scheduling/v1alpha1/fake", + "kubernetes/typed/settings/v1alpha1", + "kubernetes/typed/settings/v1alpha1/fake", + "kubernetes/typed/storage/v1", + "kubernetes/typed/storage/v1/fake", + "kubernetes/typed/storage/v1alpha1", + "kubernetes/typed/storage/v1alpha1/fake", + "kubernetes/typed/storage/v1beta1", + "kubernetes/typed/storage/v1beta1/fake", + "listers/admissionregistration/v1alpha1", + "listers/admissionregistration/v1beta1", + "listers/apps/v1", + "listers/apps/v1beta1", + "listers/apps/v1beta2", + "listers/autoscaling/v1", + "listers/autoscaling/v2beta1", + "listers/batch/v1", + "listers/batch/v1beta1", + "listers/batch/v2alpha1", + "listers/certificates/v1beta1", + "listers/core/v1", + "listers/events/v1beta1", + "listers/extensions/v1beta1", + "listers/networking/v1", + "listers/policy/v1beta1", + "listers/rbac/v1", + "listers/rbac/v1alpha1", + "listers/rbac/v1beta1", 
+ "listers/scheduling/v1alpha1", + "listers/settings/v1alpha1", + "listers/storage/v1", + "listers/storage/v1alpha1", + "listers/storage/v1beta1", + "pkg/apis/clientauthentication", + "pkg/apis/clientauthentication/v1alpha1", + "pkg/version", + "plugin/pkg/client/auth/exec", + "rest", + "rest/watch", + "testing", + "tools/auth", + "tools/cache", + "tools/clientcmd", + "tools/clientcmd/api", + "tools/clientcmd/api/latest", + "tools/clientcmd/api/v1", + "tools/metrics", + "tools/pager", + "tools/record", + "tools/reference", + "transport", + "util/buffer", + "util/cert", + "util/flowcontrol", + "util/homedir", + "util/integer", + "util/retry", + "util/workqueue" + ] + revision = "29ae1f00c3d8bb759d6246c357573a9af3c659c1" + version = "kubernetes-1.10.3" + +[[projects]] + branch = "master" + name = "k8s.io/code-generator" + packages = [ + "cmd/client-gen", + "cmd/client-gen/args", + "cmd/client-gen/generators", + "cmd/client-gen/generators/fake", + "cmd/client-gen/generators/scheme", + "cmd/client-gen/generators/util", + "cmd/client-gen/path", + "cmd/client-gen/types", + "pkg/util" + ] + revision = "2381612e86473457f7e1b8f7edf16cf1e191d859" + +[[projects]] + branch = "master" + name = "k8s.io/gengo" + packages = [ + "args", + "generator", + "namer", + "parser", + "types" + ] + revision = "2e1a79edcaecf0bfbde129a1fd55624b66adb699" + +[[projects]] + branch = "master" + name = "k8s.io/kube-openapi" + packages = ["pkg/util/proto"] + revision = "8a9b82f00b3a86eac24681da3f9fe6c34c01cea2" + +[[projects]] + branch = "master" + name = "k8s.io/sample-controller" + packages = ["pkg/signals"] + revision = "9946af3e3014758bed13e404bcd95ed27a4e37c2" + +[solve-meta] + analyzer-name = "dep" + analyzer-version = 1 + inputs-digest = "80b42d5ea1ce943bcf8a5b5cdfd29034113dfcb280253423c82a5a5afbcef951" + solver-name = "gps-cdcl" + solver-version = 1 diff --git a/Gopkg.toml b/Gopkg.toml new file mode 100644 index 0000000..2fae35f --- /dev/null +++ b/Gopkg.toml @@ -0,0 +1,13 @@ +required = 
["k8s.io/code-generator/cmd/client-gen"] + +[[override]] + name = "k8s.io/api" + version = "kubernetes-1.10.3" + +[[override]] + name = "k8s.io/apimachinery" + version = "kubernetes-1.10.3" + +[[override]] + name = "k8s.io/client-go" + version = "kubernetes-1.10.3" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index c1050fe..3a53504 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,52 @@ -# mpi-operator -Repository for the MPI operator. +# MPI Operator + +The MPI Operator makes it easy to run allreduce-style distributed training. + +## Build + +Check out the code: +```shell +mkdir -p ${GOPATH}/src/github.com/kubeflow +cd ${GOPATH}/src/github.com/kubeflow +git clone https://github.com/kubeflow/mpi-operator.git +cd mpi-operator +``` + +Build and push the `mpi-operator` Docker image: +```shell +docker build -t rongou/mpi-operator:0.1.0 -f cmd/mpi-operator/Dockerfile . +docker push rongou/mpi-operator:0.1.0 +``` + +Build and push the `kubectl-delivery` Docker image: +```shell +docker build -t rongou/kubectl-delivery:0.1.0 -f cmd/kubectl-delivery/Dockerfile . 
+docker push rongou/kubectl-delivery:0.1.0 +``` + +## Deploy + +```shell +kubectl create -f deploy/ +``` + +## Test + +Build and push the `horovod` Docker image (this takes a while): +```shell +docker build -t rongou/horovod https://github.com/uber/horovod.git +docker push rongou/horovod +``` + +Build and push the `tensorflow_benchmarks` Docker image: +```shell +docker build -t rongou/tensorflow_benchmarks examples/tensorflow-benchmarks +docker push rongou/tensorflow_benchmarks +``` + +Launch a multi-node tensorflow benchmark training job: +```shell +kubectl create -f examples/tensorflow-benchmarks.yaml +``` + +Once everything starts, the logs are available in the `launcher` pod. diff --git a/cmd/kubectl-delivery/Dockerfile b/cmd/kubectl-delivery/Dockerfile new file mode 100644 index 0000000..eb1d915 --- /dev/null +++ b/cmd/kubectl-delivery/Dockerfile @@ -0,0 +1,17 @@ +FROM alpine:3.7 AS build + +# Install kubectl. +ENV K8S_VERSION v1.10.3 +RUN apk add --no-cache wget +RUN wget -q https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/linux/amd64/kubectl +RUN chmod +x ./kubectl +RUN mv ./kubectl /bin/kubectl + +# Copy all project. +# This layer is rebuilt whenever a file has changed in the project directory. +COPY . /go/src/github.com/kubeflow/mpi-operator/ + +FROM alpine:3.7 +COPY --from=build /bin/kubectl /bin/kubectl +COPY --from=build /go/src/github.com/kubeflow/mpi-operator/cmd/kubectl-delivery/deliver_kubectl.sh . +ENTRYPOINT ["./deliver_kubectl.sh"] diff --git a/cmd/kubectl-delivery/deliver_kubectl.sh b/cmd/kubectl-delivery/deliver_kubectl.sh new file mode 100755 index 0000000..5cf3e56 --- /dev/null +++ b/cmd/kubectl-delivery/deliver_kubectl.sh @@ -0,0 +1,24 @@ +#!/bin/sh + +# Copyright 2018 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if ! which kubectl > /dev/null; then + echo "kubectl needs to be installed" + exit 1 +fi + +: ${TARGET_DIR:?"Need to set TARGET_DIR, e.g. /opt/kube"} + +cp $(which kubectl) ${TARGET_DIR} diff --git a/cmd/mpi-operator/Dockerfile b/cmd/mpi-operator/Dockerfile new file mode 100644 index 0000000..9672a92 --- /dev/null +++ b/cmd/mpi-operator/Dockerfile @@ -0,0 +1,24 @@ +FROM golang:1.10.2-alpine3.7 AS build + +# Install tools required to build the project. +# We need to run `docker build --no-cache .` to update those dependencies. +RUN apk add --no-cache git +RUN go get github.com/golang/dep/cmd/dep + +# Gopkg.toml and Gopkg.lock list project dependencies. +# These layers are only re-built when Gopkg files are updated. +COPY Gopkg.lock Gopkg.toml /go/src/github.com/kubeflow/mpi-operator/ +WORKDIR /go/src/github.com/kubeflow/mpi-operator/ + +# Install library dependencies. +RUN dep ensure -vendor-only + +# Copy all project and build it. +# This layer is rebuilt whenever a file has changed in the project directory. +COPY . /go/src/github.com/kubeflow/mpi-operator/ +RUN go build -o /bin/mpi-operator github.com/kubeflow/mpi-operator/cmd/mpi-operator + +FROM alpine:3.7 +COPY --from=build /bin/mpi-operator /bin/mpi-operator +ENTRYPOINT ["/bin/mpi-operator"] +CMD ["--help"] diff --git a/cmd/mpi-operator/main.go b/cmd/mpi-operator/main.go new file mode 100644 index 0000000..5d21b50 --- /dev/null +++ b/cmd/mpi-operator/main.go @@ -0,0 +1,89 @@ +// Copyright 2018 The Kubeflow Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "flag" + "time" + + "github.com/golang/glog" + kubeinformers "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/sample-controller/pkg/signals" + + clientset "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned" + informers "github.com/kubeflow/mpi-operator/pkg/client/informers/externalversions" + "github.com/kubeflow/mpi-operator/pkg/controllers" +) + +var ( + masterURL string + kubeConfig string + gpusPerNode int + kubectlDeliveryImage string +) + +func main() { + flag.Parse() + + // set up signals so we handle the first shutdown signal gracefully + stopCh := signals.SetupSignalHandler() + + cfg, err := clientcmd.BuildConfigFromFlags(masterURL, kubeConfig) + if err != nil { + glog.Fatalf("Error building kubeConfig: %s", err.Error()) + } + + kubeClient, err := kubernetes.NewForConfig(cfg) + if err != nil { + glog.Fatalf("Error building kubernetes clientset: %s", err.Error()) + } + + kubeflowClient, err := clientset.NewForConfig(cfg) + if err != nil { + glog.Fatalf("Error building kubeflow clientset: %s", err.Error()) + } + + kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeClient, time.Second*30) + kubeflowInformerFactory := informers.NewSharedInformerFactory(kubeflowClient, time.Second*30) + + controller := controllers.NewMPIJobController( + kubeClient, + kubeflowClient, + 
kubeInformerFactory.Core().V1().ConfigMaps(), + kubeInformerFactory.Core().V1().ServiceAccounts(), + kubeInformerFactory.Rbac().V1().Roles(), + kubeInformerFactory.Rbac().V1().RoleBindings(), + kubeInformerFactory.Apps().V1().StatefulSets(), + kubeInformerFactory.Batch().V1().Jobs(), + kubeflowInformerFactory.Kubeflow().V1alpha1().MPIJobs(), + gpusPerNode, + kubectlDeliveryImage) + + go kubeInformerFactory.Start(stopCh) + go kubeflowInformerFactory.Start(stopCh) + + if err = controller.Run(2, stopCh); err != nil { + glog.Fatalf("Error running controller: %s", err.Error()) + } +} + +func init() { + flag.StringVar(&kubeConfig, "kubeConfig", "", "Path to a kubeConfig. Only required if out-of-cluster.") + flag.StringVar(&masterURL, "master", "", "The address of the Kubernetes API server. Overrides any value in kubeConfig. Only required if out-of-cluster.") + flag.IntVar(&gpusPerNode, "gpus-per-node", 1, "The maximum number of GPUs available per node.") + flag.StringVar(&kubectlDeliveryImage, "kubectl-delivery-image", "", "The container image used to deliver the kubectl binary.") +} diff --git a/deploy/0-crd.yaml b/deploy/0-crd.yaml new file mode 100644 index 0000000..fc42c83 --- /dev/null +++ b/deploy/0-crd.yaml @@ -0,0 +1,15 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: mpijobs.kubeflow.org +spec: + group: kubeflow.org + version: v1alpha1 + scope: Namespaced + names: + plural: mpijobs + singular: mpijob + kind: MPIJob + shortNames: + - mj + - mpij diff --git a/deploy/1-namespace.yaml b/deploy/1-namespace.yaml new file mode 100644 index 0000000..2f25b59 --- /dev/null +++ b/deploy/1-namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: mpi-operator diff --git a/deploy/2-rbac.yaml b/deploy/2-rbac.yaml new file mode 100644 index 0000000..2e106f6 --- /dev/null +++ b/deploy/2-rbac.yaml @@ -0,0 +1,93 @@ +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: mpi-operator 
+rules: +- apiGroups: + - "" + resources: + - configmaps + - serviceaccounts + verbs: + - create + - list + - watch +# This is needed for the launcher Role. +- apiGroups: + - "" + resources: + - pods + verbs: + - get +# This is needed for the launcher Role. +- apiGroups: + - "" + resources: + - pods/exec + verbs: + - create +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +- apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + - rolebindings + verbs: + - create + - list + - watch +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - create + - list + - watch +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - list + - watch +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - create + - get +- apiGroups: + - kubeflow.org + resources: + - mpijobs + verbs: + - "*" +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: mpi-operator + namespace: mpi-operator +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: mpi-operator + namespace: mpi-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: mpi-operator +subjects: +- kind: ServiceAccount + name: mpi-operator + namespace: mpi-operator diff --git a/deploy/3-mpi-operator.yaml b/deploy/3-mpi-operator.yaml new file mode 100644 index 0000000..6cbb140 --- /dev/null +++ b/deploy/3-mpi-operator.yaml @@ -0,0 +1,27 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mpi-operator + namespace: mpi-operator + labels: + app: mpi-operator +spec: + replicas: 1 + selector: + matchLabels: + app: mpi-operator + template: + metadata: + labels: + app: mpi-operator + spec: + serviceAccountName: mpi-operator + containers: + - name: mpi-operator + image: rongou/mpi-operator:0.1.0 + args: [ + "--gpus-per-node", "8", + "--kubectl-delivery-image", + "rongou/kubectl-delivery:0.1.0" + ] + imagePullPolicy: Always diff --git 
a/examples/tensorflow-benchmarks.yaml b/examples/tensorflow-benchmarks.yaml new file mode 100644 index 0000000..6d4ce5c --- /dev/null +++ b/examples/tensorflow-benchmarks.yaml @@ -0,0 +1,12 @@ +# This file shows how to run multi-node training benchmarks using an MPIJob. +apiVersion: kubeflow.org/v1alpha1 +kind: MPIJob +metadata: + name: tensorflow-benchmarks-16 +spec: + gpus: 16 + template: + spec: + containers: + - image: rongou/tensorflow_benchmarks:latest + name: tensorflow-benchmarks diff --git a/examples/tensorflow-benchmarks/Dockerfile b/examples/tensorflow-benchmarks/Dockerfile new file mode 100644 index 0000000..0cf5a00 --- /dev/null +++ b/examples/tensorflow-benchmarks/Dockerfile @@ -0,0 +1,12 @@ +FROM rongou/horovod + +RUN mkdir /tensorflow +WORKDIR "/tensorflow" +RUN git clone https://github.com/tensorflow/benchmarks +WORKDIR "/tensorflow/benchmarks" + +CMD mpirun \ + python scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py \ + --model resnet101 \ + --batch_size 64 \ + --variable_update horovod diff --git a/hack/custom-boilerplate.go.txt b/hack/custom-boilerplate.go.txt new file mode 100644 index 0000000..c2dc160 --- /dev/null +++ b/hack/custom-boilerplate.go.txt @@ -0,0 +1,13 @@ +// Copyright YEAR The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh new file mode 100755 index 0000000..2e5f201 --- /dev/null +++ b/hack/update-codegen.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Copyright 2018 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +SCRIPT_ROOT=$(dirname ${BASH_SOURCE})/.. + +vendor/k8s.io/code-generator/generate-groups.sh "deepcopy,client,informer,lister" \ + github.com/kubeflow/mpi-operator/pkg/client github.com/kubeflow/mpi-operator/pkg/apis \ + kubeflow:v1alpha1 \ + --go-header-file ${SCRIPT_ROOT}/hack/custom-boilerplate.go.txt diff --git a/hack/verify-codegen.sh b/hack/verify-codegen.sh new file mode 100755 index 0000000..e7c38b4 --- /dev/null +++ b/hack/verify-codegen.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Copyright 2018 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +set -o errexit +set -o nounset +set -o pipefail + +SCRIPT_ROOT=$(dirname "${BASH_SOURCE}")/.. + +DIFFROOT="${SCRIPT_ROOT}/pkg" +TMP_DIFFROOT="${SCRIPT_ROOT}/_tmp/pkg" +_tmp="${SCRIPT_ROOT}/_tmp" + +cleanup() { + rm -rf "${_tmp}" +} +trap "cleanup" EXIT SIGINT + +cleanup + +mkdir -p "${TMP_DIFFROOT}" +cp -a "${DIFFROOT}"/* "${TMP_DIFFROOT}" + +"${SCRIPT_ROOT}/hack/update-codegen.sh" +echo "diffing ${DIFFROOT} against freshly generated codegen" +ret=0 +diff -Naupr "${DIFFROOT}" "${TMP_DIFFROOT}" || ret=$? +cp -a "${TMP_DIFFROOT}"/* "${DIFFROOT}" +if [[ $ret -eq 0 ]] +then + echo "${DIFFROOT} up to date." +else + echo "${DIFFROOT} is out of date. Please run hack/update-codegen.sh" + exit 1 +fi diff --git a/pkg/apis/kubeflow/v1alpha1/doc.go b/pkg/apis/kubeflow/v1alpha1/doc.go new file mode 100644 index 0000000..d6cbf05 --- /dev/null +++ b/pkg/apis/kubeflow/v1alpha1/doc.go @@ -0,0 +1,17 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +k8s:deepcopy-gen=package +// +groupName=kubeflow.org +package v1alpha1 diff --git a/pkg/apis/kubeflow/v1alpha1/register.go b/pkg/apis/kubeflow/v1alpha1/register.go new file mode 100644 index 0000000..7ed7113 --- /dev/null +++ b/pkg/apis/kubeflow/v1alpha1/register.go @@ -0,0 +1,49 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +const ( + version = "v1alpha1" + groupName = "kubeflow.org" + kind = "MPIJob" +) + +var ( + SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) + AddToScheme = SchemeBuilder.AddToScheme + SchemeGroupVersion = schema.GroupVersion{Group: groupName, Version: version} + SchemeGroupVersionKind = schema.GroupVersionKind{Group: groupName, Version: version, Kind: kind} +) + +// Resource takes an unqualified resource and returns a Group qualified GroupResource. +func Resource(resource string) schema.GroupResource { + return SchemeGroupVersion.WithResource(resource).GroupResource() +} + +// addKnownTypes adds the set of types defined in this package to the supplied scheme. +func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &MPIJob{}, + &MPIJobList{}, + ) + metav1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} diff --git a/pkg/apis/kubeflow/v1alpha1/types.go b/pkg/apis/kubeflow/v1alpha1/types.go new file mode 100644 index 0000000..9eaa2a5 --- /dev/null +++ b/pkg/apis/kubeflow/v1alpha1/types.go @@ -0,0 +1,69 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// +genclient +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +type MPIJob struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + Spec MPIJobSpec `json:"spec,omitempty"` + Status MPIJobStatus `json:"status,omitempty"` +} + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +type MPIJobList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata"` + Items []MPIJob `json:"items"` +} + +type MPIJobSpec struct { + // Specifies the desired number of GPUs the MPIJob should run on. + // +optional + GPUs *int32 `json:"gpus,omitempty"` + + // Describes the pod that will be created when executing an MPIJob. + Template corev1.PodTemplateSpec `json:"template,omitempty"` +} + +type MPIJobLauncherStatusType string + +// These are valid launcher statuses of an MPIJob. +const ( + // LauncherActive means the MPIJob launcher is actively running. + LauncherActive MPIJobLauncherStatusType = "Active" + // LauncherSucceeded means the MPIJob launcher has succeeded. + LauncherSucceeded MPIJobLauncherStatusType = "Succeeded" + // LauncherFailed means the MPIJob launcher has failed its execution. + LauncherFailed MPIJobLauncherStatusType = "Failed" +) + +type MPIJobStatus struct { + // Current status of the launcher job. 
+ // +optional + LauncherStatus MPIJobLauncherStatusType `json:"launcherStatus,omitempty"` + + // The number of available worker replicas. + // +optional + WorkerReplicas int32 `json:"workerReplicas,omitempty"` +} diff --git a/pkg/apis/kubeflow/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 0000000..6f7fa6b --- /dev/null +++ b/pkg/apis/kubeflow/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,126 @@ +// +build !ignore_autogenerated + +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by deepcopy-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MPIJob) DeepCopyInto(out *MPIJob) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + out.Status = in.Status + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIJob. +func (in *MPIJob) DeepCopy() *MPIJob { + if in == nil { + return nil + } + out := new(MPIJob) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *MPIJob) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MPIJobList) DeepCopyInto(out *MPIJobList) { + *out = *in + out.TypeMeta = in.TypeMeta + out.ListMeta = in.ListMeta + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]MPIJob, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIJobList. +func (in *MPIJobList) DeepCopy() *MPIJobList { + if in == nil { + return nil + } + out := new(MPIJobList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *MPIJobList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MPIJobSpec) DeepCopyInto(out *MPIJobSpec) { + *out = *in + if in.GPUs != nil { + in, out := &in.GPUs, &out.GPUs + if *in == nil { + *out = nil + } else { + *out = new(int32) + **out = **in + } + } + in.Template.DeepCopyInto(&out.Template) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIJobSpec. +func (in *MPIJobSpec) DeepCopy() *MPIJobSpec { + if in == nil { + return nil + } + out := new(MPIJobSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MPIJobStatus) DeepCopyInto(out *MPIJobStatus) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIJobStatus. 
+func (in *MPIJobStatus) DeepCopy() *MPIJobStatus { + if in == nil { + return nil + } + out := new(MPIJobStatus) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/client/clientset/versioned/clientset.go b/pkg/client/clientset/versioned/clientset.go new file mode 100644 index 0000000..13df929 --- /dev/null +++ b/pkg/client/clientset/versioned/clientset.go @@ -0,0 +1,96 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +package versioned + +import ( + kubeflowv1alpha1 "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" + discovery "k8s.io/client-go/discovery" + rest "k8s.io/client-go/rest" + flowcontrol "k8s.io/client-go/util/flowcontrol" +) + +type Interface interface { + Discovery() discovery.DiscoveryInterface + KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface + // Deprecated: please explicitly pick a version if possible. + Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface +} + +// Clientset contains the clients for groups. Each group has exactly one +// version included in a Clientset. 
+type Clientset struct { + *discovery.DiscoveryClient + kubeflowV1alpha1 *kubeflowv1alpha1.KubeflowV1alpha1Client +} + +// KubeflowV1alpha1 retrieves the KubeflowV1alpha1Client +func (c *Clientset) KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface { + return c.kubeflowV1alpha1 +} + +// Deprecated: Kubeflow retrieves the default version of KubeflowClient. +// Please explicitly pick a version. +func (c *Clientset) Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface { + return c.kubeflowV1alpha1 +} + +// Discovery retrieves the DiscoveryClient +func (c *Clientset) Discovery() discovery.DiscoveryInterface { + if c == nil { + return nil + } + return c.DiscoveryClient +} + +// NewForConfig creates a new Clientset for the given config. +func NewForConfig(c *rest.Config) (*Clientset, error) { + configShallowCopy := *c + if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { + configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) + } + var cs Clientset + var err error + cs.kubeflowV1alpha1, err = kubeflowv1alpha1.NewForConfig(&configShallowCopy) + if err != nil { + return nil, err + } + + cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfig(&configShallowCopy) + if err != nil { + return nil, err + } + return &cs, nil +} + +// NewForConfigOrDie creates a new Clientset for the given config and +// panics if there is an error in the config. +func NewForConfigOrDie(c *rest.Config) *Clientset { + var cs Clientset + cs.kubeflowV1alpha1 = kubeflowv1alpha1.NewForConfigOrDie(c) + + cs.DiscoveryClient = discovery.NewDiscoveryClientForConfigOrDie(c) + return &cs +} + +// New creates a new Clientset for the given RESTClient. 
+func New(c rest.Interface) *Clientset { + var cs Clientset + cs.kubeflowV1alpha1 = kubeflowv1alpha1.New(c) + + cs.DiscoveryClient = discovery.NewDiscoveryClient(c) + return &cs +} diff --git a/pkg/client/clientset/versioned/doc.go b/pkg/client/clientset/versioned/doc.go new file mode 100644 index 0000000..b31a069 --- /dev/null +++ b/pkg/client/clientset/versioned/doc.go @@ -0,0 +1,18 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +// This package has the automatically generated clientset. +package versioned diff --git a/pkg/client/clientset/versioned/fake/clientset_generated.go b/pkg/client/clientset/versioned/fake/clientset_generated.go new file mode 100644 index 0000000..d7c599d --- /dev/null +++ b/pkg/client/clientset/versioned/fake/clientset_generated.go @@ -0,0 +1,80 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + clientset "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned" + kubeflowv1alpha1 "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" + fakekubeflowv1alpha1 "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/discovery" + fakediscovery "k8s.io/client-go/discovery/fake" + "k8s.io/client-go/testing" +) + +// NewSimpleClientset returns a clientset that will respond with the provided objects. +// It's backed by a very simple object tracker that processes creates, updates and deletions as-is, +// without applying any validations and/or defaults. It shouldn't be considered a replacement +// for a real clientset and is mostly useful in simple unit tests. +func NewSimpleClientset(objects ...runtime.Object) *Clientset { + o := testing.NewObjectTracker(scheme, codecs.UniversalDecoder()) + for _, obj := range objects { + if err := o.Add(obj); err != nil { + panic(err) + } + } + + cs := &Clientset{} + cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} + cs.AddReactor("*", "*", testing.ObjectReaction(o)) + cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { + gvr := action.GetResource() + ns := action.GetNamespace() + watch, err := o.Watch(gvr, ns) + if err != nil { + return false, nil, err + } + return true, watch, nil + }) + + return cs +} + +// Clientset implements clientset.Interface. Meant to be embedded into a +// struct to get a default implementation. This makes faking out just the method +// you want to test easier. 
+type Clientset struct { + testing.Fake + discovery *fakediscovery.FakeDiscovery +} + +func (c *Clientset) Discovery() discovery.DiscoveryInterface { + return c.discovery +} + +var _ clientset.Interface = &Clientset{} + +// KubeflowV1alpha1 retrieves the KubeflowV1alpha1Client +func (c *Clientset) KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface { + return &fakekubeflowv1alpha1.FakeKubeflowV1alpha1{Fake: &c.Fake} +} + +// Kubeflow retrieves the KubeflowV1alpha1Client +func (c *Clientset) Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface { + return &fakekubeflowv1alpha1.FakeKubeflowV1alpha1{Fake: &c.Fake} +} diff --git a/pkg/client/clientset/versioned/fake/doc.go b/pkg/client/clientset/versioned/fake/doc.go new file mode 100644 index 0000000..305a862 --- /dev/null +++ b/pkg/client/clientset/versioned/fake/doc.go @@ -0,0 +1,18 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +// This package has the automatically generated fake clientset. +package fake diff --git a/pkg/client/clientset/versioned/fake/register.go b/pkg/client/clientset/versioned/fake/register.go new file mode 100644 index 0000000..6cfebea --- /dev/null +++ b/pkg/client/clientset/versioned/fake/register.go @@ -0,0 +1,52 @@ +// Copyright 2018 The Kubeflow Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + kubeflowv1alpha1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + schema "k8s.io/apimachinery/pkg/runtime/schema" + serializer "k8s.io/apimachinery/pkg/runtime/serializer" +) + +var scheme = runtime.NewScheme() +var codecs = serializer.NewCodecFactory(scheme) +var parameterCodec = runtime.NewParameterCodec(scheme) + +func init() { + v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"}) + AddToScheme(scheme) +} + +// AddToScheme adds all types of this clientset into the given scheme. This allows composition +// of clientsets, like in: +// +// import ( +// "k8s.io/client-go/kubernetes" +// clientsetscheme "k8s.io/client-go/kubernetes/scheme" +// aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" +// ) +// +// kclientset, _ := kubernetes.NewForConfig(c) +// aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) +// +// After this, RawExtensions in Kubernetes types will serialize kube-aggregator types +// correctly. 
+func AddToScheme(scheme *runtime.Scheme) { + kubeflowv1alpha1.AddToScheme(scheme) +} diff --git a/pkg/client/clientset/versioned/scheme/doc.go b/pkg/client/clientset/versioned/scheme/doc.go new file mode 100644 index 0000000..31002d7 --- /dev/null +++ b/pkg/client/clientset/versioned/scheme/doc.go @@ -0,0 +1,18 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +// This package contains the scheme of the automatically generated clientset. +package scheme diff --git a/pkg/client/clientset/versioned/scheme/register.go b/pkg/client/clientset/versioned/scheme/register.go new file mode 100644 index 0000000..2ddd18a --- /dev/null +++ b/pkg/client/clientset/versioned/scheme/register.go @@ -0,0 +1,52 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. 
+ +package scheme + +import ( + kubeflowv1alpha1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + schema "k8s.io/apimachinery/pkg/runtime/schema" + serializer "k8s.io/apimachinery/pkg/runtime/serializer" +) + +var Scheme = runtime.NewScheme() +var Codecs = serializer.NewCodecFactory(Scheme) +var ParameterCodec = runtime.NewParameterCodec(Scheme) + +func init() { + v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"}) + AddToScheme(Scheme) +} + +// AddToScheme adds all types of this clientset into the given scheme. This allows composition +// of clientsets, like in: +// +// import ( +// "k8s.io/client-go/kubernetes" +// clientsetscheme "k8s.io/client-go/kubernetes/scheme" +// aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" +// ) +// +// kclientset, _ := kubernetes.NewForConfig(c) +// aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) +// +// After this, RawExtensions in Kubernetes types will serialize kube-aggregator types +// correctly. +func AddToScheme(scheme *runtime.Scheme) { + kubeflowv1alpha1.AddToScheme(scheme) +} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go new file mode 100644 index 0000000..7070f8e --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/doc.go @@ -0,0 +1,18 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +// This package has the automatically generated typed clients. +package v1alpha1 diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go new file mode 100644 index 0000000..64c7b9f --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/doc.go @@ -0,0 +1,18 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +// Package fake has the automatically generated clients. +package fake diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go new file mode 100644 index 0000000..403a887 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_kubeflow_client.go @@ -0,0 +1,38 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + v1alpha1 "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1" + rest "k8s.io/client-go/rest" + testing "k8s.io/client-go/testing" +) + +type FakeKubeflowV1alpha1 struct { + *testing.Fake +} + +func (c *FakeKubeflowV1alpha1) MPIJobs(namespace string) v1alpha1.MPIJobInterface { + return &FakeMPIJobs{c, namespace} +} + +// RESTClient returns a nil *rest.RESTClient; this fake implementation holds +// no REST client with which to communicate with an API server. +func (c *FakeKubeflowV1alpha1) RESTClient() rest.Interface { + var ret *rest.RESTClient + return ret +} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_mpijob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_mpijob.go new file mode 100644 index 0000000..1dac322 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/fake/fake_mpijob.go @@ -0,0 +1,138 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + v1alpha1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + labels "k8s.io/apimachinery/pkg/labels" + schema "k8s.io/apimachinery/pkg/runtime/schema" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + testing "k8s.io/client-go/testing" +) + +// FakeMPIJobs implements MPIJobInterface +type FakeMPIJobs struct { + Fake *FakeKubeflowV1alpha1 + ns string +} + +var mpijobsResource = schema.GroupVersionResource{Group: "kubeflow.org", Version: "v1alpha1", Resource: "mpijobs"} + +var mpijobsKind = schema.GroupVersionKind{Group: "kubeflow.org", Version: "v1alpha1", Kind: "MPIJob"} + +// Get takes name of the mPIJob, and returns the corresponding mPIJob object, and an error if there is any. +func (c *FakeMPIJobs) Get(name string, options v1.GetOptions) (result *v1alpha1.MPIJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewGetAction(mpijobsResource, c.ns, name), &v1alpha1.MPIJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.MPIJob), err +} + +// List takes label and field selectors, and returns the list of MPIJobs that match those selectors. +func (c *FakeMPIJobs) List(opts v1.ListOptions) (result *v1alpha1.MPIJobList, err error) { + obj, err := c.Fake. 
+ Invokes(testing.NewListAction(mpijobsResource, mpijobsKind, c.ns, opts), &v1alpha1.MPIJobList{}) + + if obj == nil { + return nil, err + } + + label, _, _ := testing.ExtractFromListOptions(opts) + if label == nil { + label = labels.Everything() + } + list := &v1alpha1.MPIJobList{ListMeta: obj.(*v1alpha1.MPIJobList).ListMeta} + for _, item := range obj.(*v1alpha1.MPIJobList).Items { + if label.Matches(labels.Set(item.Labels)) { + list.Items = append(list.Items, item) + } + } + return list, err +} + +// Watch returns a watch.Interface that watches the requested mPIJobs. +func (c *FakeMPIJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { + return c.Fake. + InvokesWatch(testing.NewWatchAction(mpijobsResource, c.ns, opts)) + +} + +// Create takes the representation of a mPIJob and creates it. Returns the server's representation of the mPIJob, and an error, if there is any. +func (c *FakeMPIJobs) Create(mPIJob *v1alpha1.MPIJob) (result *v1alpha1.MPIJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewCreateAction(mpijobsResource, c.ns, mPIJob), &v1alpha1.MPIJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.MPIJob), err +} + +// Update takes the representation of a mPIJob and updates it. Returns the server's representation of the mPIJob, and an error, if there is any. +func (c *FakeMPIJobs) Update(mPIJob *v1alpha1.MPIJob) (result *v1alpha1.MPIJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewUpdateAction(mpijobsResource, c.ns, mPIJob), &v1alpha1.MPIJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.MPIJob), err +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). +func (c *FakeMPIJobs) UpdateStatus(mPIJob *v1alpha1.MPIJob) (*v1alpha1.MPIJob, error) { + obj, err := c.Fake. 
+ Invokes(testing.NewUpdateSubresourceAction(mpijobsResource, "status", c.ns, mPIJob), &v1alpha1.MPIJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.MPIJob), err +} + +// Delete takes name of the mPIJob and deletes it. Returns an error if one occurs. +func (c *FakeMPIJobs) Delete(name string, options *v1.DeleteOptions) error { + _, err := c.Fake. + Invokes(testing.NewDeleteAction(mpijobsResource, c.ns, name), &v1alpha1.MPIJob{}) + + return err +} + +// DeleteCollection deletes a collection of objects. +func (c *FakeMPIJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + action := testing.NewDeleteCollectionAction(mpijobsResource, c.ns, listOptions) + + _, err := c.Fake.Invokes(action, &v1alpha1.MPIJobList{}) + return err +} + +// Patch applies the patch and returns the patched mPIJob. +func (c *FakeMPIJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.MPIJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewPatchSubresourceAction(mpijobsResource, c.ns, name, data, subresources...), &v1alpha1.MPIJob{}) + + if obj == nil { + return nil, err + } + return obj.(*v1alpha1.MPIJob), err +} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go new file mode 100644 index 0000000..b9bc5fd --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/generated_expansion.go @@ -0,0 +1,19 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +type MPIJobExpansion interface{} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go new file mode 100644 index 0000000..ccfc579 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/kubeflow_client.go @@ -0,0 +1,88 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1alpha1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v1alpha1" + "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned/scheme" + serializer "k8s.io/apimachinery/pkg/runtime/serializer" + rest "k8s.io/client-go/rest" +) + +type KubeflowV1alpha1Interface interface { + RESTClient() rest.Interface + MPIJobsGetter +} + +// KubeflowV1alpha1Client is used to interact with features provided by the kubeflow.org group. 
+type KubeflowV1alpha1Client struct { + restClient rest.Interface +} + +func (c *KubeflowV1alpha1Client) MPIJobs(namespace string) MPIJobInterface { + return newMPIJobs(c, namespace) +} + +// NewForConfig creates a new KubeflowV1alpha1Client for the given config. +func NewForConfig(c *rest.Config) (*KubeflowV1alpha1Client, error) { + config := *c + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + client, err := rest.RESTClientFor(&config) + if err != nil { + return nil, err + } + return &KubeflowV1alpha1Client{client}, nil +} + +// NewForConfigOrDie creates a new KubeflowV1alpha1Client for the given config and +// panics if there is an error in the config. +func NewForConfigOrDie(c *rest.Config) *KubeflowV1alpha1Client { + client, err := NewForConfig(c) + if err != nil { + panic(err) + } + return client +} + +// New creates a new KubeflowV1alpha1Client for the given RESTClient. +func New(c rest.Interface) *KubeflowV1alpha1Client { + return &KubeflowV1alpha1Client{c} +} + +func setConfigDefaults(config *rest.Config) error { + gv := v1alpha1.SchemeGroupVersion + config.GroupVersion = &gv + config.APIPath = "/apis" + config.NegotiatedSerializer = serializer.DirectCodecFactory{CodecFactory: scheme.Codecs} + + if config.UserAgent == "" { + config.UserAgent = rest.DefaultKubernetesUserAgent() + } + + return nil +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *KubeflowV1alpha1Client) RESTClient() rest.Interface { + if c == nil { + return nil + } + return c.restClient +} diff --git a/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/mpijob.go b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/mpijob.go new file mode 100644 index 0000000..c2783f3 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/kubeflow/v1alpha1/mpijob.go @@ -0,0 +1,172 @@ +// Copyright 2018 The Kubeflow Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1alpha1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v1alpha1" + scheme "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned/scheme" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + rest "k8s.io/client-go/rest" +) + +// MPIJobsGetter has a method to return a MPIJobInterface. +// A group's client should implement this interface. +type MPIJobsGetter interface { + MPIJobs(namespace string) MPIJobInterface +} + +// MPIJobInterface has methods to work with MPIJob resources. 
+type MPIJobInterface interface { + Create(*v1alpha1.MPIJob) (*v1alpha1.MPIJob, error) + Update(*v1alpha1.MPIJob) (*v1alpha1.MPIJob, error) + UpdateStatus(*v1alpha1.MPIJob) (*v1alpha1.MPIJob, error) + Delete(name string, options *v1.DeleteOptions) error + DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error + Get(name string, options v1.GetOptions) (*v1alpha1.MPIJob, error) + List(opts v1.ListOptions) (*v1alpha1.MPIJobList, error) + Watch(opts v1.ListOptions) (watch.Interface, error) + Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.MPIJob, err error) + MPIJobExpansion +} + +// mPIJobs implements MPIJobInterface +type mPIJobs struct { + client rest.Interface + ns string +} + +// newMPIJobs returns an mPIJobs that uses c's REST client and is scoped to namespace. +func newMPIJobs(c *KubeflowV1alpha1Client, namespace string) *mPIJobs { + return &mPIJobs{ + client: c.RESTClient(), + ns: namespace, + } +} + +// Get takes the name of the mPIJob, and returns the corresponding mPIJob object, and an error if there is any. +func (c *mPIJobs) Get(name string, options v1.GetOptions) (result *v1alpha1.MPIJob, err error) { + result = &v1alpha1.MPIJob{} + err = c.client.Get(). + Namespace(c.ns). + Resource("mpijobs"). + Name(name). + VersionedParams(&options, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// List takes label and field selectors, and returns the list of MPIJobs that match those selectors. +func (c *mPIJobs) List(opts v1.ListOptions) (result *v1alpha1.MPIJobList, err error) { + result = &v1alpha1.MPIJobList{} + err = c.client.Get(). + Namespace(c.ns). + Resource("mpijobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// Watch returns a watch.Interface that watches the requested mPIJobs. +func (c *mPIJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { + opts.Watch = true + return c.client.Get(). + Namespace(c.ns). + Resource("mpijobs"). + VersionedParams(&opts, scheme.ParameterCodec). 
+ Watch() +} + +// Create takes the representation of a mPIJob and creates it. Returns the server's representation of the mPIJob, and an error, if there is any. +func (c *mPIJobs) Create(mPIJob *v1alpha1.MPIJob) (result *v1alpha1.MPIJob, err error) { + result = &v1alpha1.MPIJob{} + err = c.client.Post(). + Namespace(c.ns). + Resource("mpijobs"). + Body(mPIJob). + Do(). + Into(result) + return +} + +// Update takes the representation of a mPIJob and updates it. Returns the server's representation of the mPIJob, and an error, if there is any. +func (c *mPIJobs) Update(mPIJob *v1alpha1.MPIJob) (result *v1alpha1.MPIJob, err error) { + result = &v1alpha1.MPIJob{} + err = c.client.Put(). + Namespace(c.ns). + Resource("mpijobs"). + Name(mPIJob.Name). + Body(mPIJob). + Do(). + Into(result) + return +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). + +func (c *mPIJobs) UpdateStatus(mPIJob *v1alpha1.MPIJob) (result *v1alpha1.MPIJob, err error) { + result = &v1alpha1.MPIJob{} + err = c.client.Put(). + Namespace(c.ns). + Resource("mpijobs"). + Name(mPIJob.Name). + SubResource("status"). + Body(mPIJob). + Do(). + Into(result) + return +} + +// Delete takes name of the mPIJob and deletes it. Returns an error if one occurs. +func (c *mPIJobs) Delete(name string, options *v1.DeleteOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("mpijobs"). + Name(name). + Body(options). + Do(). + Error() +} + +// DeleteCollection deletes a collection of objects. +func (c *mPIJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("mpijobs"). + VersionedParams(&listOptions, scheme.ParameterCodec). + Body(options). + Do(). + Error() +} + +// Patch applies the patch and returns the patched mPIJob. 
+func (c *mPIJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.MPIJob, err error) { + result = &v1alpha1.MPIJob{} + err = c.client.Patch(pt). + Namespace(c.ns). + Resource("mpijobs"). + SubResource(subresources...). + Name(name). + Body(data). + Do(). + Into(result) + return +} diff --git a/pkg/client/informers/externalversions/factory.go b/pkg/client/informers/externalversions/factory.go new file mode 100644 index 0000000..1b7db10 --- /dev/null +++ b/pkg/client/informers/externalversions/factory.go @@ -0,0 +1,178 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by informer-gen. DO NOT EDIT. + +package externalversions + +import ( + reflect "reflect" + sync "sync" + time "time" + + versioned "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned" + internalinterfaces "github.com/kubeflow/mpi-operator/pkg/client/informers/externalversions/internalinterfaces" + kubeflow "github.com/kubeflow/mpi-operator/pkg/client/informers/externalversions/kubeflow" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + schema "k8s.io/apimachinery/pkg/runtime/schema" + cache "k8s.io/client-go/tools/cache" +) + +// SharedInformerOption defines the functional option type for SharedInformerFactory. 
+type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory + +type sharedInformerFactory struct { + client versioned.Interface + namespace string + tweakListOptions internalinterfaces.TweakListOptionsFunc + lock sync.Mutex + defaultResync time.Duration + customResync map[reflect.Type]time.Duration + + informers map[reflect.Type]cache.SharedIndexInformer + // startedInformers is used for tracking which informers have been started. + // This allows Start() to be called multiple times safely. + startedInformers map[reflect.Type]bool +} + +// WithCustomResyncConfig sets a custom resync period for the specified informer types. +func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption { + return func(factory *sharedInformerFactory) *sharedInformerFactory { + for k, v := range resyncConfig { + factory.customResync[reflect.TypeOf(k)] = v + } + return factory + } +} + +// WithTweakListOptions sets a custom filter on all listers of the configured SharedInformerFactory. +func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption { + return func(factory *sharedInformerFactory) *sharedInformerFactory { + factory.tweakListOptions = tweakListOptions + return factory + } +} + +// WithNamespace limits the SharedInformerFactory to the specified namespace. +func WithNamespace(namespace string) SharedInformerOption { + return func(factory *sharedInformerFactory) *sharedInformerFactory { + factory.namespace = namespace + return factory + } +} + +// NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all namespaces. +func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory { + return NewSharedInformerFactoryWithOptions(client, defaultResync) +} + +// NewFilteredSharedInformerFactory constructs a new instance of sharedInformerFactory. 
+// Listers obtained via this SharedInformerFactory will be subject to the same filters +// as specified here. +// Deprecated: Please use NewSharedInformerFactoryWithOptions instead +func NewFilteredSharedInformerFactory(client versioned.Interface, defaultResync time.Duration, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerFactory { + return NewSharedInformerFactoryWithOptions(client, defaultResync, WithNamespace(namespace), WithTweakListOptions(tweakListOptions)) +} + +// NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options. +func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration, options ...SharedInformerOption) SharedInformerFactory { + factory := &sharedInformerFactory{ + client: client, + namespace: v1.NamespaceAll, + defaultResync: defaultResync, + informers: make(map[reflect.Type]cache.SharedIndexInformer), + startedInformers: make(map[reflect.Type]bool), + customResync: make(map[reflect.Type]time.Duration), + } + + // Apply all options + for _, opt := range options { + factory = opt(factory) + } + + return factory +} + +// Start initializes all requested informers. +func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) { + f.lock.Lock() + defer f.lock.Unlock() + + for informerType, informer := range f.informers { + if !f.startedInformers[informerType] { + go informer.Run(stopCh) + f.startedInformers[informerType] = true + } + } +} + +// WaitForCacheSync waits for the caches of all started informers to sync. 
+func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool { + informers := func() map[reflect.Type]cache.SharedIndexInformer { + f.lock.Lock() + defer f.lock.Unlock() + + informers := map[reflect.Type]cache.SharedIndexInformer{} + for informerType, informer := range f.informers { + if f.startedInformers[informerType] { + informers[informerType] = informer + } + } + return informers + }() + + res := map[reflect.Type]bool{} + for informType, informer := range informers { + res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced) + } + return res +} + +// InformerFor returns the SharedIndexInformer for obj's type, creating it with +// newFunc on first use. +func (f *sharedInformerFactory) InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer { + f.lock.Lock() + defer f.lock.Unlock() + + informerType := reflect.TypeOf(obj) + informer, exists := f.informers[informerType] + if exists { + return informer + } + + resyncPeriod, exists := f.customResync[informerType] + if !exists { + resyncPeriod = f.defaultResync + } + + informer = newFunc(f.client, resyncPeriod) + f.informers[informerType] = informer + + return informer +} + +// SharedInformerFactory provides shared informers for resources in all known +// API group versions. +type SharedInformerFactory interface { + internalinterfaces.SharedInformerFactory + ForResource(resource schema.GroupVersionResource) (GenericInformer, error) + WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool + + Kubeflow() kubeflow.Interface +} + +func (f *sharedInformerFactory) Kubeflow() kubeflow.Interface { + return kubeflow.New(f, f.namespace, f.tweakListOptions) +} diff --git a/pkg/client/informers/externalversions/generic.go b/pkg/client/informers/externalversions/generic.go new file mode 100644 index 0000000..ce667b9 --- /dev/null +++ b/pkg/client/informers/externalversions/generic.go @@ -0,0 +1,60 @@ +// Copyright 2018 The Kubeflow Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by informer-gen. DO NOT EDIT. + +package externalversions + +import ( + "fmt" + + v1alpha1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v1alpha1" + schema "k8s.io/apimachinery/pkg/runtime/schema" + cache "k8s.io/client-go/tools/cache" +) + +// GenericInformer is a type of SharedIndexInformer which will locate and delegate to other +// sharedInformers based on type. +type GenericInformer interface { + Informer() cache.SharedIndexInformer + Lister() cache.GenericLister +} + +type genericInformer struct { + informer cache.SharedIndexInformer + resource schema.GroupResource +} + +// Informer returns the SharedIndexInformer. +func (f *genericInformer) Informer() cache.SharedIndexInformer { + return f.informer +} + +// Lister returns the GenericLister. 
+func (f *genericInformer) Lister() cache.GenericLister { + return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) +} + +// ForResource gives generic access to a shared informer of the matching type +// TODO extend this to unknown resources with a client pool +func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { + switch resource { + // Group=kubeflow.org, Version=v1alpha1 + case v1alpha1.SchemeGroupVersion.WithResource("mpijobs"): + return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1alpha1().MPIJobs().Informer()}, nil + + } + + return nil, fmt.Errorf("no informer found for %v", resource) +} diff --git a/pkg/client/informers/externalversions/internalinterfaces/factory_interfaces.go b/pkg/client/informers/externalversions/internalinterfaces/factory_interfaces.go new file mode 100644 index 0000000..cdb777d --- /dev/null +++ b/pkg/client/informers/externalversions/internalinterfaces/factory_interfaces.go @@ -0,0 +1,36 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by informer-gen. DO NOT EDIT. 
+ +package internalinterfaces + +import ( + time "time" + + versioned "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + cache "k8s.io/client-go/tools/cache" +) + +type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer + +// SharedInformerFactory is a small interface that allows adding an informer without an import cycle. +type SharedInformerFactory interface { + Start(stopCh <-chan struct{}) + InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer +} + +type TweakListOptionsFunc func(*v1.ListOptions) diff --git a/pkg/client/informers/externalversions/kubeflow/interface.go b/pkg/client/informers/externalversions/kubeflow/interface.go new file mode 100644 index 0000000..f945219 --- /dev/null +++ b/pkg/client/informers/externalversions/kubeflow/interface.go @@ -0,0 +1,44 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by informer-gen. DO NOT EDIT. + +package kubeflow + +import ( + internalinterfaces "github.com/kubeflow/mpi-operator/pkg/client/informers/externalversions/internalinterfaces" + v1alpha1 "github.com/kubeflow/mpi-operator/pkg/client/informers/externalversions/kubeflow/v1alpha1" +) + +// Interface provides access to each of this group's versions. 
type Interface interface {
	// V1alpha1 provides access to shared informers for resources in V1alpha1.
	V1alpha1() v1alpha1.Interface
}

// group holds the factory, namespace and list-option tweak that are passed
// unchanged to the version-level constructor.
type group struct {
	factory          internalinterfaces.SharedInformerFactory
	namespace        string
	tweakListOptions internalinterfaces.TweakListOptionsFunc
}

// New returns a new Interface.
func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface {
	return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions}
}

// V1alpha1 returns a new v1alpha1.Interface.
func (g *group) V1alpha1() v1alpha1.Interface {
	return v1alpha1.New(g.factory, g.namespace, g.tweakListOptions)
}

// The definitions below belong to the separate package v1alpha1
// (pkg/client/informers/externalversions/kubeflow/v1alpha1/interface.go).

// Interface provides access to all the informers in this group version.
type Interface interface {
	// MPIJobs returns a MPIJobInformer.
	MPIJobs() MPIJobInformer
}

// version holds the factory, namespace and list-option tweak that are copied
// into every informer wrapper it hands out.
type version struct {
	factory          internalinterfaces.SharedInformerFactory
	namespace        string
	tweakListOptions internalinterfaces.TweakListOptionsFunc
}

// New returns a new Interface.
func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface {
	return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions}
}

// MPIJobs returns a MPIJobInformer.
func (v *version) MPIJobs() MPIJobInformer {
	return &mPIJobInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions}
}
// MPIJobInformer provides access to a shared informer and lister for
// MPIJobs.
type MPIJobInformer interface {
	Informer() cache.SharedIndexInformer
	Lister() v1alpha1.MPIJobLister
}

// mPIJobInformer implements MPIJobInformer on top of a shared informer
// factory.
type mPIJobInformer struct {
	factory          internalinterfaces.SharedInformerFactory
	tweakListOptions internalinterfaces.TweakListOptionsFunc
	namespace        string
}

// NewMPIJobInformer constructs a new informer for MPIJob type.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewMPIJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer {
	// Same as the filtered constructor, with no list-option tweak.
	return NewFilteredMPIJobInformer(client, namespace, resyncPeriod, indexers, nil)
}

// NewFilteredMPIJobInformer constructs a new informer for MPIJob type.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
//
// tweakListOptions may be nil; when set it is applied to the options of every
// List and Watch call before the request is issued.
func NewFilteredMPIJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer {
	return cache.NewSharedIndexInformer(
		&cache.ListWatch{
			ListFunc: func(options v1.ListOptions) (runtime.Object, error) {
				if tweakListOptions != nil {
					tweakListOptions(&options)
				}
				return client.KubeflowV1alpha1().MPIJobs(namespace).List(options)
			},
			WatchFunc: func(options v1.ListOptions) (watch.Interface, error) {
				if tweakListOptions != nil {
					tweakListOptions(&options)
				}
				return client.KubeflowV1alpha1().MPIJobs(namespace).Watch(options)
			},
		},
		&kubeflow_v1alpha1.MPIJob{},
		resyncPeriod,
		indexers,
	)
}

// defaultInformer is the constructor handed to the factory: a filtered
// informer for f's namespace with the namespace index installed.
func (f *mPIJobInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer {
	return NewFilteredMPIJobInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions)
}

// Informer asks the factory for the MPIJob informer, passing defaultInformer
// as the constructor.
func (f *mPIJobInformer) Informer() cache.SharedIndexInformer {
	return f.factory.InformerFor(&kubeflow_v1alpha1.MPIJob{}, f.defaultInformer)
}

// Lister returns an MPIJobLister that reads from the informer's indexer.
func (f *mPIJobInformer) Lister() v1alpha1.MPIJobLister {
	return v1alpha1.NewMPIJobLister(f.Informer().GetIndexer())
}
// MPIJobListerExpansion allows custom methods to be added to
// MPIJobLister.
type MPIJobListerExpansion interface{}

// MPIJobNamespaceListerExpansion allows custom methods to be added to
// MPIJobNamespaceLister.
type MPIJobNamespaceListerExpansion interface{}

// MPIJobLister helps list MPIJobs.
type MPIJobLister interface {
	// List lists all MPIJobs in the indexer.
	List(selector labels.Selector) (ret []*v1alpha1.MPIJob, err error)
	// MPIJobs returns an object that can list and get MPIJobs.
	MPIJobs(namespace string) MPIJobNamespaceLister
	MPIJobListerExpansion
}

// mPIJobLister implements the MPIJobLister interface.
type mPIJobLister struct {
	indexer cache.Indexer
}

// NewMPIJobLister returns a new MPIJobLister.
func NewMPIJobLister(indexer cache.Indexer) MPIJobLister {
	return &mPIJobLister{indexer: indexer}
}

// List lists all MPIJobs in the indexer.
func (s *mPIJobLister) List(selector labels.Selector) (ret []*v1alpha1.MPIJob, err error) {
	err = cache.ListAll(s.indexer, selector, func(m interface{}) {
		// Unchecked assertion: every item passed to this callback is expected
		// to be a *v1alpha1.MPIJob.
		ret = append(ret, m.(*v1alpha1.MPIJob))
	})
	return ret, err
}

// MPIJobs returns an object that can list and get MPIJobs.
func (s *mPIJobLister) MPIJobs(namespace string) MPIJobNamespaceLister {
	// The namespace lister shares this lister's indexer.
	return mPIJobNamespaceLister{indexer: s.indexer, namespace: namespace}
}

// MPIJobNamespaceLister helps list and get MPIJobs.
type MPIJobNamespaceLister interface {
	// List lists all MPIJobs in the indexer for a given namespace.
	List(selector labels.Selector) (ret []*v1alpha1.MPIJob, err error)
	// Get retrieves the MPIJob from the indexer for a given namespace and name.
	Get(name string) (*v1alpha1.MPIJob, error)
	MPIJobNamespaceListerExpansion
}

// mPIJobNamespaceLister implements the MPIJobNamespaceLister
// interface.
type mPIJobNamespaceLister struct {
	indexer   cache.Indexer
	namespace string
}

// List lists all MPIJobs in the indexer for a given namespace.
func (s mPIJobNamespaceLister) List(selector labels.Selector) (ret []*v1alpha1.MPIJob, err error) {
	err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) {
		ret = append(ret, m.(*v1alpha1.MPIJob))
	})
	return ret, err
}

// Get retrieves the MPIJob from the indexer for a given namespace and name.
// A missing object is reported as a NotFound API error together with a nil
// MPIJob.
func (s mPIJobNamespaceLister) Get(name string) (*v1alpha1.MPIJob, error) {
	// Indexer keys have the form "<namespace>/<name>".
	obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name)
	if err != nil {
		return nil, err
	}
	if !exists {
		return nil, errors.NewNotFound(v1alpha1.Resource("mpijob"), name)
	}
	return obj.(*v1alpha1.MPIJob), nil
}
+ +package controllers + +import ( + "bytes" + "fmt" + "time" + + "github.com/golang/glog" + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + appsinformers "k8s.io/client-go/informers/apps/v1" + batchinformers "k8s.io/client-go/informers/batch/v1" + coreinformers "k8s.io/client-go/informers/core/v1" + rbacinformers "k8s.io/client-go/informers/rbac/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" + appslisters "k8s.io/client-go/listers/apps/v1" + batchlisters "k8s.io/client-go/listers/batch/v1" + corelisters "k8s.io/client-go/listers/core/v1" + rbaclisters "k8s.io/client-go/listers/rbac/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/workqueue" + + kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v1alpha1" + clientset "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned" + kubeflowScheme "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned/scheme" + informers "github.com/kubeflow/mpi-operator/pkg/client/informers/externalversions/kubeflow/v1alpha1" + listers "github.com/kubeflow/mpi-operator/pkg/client/listers/kubeflow/v1alpha1" +) + +const ( + controllerAgentName = "mpi-job-controller" + configSuffix = "-config" + configVolumeName = "mpi-job-config" + configMountPath = "/etc/mpi" + kubexecScriptName = "kubexec.sh" + hostfileName = "hostfile" + kubectlDeliveryName = "kubectl-delivery" + kubectlTargetDirEnv = "TARGET_DIR" + kubectlVolumeName = "mpi-job-kubectl" + kubectlMountPath = "/opt/kube" + launcherSuffix = "-launcher" + workerSuffix = "-worker" + gpuResourceName = "nvidia.com/gpu" +) + +const ( + // SuccessSynced is used 
as part of the Event 'reason' when an MPIJob is + // synced. + SuccessSynced = "Synced" + // ErrResourceExists is used as part of the Event 'reason' when an MPIJob + // fails to sync due to dependent resources of the same name already + // existing. + ErrResourceExists = "ErrResourceExists" + + // MessageResourceExists is the message used for Events when a resource + // fails to sync due to dependent resources already existing. + MessageResourceExists = "Resource %q already exists and is not managed by MPIJob" + // MessageResourceSynced is the message used for an Event fired when an + // MPIJob is synced successfully. + MessageResourceSynced = "MPIJob synced successfully" +) + +// MPIJobController is the controller implementation for MPIJob resources. +type MPIJobController struct { + // kubeClient is a standard kubernetes clientset. + kubeClient kubernetes.Interface + // kubeflowClient is a clientset for our own API group. + kubeflowClient clientset.Interface + + configMapLister corelisters.ConfigMapLister + configMapSynced cache.InformerSynced + serviceAccountLister corelisters.ServiceAccountLister + serviceAccountSynced cache.InformerSynced + roleLister rbaclisters.RoleLister + roleSynced cache.InformerSynced + roleBindingLister rbaclisters.RoleBindingLister + roleBindingSynced cache.InformerSynced + statefulSetLister appslisters.StatefulSetLister + statefulSetSynced cache.InformerSynced + jobLister batchlisters.JobLister + jobSynced cache.InformerSynced + mpiJobLister listers.MPIJobLister + mpiJobSynced cache.InformerSynced + + // queue is a rate limited work queue. This is used to queue work to be + // processed instead of performing it as soon as a change happens. This + // means we can ensure we only process a fixed amount of resources at a + // time, and makes it easy to ensure we are never processing the same item + // simultaneously in two different workers. 
+ queue workqueue.RateLimitingInterface + // recorder is an event recorder for recording Event resources to the + // Kubernetes API. + recorder record.EventRecorder + // The maximum number of GPUs per node. + gpusPerNode int + // The container image used to deliver the kubectl binary. + kubectlDeliveryImage string +} + +// NewMPIJobController returns a new MPIJob controller. +func NewMPIJobController( + kubeClient kubernetes.Interface, + kubeflowClient clientset.Interface, + configMapInformer coreinformers.ConfigMapInformer, + serviceAccountInformer coreinformers.ServiceAccountInformer, + roleInformer rbacinformers.RoleInformer, + roleBindingInformer rbacinformers.RoleBindingInformer, + statefulSetInformer appsinformers.StatefulSetInformer, + jobInformer batchinformers.JobInformer, + mpiJobInformer informers.MPIJobInformer, + gpusPerNode int, + kubectlDeliveryImage string) *MPIJobController { + + // Create event broadcaster. + // Add mpi-job-controller types to the default Kubernetes Scheme so Events + // can be logged for mpi-job-controller types. 
+ kubeflowScheme.AddToScheme(scheme.Scheme) + glog.V(4).Info("Creating event broadcaster") + eventBroadcaster := record.NewBroadcaster() + eventBroadcaster.StartLogging(glog.Infof) + eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + recorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: controllerAgentName}) + + controller := &MPIJobController{ + kubeClient: kubeClient, + kubeflowClient: kubeflowClient, + configMapLister: configMapInformer.Lister(), + configMapSynced: configMapInformer.Informer().HasSynced, + serviceAccountLister: serviceAccountInformer.Lister(), + serviceAccountSynced: serviceAccountInformer.Informer().HasSynced, + roleLister: roleInformer.Lister(), + roleSynced: roleInformer.Informer().HasSynced, + roleBindingLister: roleBindingInformer.Lister(), + roleBindingSynced: roleBindingInformer.Informer().HasSynced, + statefulSetLister: statefulSetInformer.Lister(), + statefulSetSynced: statefulSetInformer.Informer().HasSynced, + jobLister: jobInformer.Lister(), + jobSynced: jobInformer.Informer().HasSynced, + mpiJobLister: mpiJobInformer.Lister(), + mpiJobSynced: mpiJobInformer.Informer().HasSynced, + queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "MPIJobs"), + recorder: recorder, + gpusPerNode: gpusPerNode, + kubectlDeliveryImage: kubectlDeliveryImage, + } + + glog.Info("Setting up event handlers") + // Set up an event handler for when MPIJob resources change. + mpiJobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: controller.enqueueMPIJob, + UpdateFunc: func(old, new interface{}) { + controller.enqueueMPIJob(new) + }, + }) + + // Set up an event handler for when dependent resources change. This + // handler will lookup the owner of the given resource, and if it is + // owned by an MPIJob resource will enqueue that MPIJob resource for + // processing. 
This way, we don't need to implement custom logic for + // handling dependent resources. More info on this pattern: + // https://github.com/kubernetes/community/blob/8cafef897a22026d42f5e5bb3f104febe7e29830/contributors/devel/controllers.md + configMapInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: controller.handleObject, + UpdateFunc: func(old, new interface{}) { + newConfigMap := new.(*corev1.ConfigMap) + oldConfigMap := old.(*corev1.ConfigMap) + if newConfigMap.ResourceVersion == oldConfigMap.ResourceVersion { + // Periodic re-sync will send update events for all known + // ConfigMaps. Two different versions of the same ConfigMap + // will always have different RVs. + return + } + controller.handleObject(new) + }, + DeleteFunc: controller.handleObject, + }) + serviceAccountInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: controller.handleObject, + UpdateFunc: func(old, new interface{}) { + newServiceAccount := new.(*corev1.ServiceAccount) + oldServiceAccount := old.(*corev1.ServiceAccount) + if newServiceAccount.ResourceVersion == oldServiceAccount.ResourceVersion { + // Periodic re-sync will send update events for all known + // ServiceAccounts. Two different versions of the same ServiceAccount + // will always have different RVs. + return + } + controller.handleObject(new) + }, + DeleteFunc: controller.handleObject, + }) + roleInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: controller.handleObject, + UpdateFunc: func(old, new interface{}) { + newRole := new.(*rbacv1.Role) + oldRole := old.(*rbacv1.Role) + if newRole.ResourceVersion == oldRole.ResourceVersion { + // Periodic re-sync will send update events for all known + // Roles. Two different versions of the same Role + // will always have different RVs. 
+ return + } + controller.handleObject(new) + }, + DeleteFunc: controller.handleObject, + }) + roleBindingInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: controller.handleObject, + UpdateFunc: func(old, new interface{}) { + newRoleBinding := new.(*rbacv1.RoleBinding) + oldRoleBinding := old.(*rbacv1.RoleBinding) + if newRoleBinding.ResourceVersion == oldRoleBinding.ResourceVersion { + // Periodic re-sync will send update events for all known + // RoleBindings. Two different versions of the same RoleBinding + // will always have different RVs. + return + } + controller.handleObject(new) + }, + DeleteFunc: controller.handleObject, + }) + statefulSetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: controller.handleObject, + UpdateFunc: func(old, new interface{}) { + newStatefulSet := new.(*appsv1.StatefulSet) + oldStatefulSet := old.(*appsv1.StatefulSet) + if newStatefulSet.ResourceVersion == oldStatefulSet.ResourceVersion { + // Periodic re-sync will send update events for all known + // StatefulSets. Two different versions of the same StatefulSet + // will always have different RVs. + return + } + controller.handleObject(new) + }, + DeleteFunc: controller.handleObject, + }) + jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: controller.handleObject, + UpdateFunc: func(old, new interface{}) { + newJob := new.(*batchv1.Job) + oldJob := old.(*batchv1.Job) + if newJob.ResourceVersion == oldJob.ResourceVersion { + // Periodic re-sync will send update events for all known Jobs. + // Two different versions of the same Job will always have + // different RVs. + return + } + controller.handleObject(new) + }, + DeleteFunc: controller.handleObject, + }) + + return controller +} + +// Run will set up the event handlers for types we are interested in, as well +// as syncing informer caches and starting workers. 
It will block until stopCh +// is closed, at which point it will shutdown the work queue and wait for +// workers to finish processing their current work items. +func (c *MPIJobController) Run(threadiness int, stopCh <-chan struct{}) error { + defer runtime.HandleCrash() + defer c.queue.ShutDown() + + // Start the informer factories to begin populating the informer caches. + glog.Info("Starting MPIJob controller") + + // Wait for the caches to be synced before starting workers. + glog.Info("Waiting for informer caches to sync") + if ok := cache.WaitForCacheSync(stopCh, c.configMapSynced, c.serviceAccountSynced, c.roleSynced, c.roleBindingSynced, c.statefulSetSynced, c.jobSynced, c.mpiJobSynced); !ok { + return fmt.Errorf("failed to wait for caches to sync") + } + + glog.Info("Starting workers") + // Launch workers to process MPIJob resources. + for i := 0; i < threadiness; i++ { + go wait.Until(c.runWorker, time.Second, stopCh) + } + + glog.Info("Started workers") + <-stopCh + glog.Info("Shutting down workers") + + return nil +} + +// runWorker is a long-running function that will continually call the +// processNextWorkItem function in order to read and process a message on the +// work queue. +func (c *MPIJobController) runWorker() { + for c.processNextWorkItem() { + } +} + +// processNextWorkItem will read a single work item off the work queue and +// attempt to process it, by calling the syncHandler. +func (c *MPIJobController) processNextWorkItem() bool { + obj, shutdown := c.queue.Get() + + if shutdown { + return false + } + + // We wrap this block in a func so we can defer c.queue.Done. + err := func(obj interface{}) error { + // We call Done here so the work queue knows we have finished + // processing this item. We also must remember to call Forget if we + // do not want this work item being re-queued. 
For example, we do + // not call Forget if a transient error occurs, instead the item is + // put back on the work queue and attempted again after a back-off + // period. + defer c.queue.Done(obj) + var key string + var ok bool + // We expect strings to come off the work queue. These are of the + // form namespace/name. We do this as the delayed nature of the + // work queue means the items in the informer cache may actually be + // more up to date that when the item was initially put onto the + // work queue. + if key, ok = obj.(string); !ok { + // As the item in the work queue is actually invalid, we call + // Forget here else we'd go into a loop of attempting to + // process a work item that is invalid. + c.queue.Forget(obj) + runtime.HandleError(fmt.Errorf("expected string in workqueue but got %#v", obj)) + return nil + } + // Run the syncHandler, passing it the namespace/name string of the + // MPIJob resource to be synced. + if err := c.syncHandler(key); err != nil { + return fmt.Errorf("error syncing '%s': %s", key, err.Error()) + } + // Finally, if no error occurs we Forget this item so it does not + // get queued again until another change happens. + c.queue.Forget(obj) + glog.Infof("Successfully synced '%s'", key) + return nil + }(obj) + + if err != nil { + runtime.HandleError(err) + return true + } + + return true +} + +// syncHandler compares the actual state with the desired, and attempts to +// converge the two. It then updates the Status block of the MPIJob resource +// with the current status of the resource. +func (c *MPIJobController) syncHandler(key string) error { + // Convert the namespace/name string into a distinct namespace and name. + namespace, name, err := cache.SplitMetaNamespaceKey(key) + if err != nil { + runtime.HandleError(fmt.Errorf("invalid resource key: %s", key)) + return nil + } + + // Get the MPIJob with this namespace/name. 
+ mpiJob, err := c.mpiJobLister.MPIJobs(namespace).Get(name) + // The MPIJob may no longer exist, in which case we stop processing. + if errors.IsNotFound(err) { + runtime.HandleError(fmt.Errorf("mpi job '%s' in work queue no longer exists", key)) + return nil + } + if err != nil { + return err + } + + // Get the launcher Job for this MPIJob. + launcher, err := c.getLauncherJob(mpiJob) + if err != nil { + return err + } + // We're done if the launcher either succeeded or failed. + done := launcher != nil && (launcher.Status.Succeeded == 1 || launcher.Status.Failed == 1) + + totalGPUs := getTotalGPUs(mpiJob) + workerReplicas := c.getWorkerReplicas(totalGPUs, done) + gpusPerWorker := totalGPUs + if totalGPUs > c.gpusPerNode { + gpusPerWorker = c.gpusPerNode + } + + if !done { + // Get the ConfigMap for this MPIJob. + if config, err := c.getConfigMap(mpiJob, workerReplicas, gpusPerWorker); config == nil || err != nil { + return err + } + + // Get the launcher ServiceAccount for this MPIJob. + if sa, err := c.getLauncherServiceAccount(mpiJob); sa == nil || err != nil { + return err + } + + // Get the launcher Role for this MPIJob. + if r, err := c.getLauncherRole(mpiJob, workerReplicas); r == nil || err != nil { + return err + } + + // Get the launcher RoleBinding for this MPIJob. + if rb, err := c.getLauncherRoleBinding(mpiJob); rb == nil || err != nil { + return err + } + } + + worker, err := c.getWorkerStatefulSet(mpiJob, workerReplicas) + if err != nil { + return err + } + + // If the worker is ready, start the launcher. 
+ workerReady := workerReplicas == 0 || int(worker.Status.ReadyReplicas) == workerReplicas + if workerReady && launcher == nil { + launcherGPUs := totalGPUs + if launcherGPUs > c.gpusPerNode { + launcherGPUs = c.gpusPerNode + } + launcher, err = c.kubeClient.BatchV1().Jobs(namespace).Create(newLauncher(mpiJob, launcherGPUs, c.kubectlDeliveryImage)) + if err != nil { + return err + } + } + + // Finally, we update the status block of the MPIJob resource to reflect the + // current state of the world. + err = c.updateMPIJobStatus(mpiJob, launcher, worker) + if err != nil { + return err + } + + c.recorder.Event(mpiJob, corev1.EventTypeNormal, SuccessSynced, MessageResourceSynced) + return nil +} + +// getLauncherJob gets the launcher Job controlled by this MPIJob. +func (c *MPIJobController) getLauncherJob(mpiJob *kubeflow.MPIJob) (*batchv1.Job, error) { + launcher, err := c.jobLister.Jobs(mpiJob.Namespace).Get(mpiJob.Name + launcherSuffix) + if errors.IsNotFound(err) { + return nil, nil + } + if err != nil { + // If an error occurs during Get, we'll requeue the item so we can + // attempt processing again later. This could have been caused by a + // temporary network failure, or any other transient reason. + return nil, err + } + + // If the launcher is not controlled by this MPIJob resource, we should log + // a warning to the event recorder and return. + if !metav1.IsControlledBy(launcher, mpiJob) { + msg := fmt.Sprintf(MessageResourceExists, launcher.Name) + c.recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg) + return launcher, fmt.Errorf(msg) + } + + return launcher, nil +} + +// getTotalGPUs gets the total number of desired GPUs. Defaults to 1 if not specified. +func getTotalGPUs(mpiJob *kubeflow.MPIJob) int { + totalGPUs := 1 + if mpiJob.Spec.GPUs != nil { + totalGPUs = int(*mpiJob.Spec.GPUs) + } + return totalGPUs +} + +// getWorkerReplicas gets the desired number of worker replicas. 
+func (c *MPIJobController) getWorkerReplicas(totalGPUs int, done bool) int { + workerReplicas := 0 + if totalGPUs > c.gpusPerNode { + // The launcher also does work, so the # worker replicas needed is deducted by 1. + workerReplicas = totalGPUs/c.gpusPerNode - 1 + } + if done { + workerReplicas = 0 + } + return workerReplicas +} + +// getConfigMap gets the ConfigMap controlled by this MPIJob. +func (c *MPIJobController) getConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int, gpusPerWorker int) (*corev1.ConfigMap, error) { + cm, err := c.configMapLister.ConfigMaps(mpiJob.Namespace).Get(mpiJob.Name + configSuffix) + // If the ConfigMap doesn't exist, we'll create it. + if errors.IsNotFound(err) { + cm, err = c.kubeClient.CoreV1().ConfigMaps(mpiJob.Namespace).Create(newConfigMap(mpiJob, workerReplicas, gpusPerWorker)) + } + // If an error occurs during Get/Create, we'll requeue the item so we + // can attempt processing again later. This could have been caused by a + // temporary network failure, or any other transient reason. + if err != nil { + return nil, err + } + // If the ConfigMap is not controlled by this MPIJob resource, we + // should log a warning to the event recorder and return. + if !metav1.IsControlledBy(cm, mpiJob) { + msg := fmt.Sprintf(MessageResourceExists, cm.Name) + c.recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg) + return nil, fmt.Errorf(msg) + } + + return cm, nil +} + +// getLauncherServiceAccount gets the launcher ServiceAccount controlled by this MPIJob. +func (c *MPIJobController) getLauncherServiceAccount(mpiJob *kubeflow.MPIJob) (*corev1.ServiceAccount, error) { + sa, err := c.serviceAccountLister.ServiceAccounts(mpiJob.Namespace).Get(mpiJob.Name + launcherSuffix) + // If the ServiceAccount doesn't exist, we'll create it. 
+ if errors.IsNotFound(err) { + sa, err = c.kubeClient.CoreV1().ServiceAccounts(mpiJob.Namespace).Create(newLauncherServiceAccount(mpiJob)) + } + // If an error occurs during Get/Create, we'll requeue the item so we + // can attempt processing again later. This could have been caused by a + // temporary network failure, or any other transient reason. + if err != nil { + return nil, err + } + // If the launcher ServiceAccount is not controlled by this MPIJob resource, we + // should log a warning to the event recorder and return. + if !metav1.IsControlledBy(sa, mpiJob) { + msg := fmt.Sprintf(MessageResourceExists, sa.Name) + c.recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg) + return nil, fmt.Errorf(msg) + } + + return sa, nil +} + +// getLauncherRole gets the launcher Role controlled by this MPIJob. +func (c *MPIJobController) getLauncherRole(mpiJob *kubeflow.MPIJob, workerReplicas int) (*rbacv1.Role, error) { + role, err := c.roleLister.Roles(mpiJob.Namespace).Get(mpiJob.Name + launcherSuffix) + // If the Role doesn't exist, we'll create it. + if errors.IsNotFound(err) { + role, err = c.kubeClient.RbacV1().Roles(mpiJob.Namespace).Create(newLauncherRole(mpiJob, workerReplicas)) + } + // If an error occurs during Get/Create, we'll requeue the item so we + // can attempt processing again later. This could have been caused by a + // temporary network failure, or any other transient reason. + if err != nil { + return nil, err + } + // If the launcher Role is not controlled by this MPIJob resource, we + // should log a warning to the event recorder and return. + if !metav1.IsControlledBy(role, mpiJob) { + msg := fmt.Sprintf(MessageResourceExists, role.Name) + c.recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg) + return nil, fmt.Errorf(msg) + } + + return role, nil +} + +// getLauncherRoleBinding gets the launcher RoleBinding controlled by this MPIJob. 
+func (c *MPIJobController) getLauncherRoleBinding(mpiJob *kubeflow.MPIJob) (*rbacv1.RoleBinding, error) { + rb, err := c.roleBindingLister.RoleBindings(mpiJob.Namespace).Get(mpiJob.Name + launcherSuffix) + // If the RoleBinding doesn't exist, we'll create it. + if errors.IsNotFound(err) { + rb, err = c.kubeClient.RbacV1().RoleBindings(mpiJob.Namespace).Create(newLauncherRoleBinding(mpiJob)) + } + // If an error occurs during Get/Create, we'll requeue the item so we + // can attempt processing again later. This could have been caused by a + // temporary network failure, or any other transient reason. + if err != nil { + return nil, err + } + // If the launcher RoleBinding is not controlled by this MPIJob resource, we + // should log a warning to the event recorder and return. + if !metav1.IsControlledBy(rb, mpiJob) { + msg := fmt.Sprintf(MessageResourceExists, rb.Name) + c.recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg) + return nil, fmt.Errorf(msg) + } + + return rb, nil +} + +// getWorkerStatefulSet gets the worker StatefulSet controlled by this MPIJob. +func (c *MPIJobController) getWorkerStatefulSet(mpiJob *kubeflow.MPIJob, workerReplicas int) (*appsv1.StatefulSet, error) { + worker, err := c.statefulSetLister.StatefulSets(mpiJob.Namespace).Get(mpiJob.Name + workerSuffix) + // If the StatefulSet doesn't exist, we'll create it. + if errors.IsNotFound(err) && workerReplicas > 0 { + worker, err = c.kubeClient.AppsV1().StatefulSets(mpiJob.Namespace).Create(newWorker(mpiJob, int32(workerReplicas), c.gpusPerNode)) + } + // If an error occurs during Get/Create, we'll requeue the item so we + // can attempt processing again later. This could have been caused by a + // temporary network failure, or any other transient reason. + if err != nil && !errors.IsNotFound(err) { + return nil, err + } + + // If the worker is not controlled by this MPIJob resource, we should log + // a warning to the event recorder and return. 
+ if worker != nil && !metav1.IsControlledBy(worker, mpiJob) { + msg := fmt.Sprintf(MessageResourceExists, worker.Name) + c.recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg) + return nil, fmt.Errorf(msg) + } + + // If the worker is out of date, update the worker. + if worker != nil && int(*worker.Spec.Replicas) != workerReplicas { + worker, err = c.kubeClient.AppsV1().StatefulSets(mpiJob.Namespace).Update(newWorker(mpiJob, int32(workerReplicas), c.gpusPerNode)) + // If an error occurs during Update, we'll requeue the item so we can + // attempt processing again later. This could have been caused by a + // temporary network failure, or any other transient reason. + if err != nil { + return nil, err + } + } + + return worker, nil +} + +func (c *MPIJobController) updateMPIJobStatus(mpiJob *kubeflow.MPIJob, launcher *batchv1.Job, worker *appsv1.StatefulSet) error { + // NEVER modify objects from the store. It's a read-only, local cache. + // You can use DeepCopy() to make a deep copy of original object and modify this copy + // Or create a copy manually for better performance + mpiJobCopy := mpiJob.DeepCopy() + if launcher != nil { + if launcher.Status.Active > 0 { + mpiJobCopy.Status.LauncherStatus = kubeflow.LauncherActive + } else if launcher.Status.Succeeded > 0 { + mpiJobCopy.Status.LauncherStatus = kubeflow.LauncherSucceeded + } else if launcher.Status.Failed > 0 { + mpiJobCopy.Status.LauncherStatus = kubeflow.LauncherFailed + } + } + if worker != nil { + mpiJobCopy.Status.WorkerReplicas = worker.Status.ReadyReplicas + } + // Until #38113 is merged, we must use Update instead of UpdateStatus to + // update the Status block of the MPIJob resource. UpdateStatus will not + // allow changes to the Spec of the resource, which is ideal for ensuring + // nothing other than resource status has been updated. 
+ _, err := c.kubeflowClient.KubeflowV1alpha1().MPIJobs(mpiJob.Namespace).Update(mpiJobCopy) + return err +} + +// enqueueMPIJob takes a MPIJob resource and converts it into a namespace/name +// string which is then put onto the work queue. This method should *not* be +// passed resources of any type other than MPIJob. +func (c *MPIJobController) enqueueMPIJob(obj interface{}) { + var key string + var err error + if key, err = cache.MetaNamespaceKeyFunc(obj); err != nil { + runtime.HandleError(err) + return + } + c.queue.AddRateLimited(key) +} + +// handleObject will take any resource implementing metav1.Object and attempt +// to find the MPIJob resource that 'owns' it. It does this by looking at the +// objects metadata.ownerReferences field for an appropriate OwnerReference. +// It then enqueues that MPIJob resource to be processed. If the object does not +// have an appropriate OwnerReference, it will simply be skipped. +func (c *MPIJobController) handleObject(obj interface{}) { + var object metav1.Object + var ok bool + if object, ok = obj.(metav1.Object); !ok { + tombstone, ok := obj.(cache.DeletedFinalStateUnknown) + if !ok { + runtime.HandleError(fmt.Errorf("error decoding object, invalid type")) + return + } + object, ok = tombstone.Obj.(metav1.Object) + if !ok { + runtime.HandleError(fmt.Errorf("error decoding object tombstone, invalid type")) + return + } + glog.V(4).Infof("Recovered deleted object '%s' from tombstone", object.GetName()) + } + glog.V(4).Infof("Processing object: %s", object.GetName()) + if ownerRef := metav1.GetControllerOf(object); ownerRef != nil { + // If this object is not owned by a MPIJob, we should not do anything + // more with it. 
+ if ownerRef.Kind != "MPIJob" { + return + } + + mpiJob, err := c.mpiJobLister.MPIJobs(object.GetNamespace()).Get(ownerRef.Name) + if err != nil { + glog.V(4).Infof("ignoring orphaned object '%s' of mpi job '%s'", object.GetSelfLink(), ownerRef.Name) + return + } + + c.enqueueMPIJob(mpiJob) + return + } +} + +// newConfigMap creates a new ConfigMap containing configurations for an MPIJob +// resource. It also sets the appropriate OwnerReferences on the resource so +// handleObject can discover the MPIJob resource that 'owns' it. +func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int, gpusPerWorker int) *corev1.ConfigMap { + kubexec := fmt.Sprintf(`#!/bin/sh +set -x +POD_NAME=$1 +shift +%s/kubectl exec ${POD_NAME} -- /bin/sh -c "$*" +`, kubectlMountPath) + + var buffer bytes.Buffer + buffer.WriteString(fmt.Sprintf("localhost slots=%d max_slots=%d\n", gpusPerWorker, gpusPerWorker)) + for i := 0; i < workerReplicas; i++ { + buffer.WriteString(fmt.Sprintf("%s%s-%d slots=%d max_slots=%d\n", mpiJob.Name, workerSuffix, i, gpusPerWorker, gpusPerWorker)) + } + + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: mpiJob.Name + configSuffix, + Namespace: mpiJob.Namespace, + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(mpiJob, kubeflow.SchemeGroupVersionKind), + }, + }, + Data: map[string]string{ + hostfileName: buffer.String(), + kubexecScriptName: kubexec, + }, + } +} + +// newLauncherServiceAccount creates a new launcher ServiceAccount for an MPIJob +// resource. It also sets the appropriate OwnerReferences on the resource so +// handleObject can discover the MPIJob resource that 'owns' it. 
+func newLauncherServiceAccount(mpiJob *kubeflow.MPIJob) *corev1.ServiceAccount { + return &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: mpiJob.Name + launcherSuffix, + Namespace: mpiJob.Namespace, + Labels: map[string]string{ + "app": mpiJob.Name, + }, + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(mpiJob, kubeflow.SchemeGroupVersionKind), + }, + }, + } +} + +// newLauncherRole creates a new launcher Role for an MPIJob resource. It also +// sets the appropriate OwnerReferences on the resource so handleObject can +// discover the MPIJob resource that 'owns' it. +func newLauncherRole(mpiJob *kubeflow.MPIJob, workerReplicas int) *rbacv1.Role { + var podNames []string + for i := 0; i < workerReplicas; i++ { + podNames = append(podNames, fmt.Sprintf("%s%s-%d", mpiJob.Name, workerSuffix, i)) + } + return &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: mpiJob.Name + launcherSuffix, + Namespace: mpiJob.Namespace, + Labels: map[string]string{ + "app": mpiJob.Name, + }, + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(mpiJob, kubeflow.SchemeGroupVersionKind), + }, + }, + Rules: []rbacv1.PolicyRule{ + { + Verbs: []string{"get"}, + APIGroups: []string{""}, + Resources: []string{"pods"}, + ResourceNames: podNames, + }, + { + Verbs: []string{"create"}, + APIGroups: []string{""}, + Resources: []string{"pods/exec"}, + ResourceNames: podNames, + }, + }, + } +} + +// newLauncherRoleBinding creates a new launcher RoleBinding for an MPIJob +// resource. It also sets the appropriate OwnerReferences on the resource so +// handleObject can discover the MPIJob resource that 'owns' it. 
+func newLauncherRoleBinding(mpiJob *kubeflow.MPIJob) *rbacv1.RoleBinding { + launcherName := mpiJob.Name + launcherSuffix + return &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: launcherName, + Namespace: mpiJob.Namespace, + Labels: map[string]string{ + "app": mpiJob.Name, + }, + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(mpiJob, kubeflow.SchemeGroupVersionKind), + }, + }, + Subjects: []rbacv1.Subject{ + { + Kind: rbacv1.ServiceAccountKind, + Name: launcherName, + Namespace: mpiJob.Namespace, + }, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.GroupName, + Kind: "Role", + Name: launcherName, + }, + } +} + +// newWorker creates a new worker StatefulSet for an MPIJob resource. It also +// sets the appropriate OwnerReferences on the resource so handleObject can +// discover the MPIJob resource that 'owns' it. +func newWorker(mpiJob *kubeflow.MPIJob, desiredReplicas int32, gpus int) *appsv1.StatefulSet { + labels := map[string]string{ + "app": mpiJob.Name + workerSuffix, + } + + podSpec := mpiJob.Spec.Template.DeepCopy() + podSpec.Labels = labels + + container := podSpec.Spec.Containers[0] + container.Command = []string{"sleep"} + container.Args = []string{"365d"} + if container.Resources.Limits == nil { + container.Resources.Limits = make(corev1.ResourceList) + } + container.Resources.Limits[gpuResourceName] = *resource.NewQuantity(int64(gpus), resource.DecimalExponent) + + // We need the kubexec.sh script here because Open MPI checks for the path + // in every rank. 
+ container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{ + Name: configVolumeName, + MountPath: configMountPath, + }) + podSpec.Spec.Containers[0] = container + + scriptMode := int32(0555) + podSpec.Spec.Volumes = append(podSpec.Spec.Volumes, corev1.Volume{ + Name: configVolumeName, + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: mpiJob.Name + configSuffix, + }, + Items: []corev1.KeyToPath{ + { + Key: kubexecScriptName, + Path: kubexecScriptName, + Mode: &scriptMode, + }, + }, + }, + }, + }) + + return &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: mpiJob.Name + workerSuffix, + Namespace: mpiJob.Namespace, + Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(mpiJob, kubeflow.SchemeGroupVersionKind), + }, + }, + Spec: appsv1.StatefulSetSpec{ + PodManagementPolicy: appsv1.ParallelPodManagement, + Replicas: &desiredReplicas, + Selector: &metav1.LabelSelector{ + MatchLabels: labels, + }, + ServiceName: mpiJob.Name + workerSuffix, + Template: *podSpec, + }, + } +} + +// newLauncher creates a new launcher Job for an MPIJob resource. It also sets +// the appropriate OwnerReferences on the resource so handleObject can discover +// the MPIJob resource that 'owns' it. 
+func newLauncher(mpiJob *kubeflow.MPIJob, gpus int, kubectlDeliveryImage string) *batchv1.Job { + launcherName := mpiJob.Name + launcherSuffix + labels := map[string]string{ + "app": launcherName, + } + + podSpec := mpiJob.Spec.Template.DeepCopy() + podSpec.Labels = labels + podSpec.Spec.ServiceAccountName = launcherName + podSpec.Spec.InitContainers = append(podSpec.Spec.InitContainers, corev1.Container{ + Name: kubectlDeliveryName, + Image: kubectlDeliveryImage, + Env: []corev1.EnvVar{ + { + Name: kubectlTargetDirEnv, + Value: kubectlMountPath, + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: kubectlVolumeName, + MountPath: kubectlMountPath, + }, + }, + }) + container := podSpec.Spec.Containers[0] + container.Env = append(container.Env, + corev1.EnvVar{ + Name: "OMPI_MCA_plm_rsh_agent", + Value: fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName), + }, + corev1.EnvVar{ + Name: "OMPI_MCA_orte_default_hostfile", + Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName), + }) + if container.Resources.Limits == nil { + container.Resources.Limits = make(corev1.ResourceList) + } + container.Resources.Limits[gpuResourceName] = *resource.NewQuantity(int64(gpus), resource.DecimalExponent) + container.VolumeMounts = append(container.VolumeMounts, + corev1.VolumeMount{ + Name: kubectlVolumeName, + MountPath: kubectlMountPath, + }, + corev1.VolumeMount{ + Name: configVolumeName, + MountPath: configMountPath, + }) + podSpec.Spec.Containers[0] = container + podSpec.Spec.RestartPolicy = corev1.RestartPolicyOnFailure + scriptsMode := int32(0555) + hostfileMode := int32(0444) + podSpec.Spec.Volumes = append(podSpec.Spec.Volumes, + corev1.Volume{ + Name: kubectlVolumeName, + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + corev1.Volume{ + Name: configVolumeName, + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: mpiJob.Name + 
configSuffix, + }, + Items: []corev1.KeyToPath{ + { + Key: kubexecScriptName, + Path: kubexecScriptName, + Mode: &scriptsMode, + }, + { + Key: hostfileName, + Path: hostfileName, + Mode: &hostfileMode, + }, + }, + }, + }, + }) + + return &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: launcherName, + Namespace: mpiJob.Namespace, + Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(mpiJob, kubeflow.SchemeGroupVersionKind), + }, + }, + Spec: batchv1.JobSpec{ + Template: *podSpec, + }, + } +} diff --git a/pkg/controllers/mpi_job_controller_test.go b/pkg/controllers/mpi_job_controller_test.go new file mode 100644 index 0000000..711b161 --- /dev/null +++ b/pkg/controllers/mpi_job_controller_test.go @@ -0,0 +1,648 @@ +// Copyright 2018 The Kubeflow Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package controllers + +import ( + "reflect" + "testing" + "time" + + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/diff" + kubeinformers "k8s.io/client-go/informers" + k8sfake "k8s.io/client-go/kubernetes/fake" + core "k8s.io/client-go/testing" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + + kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v1alpha1" + "github.com/kubeflow/mpi-operator/pkg/client/clientset/versioned/fake" + informers "github.com/kubeflow/mpi-operator/pkg/client/informers/externalversions" +) + +var ( + alwaysReady = func() bool { return true } + noResyncPeriodFunc = func() time.Duration { return 0 } +) + +type fixture struct { + t *testing.T + + client *fake.Clientset + kubeClient *k8sfake.Clientset + + // Objects to put in the store. + configMapLister []*corev1.ConfigMap + serviceAccountLister []*corev1.ServiceAccount + roleLister []*rbacv1.Role + roleBindingLister []*rbacv1.RoleBinding + statefulSetLister []*appsv1.StatefulSet + jobLister []*batchv1.Job + mpiJobLister []*kubeflow.MPIJob + + // Actions expected to happen on the client. + kubeActions []core.Action + actions []core.Action + + // Objects from here are pre-loaded into NewSimpleFake. 
+ kubeObjects []runtime.Object + objects []runtime.Object +} + +func newFixture(t *testing.T) *fixture { + f := &fixture{} + f.t = t + f.objects = []runtime.Object{} + f.kubeObjects = []runtime.Object{} + return f +} + +func newMPIJob(name string, gpus *int32) *kubeflow.MPIJob { + return &kubeflow.MPIJob{ + TypeMeta: metav1.TypeMeta{APIVersion: kubeflow.SchemeGroupVersion.String()}, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: metav1.NamespaceDefault, + }, + Spec: kubeflow.MPIJobSpec{ + GPUs: gpus, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "foo", + Image: "bar", + }, + }, + }, + }, + }, + } +} + +func (f *fixture) newController() (*MPIJobController, informers.SharedInformerFactory, kubeinformers.SharedInformerFactory) { + f.client = fake.NewSimpleClientset(f.objects...) + f.kubeClient = k8sfake.NewSimpleClientset(f.kubeObjects...) + + i := informers.NewSharedInformerFactory(f.client, noResyncPeriodFunc()) + k8sI := kubeinformers.NewSharedInformerFactory(f.kubeClient, noResyncPeriodFunc()) + + c := NewMPIJobController( + f.kubeClient, + f.client, + k8sI.Core().V1().ConfigMaps(), + k8sI.Core().V1().ServiceAccounts(), + k8sI.Rbac().V1().Roles(), + k8sI.Rbac().V1().RoleBindings(), + k8sI.Apps().V1().StatefulSets(), + k8sI.Batch().V1().Jobs(), + i.Kubeflow().V1alpha1().MPIJobs(), + 8, + "kubectl-delivery") + + c.configMapSynced = alwaysReady + c.serviceAccountSynced = alwaysReady + c.roleSynced = alwaysReady + c.roleBindingSynced = alwaysReady + c.statefulSetSynced = alwaysReady + c.jobSynced = alwaysReady + c.mpiJobSynced = alwaysReady + c.recorder = &record.FakeRecorder{} + + for _, configMap := range f.configMapLister { + k8sI.Core().V1().ConfigMaps().Informer().GetIndexer().Add(configMap) + } + + for _, serviceAccount := range f.serviceAccountLister { + k8sI.Core().V1().ServiceAccounts().Informer().GetIndexer().Add(serviceAccount) + } + + for _, role := range f.roleLister { + 
k8sI.Rbac().V1().Roles().Informer().GetIndexer().Add(role) + } + + for _, roleBinding := range f.roleBindingLister { + k8sI.Rbac().V1().RoleBindings().Informer().GetIndexer().Add(roleBinding) + } + + for _, statefulSet := range f.statefulSetLister { + k8sI.Apps().V1().StatefulSets().Informer().GetIndexer().Add(statefulSet) + } + + for _, job := range f.jobLister { + k8sI.Batch().V1().Jobs().Informer().GetIndexer().Add(job) + } + + for _, mpiJob := range f.mpiJobLister { + i.Kubeflow().V1alpha1().MPIJobs().Informer().GetIndexer().Add(mpiJob) + } + + return c, i, k8sI +} + +func (f *fixture) run(mpiJobName string) { + f.runController(mpiJobName, true, false) +} + +func (f *fixture) runExpectError(mpiJobName string) { + f.runController(mpiJobName, true, true) +} + +func (f *fixture) runController(mpiJobName string, startInformers bool, expectError bool) { + c, i, k8sI := f.newController() + if startInformers { + stopCh := make(chan struct{}) + defer close(stopCh) + i.Start(stopCh) + k8sI.Start(stopCh) + } + + err := c.syncHandler(mpiJobName) + if !expectError && err != nil { + f.t.Errorf("error syncing mpi job: %v", err) + } else if expectError && err == nil { + f.t.Error("expected error syncing mpi job, got nil") + } + + actions := filterInformerActions(f.client.Actions()) + for i, action := range actions { + if len(f.actions) < i+1 { + f.t.Errorf("%d unexpected actions: %+v", len(actions)-len(f.actions), actions[i:]) + break + } + + expectedAction := f.actions[i] + checkAction(expectedAction, action, f.t) + } + + if len(f.actions) > len(actions) { + f.t.Errorf("%d additional expected actions:%+v", len(f.actions)-len(actions), f.actions[len(actions):]) + } + + k8sActions := filterInformerActions(f.kubeClient.Actions()) + for i, action := range k8sActions { + if len(f.kubeActions) < i+1 { + f.t.Errorf("%d unexpected actions: %+v", len(k8sActions)-len(f.kubeActions), k8sActions[i:]) + break + } + + expectedAction := f.kubeActions[i] + checkAction(expectedAction, 
action, f.t) + } + + if len(f.kubeActions) > len(k8sActions) { + f.t.Errorf("%d additional expected actions:%+v", len(f.kubeActions)-len(k8sActions), f.kubeActions[len(k8sActions):]) + } +} + +// checkAction verifies that expected and actual actions are equal and both have +// same attached resources +func checkAction(expected, actual core.Action, t *testing.T) { + if !(expected.Matches(actual.GetVerb(), actual.GetResource().Resource) && actual.GetSubresource() == expected.GetSubresource()) { + t.Errorf("Expected\n\t%#v\ngot\n\t%#v", expected, actual) + return + } + + if reflect.TypeOf(actual) != reflect.TypeOf(expected) { + t.Errorf("Action has wrong type. Expected: %t. Got: %t", expected, actual) + return + } + + switch a := actual.(type) { + case core.CreateAction: + e, _ := expected.(core.CreateAction) + expObject := e.GetObject() + object := a.GetObject() + + if !reflect.DeepEqual(expObject, object) { + t.Errorf("Action %s %s has wrong object\nDiff:\n %s", + a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expObject, object)) + } + case core.UpdateAction: + e, _ := expected.(core.UpdateAction) + expObject := e.GetObject() + object := a.GetObject() + + if !reflect.DeepEqual(expObject, object) { + t.Errorf("Action %s %s has wrong object\nDiff:\n %s", + a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expObject, object)) + } + case core.PatchAction: + e, _ := expected.(core.PatchAction) + expPatch := e.GetPatch() + patch := a.GetPatch() + + if !reflect.DeepEqual(expPatch, expPatch) { + t.Errorf("Action %s %s has wrong patch\nDiff:\n %s", + a.GetVerb(), a.GetResource().Resource, diff.ObjectGoPrintDiff(expPatch, patch)) + } + } +} + +// filterInformerActions filters list and watch actions for testing resources. +// Since list and watch don't change resource state we can filter it to lower +// nose level in our tests. 
+func filterInformerActions(actions []core.Action) []core.Action { + var ret []core.Action + for _, action := range actions { + if len(action.GetNamespace()) == 0 && + (action.Matches("list", "configmaps") || + action.Matches("watch", "configmaps") || + action.Matches("list", "serviceaccounts") || + action.Matches("watch", "serviceaccounts") || + action.Matches("list", "roles") || + action.Matches("watch", "roles") || + action.Matches("list", "rolebindings") || + action.Matches("watch", "rolebindings") || + action.Matches("list", "statefulsets") || + action.Matches("watch", "statefulsets") || + action.Matches("list", "pods") || + action.Matches("watch", "pods") || + action.Matches("list", "jobs") || + action.Matches("watch", "jobs") || + action.Matches("list", "mpijobs") || + action.Matches("watch", "mpijobs")) { + continue + } + ret = append(ret, action) + } + + return ret +} + +func (f *fixture) expectCreateConfigMapAction(d *corev1.ConfigMap) { + f.kubeActions = append(f.kubeActions, core.NewCreateAction(schema.GroupVersionResource{Resource: "configmaps"}, d.Namespace, d)) +} + +func (f *fixture) expectUpdateConfigMapAction(d *corev1.ConfigMap) { + f.kubeActions = append(f.kubeActions, core.NewUpdateAction(schema.GroupVersionResource{Resource: "configmaps"}, d.Namespace, d)) +} + +func (f *fixture) expectCreateServiceAccountAction(d *corev1.ServiceAccount) { + f.kubeActions = append(f.kubeActions, core.NewCreateAction(schema.GroupVersionResource{Resource: "serviceaccounts"}, d.Namespace, d)) +} + +func (f *fixture) expectUpdateServiceAccountAction(d *corev1.ServiceAccount) { + f.kubeActions = append(f.kubeActions, core.NewUpdateAction(schema.GroupVersionResource{Resource: "serviceaccounts"}, d.Namespace, d)) +} + +func (f *fixture) expectCreateRoleAction(d *rbacv1.Role) { + f.kubeActions = append(f.kubeActions, core.NewCreateAction(schema.GroupVersionResource{Resource: "roles"}, d.Namespace, d)) +} + +func (f *fixture) expectUpdateRoleAction(d *rbacv1.Role) { + 
f.kubeActions = append(f.kubeActions, core.NewUpdateAction(schema.GroupVersionResource{Resource: "roles"}, d.Namespace, d)) +} + +func (f *fixture) expectCreateRoleBindingAction(d *rbacv1.RoleBinding) { + f.kubeActions = append(f.kubeActions, core.NewCreateAction(schema.GroupVersionResource{Resource: "rolebindings"}, d.Namespace, d)) +} + +func (f *fixture) expectUpdateRoleBindingAction(d *rbacv1.RoleBinding) { + f.kubeActions = append(f.kubeActions, core.NewUpdateAction(schema.GroupVersionResource{Resource: "rolebindings"}, d.Namespace, d)) +} + +func (f *fixture) expectCreateStatefulSetAction(d *appsv1.StatefulSet) { + f.kubeActions = append(f.kubeActions, core.NewCreateAction(schema.GroupVersionResource{Resource: "statefulsets"}, d.Namespace, d)) +} + +func (f *fixture) expectUpdateStatefulSetAction(d *appsv1.StatefulSet) { + f.kubeActions = append(f.kubeActions, core.NewUpdateAction(schema.GroupVersionResource{Resource: "statefulsets"}, d.Namespace, d)) +} + +func (f *fixture) expectCreateJobAction(d *batchv1.Job) { + f.kubeActions = append(f.kubeActions, core.NewCreateAction(schema.GroupVersionResource{Resource: "jobs"}, d.Namespace, d)) +} + +func (f *fixture) expectUpdateJobAction(d *batchv1.Job) { + f.kubeActions = append(f.kubeActions, core.NewUpdateAction(schema.GroupVersionResource{Resource: "jobs"}, d.Namespace, d)) +} + +func (f *fixture) expectUpdateMPIJobStatusAction(mpiJob *kubeflow.MPIJob) { + action := core.NewUpdateAction(schema.GroupVersionResource{Resource: "mpijobs"}, mpiJob.Namespace, mpiJob) + // TODO: Until #38113 is merged, we can't use Subresource + //action.Subresource = "status" + f.actions = append(f.actions, action) +} + +func (f *fixture) setUpMPIJob(mpiJob *kubeflow.MPIJob) { + f.mpiJobLister = append(f.mpiJobLister, mpiJob) + f.objects = append(f.objects, mpiJob) +} + +func (f *fixture) setUpLauncher(launcher *batchv1.Job) { + f.jobLister = append(f.jobLister, launcher) + f.kubeObjects = append(f.kubeObjects, launcher) +} + +func 
(f *fixture) setUpWorker(worker *appsv1.StatefulSet) { + f.statefulSetLister = append(f.statefulSetLister, worker) + f.kubeObjects = append(f.kubeObjects, worker) +} + +func (f *fixture) setUpConfigMap(configMap *corev1.ConfigMap) { + f.configMapLister = append(f.configMapLister, configMap) + f.kubeObjects = append(f.kubeObjects, configMap) +} + +func (f *fixture) setUpServiceAccount(serviceAccount *corev1.ServiceAccount) { + f.serviceAccountLister = append(f.serviceAccountLister, serviceAccount) + f.kubeObjects = append(f.kubeObjects, serviceAccount) +} + +func (f *fixture) setUpRole(role *rbacv1.Role) { + f.roleLister = append(f.roleLister, role) + f.kubeObjects = append(f.kubeObjects, role) +} + +func (f *fixture) setUpRoleBinding(roleBinding *rbacv1.RoleBinding) { + f.roleBindingLister = append(f.roleBindingLister, roleBinding) + f.kubeObjects = append(f.kubeObjects, roleBinding) +} + +func (f *fixture) setUpRbac(mpiJob *kubeflow.MPIJob, workerReplicas int) { + serviceAccount := newLauncherServiceAccount(mpiJob) + f.setUpServiceAccount(serviceAccount) + + role := newLauncherRole(mpiJob, workerReplicas) + f.setUpRole(role) + + roleBinding := newLauncherRoleBinding(mpiJob) + f.setUpRoleBinding(roleBinding) +} + +func getKey(mpiJob *kubeflow.MPIJob, t *testing.T) string { + key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(mpiJob) + if err != nil { + t.Errorf("Unexpected error getting key for mpi job %v: %v", mpiJob.Name, err) + return "" + } + return key +} + +func TestDoNothingWithInvalidKey(t *testing.T) { + f := newFixture(t) + f.run("foo/bar/baz") +} + +func TestDoNothingWithNonexistentMPIJob(t *testing.T) { + f := newFixture(t) + mpiJob := newMPIJob("test", int32Ptr(64)) + f.run(getKey(mpiJob, t)) +} + +func TestLauncherNotControlledByUs(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(64)) + f.setUpMPIJob(mpiJob) + + launcher := newLauncher(mpiJob, 64, "kubectl-delivery") + launcher.OwnerReferences = nil + 
f.setUpLauncher(launcher) + + f.runExpectError(getKey(mpiJob, t)) +} + +func TestLauncherSucceeded(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(64)) + f.setUpMPIJob(mpiJob) + + launcher := newLauncher(mpiJob, 64, "kubectl-delivery") + launcher.Status.Succeeded = 1 + f.setUpLauncher(launcher) + + mpiJobCopy := mpiJob.DeepCopy() + mpiJobCopy.Status.LauncherStatus = kubeflow.LauncherSucceeded + f.expectUpdateMPIJobStatusAction(mpiJobCopy) + + f.run(getKey(mpiJob, t)) +} + +func TestLauncherFailed(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(64)) + f.setUpMPIJob(mpiJob) + + launcher := newLauncher(mpiJob, 64, "kubectl-delivery") + launcher.Status.Failed = 1 + f.setUpLauncher(launcher) + + mpiJobCopy := mpiJob.DeepCopy() + mpiJobCopy.Status.LauncherStatus = kubeflow.LauncherFailed + f.expectUpdateMPIJobStatusAction(mpiJobCopy) + + f.run(getKey(mpiJob, t)) +} + +func TestLauncherDoesNotExist(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(64)) + f.setUpMPIJob(mpiJob) + + expConfigMap := newConfigMap(mpiJob, 7, 8) + f.expectCreateConfigMapAction(expConfigMap) + + expServiceAccount := newLauncherServiceAccount(mpiJob) + f.expectCreateServiceAccountAction(expServiceAccount) + + expRole := newLauncherRole(mpiJob, 7) + f.expectCreateRoleAction(expRole) + + expRoleBinding := newLauncherRoleBinding(mpiJob) + f.expectCreateRoleBindingAction(expRoleBinding) + + expWorker := newWorker(mpiJob, 7, 8) + f.expectCreateStatefulSetAction(expWorker) + + mpiJobCopy := mpiJob.DeepCopy() + mpiJobCopy.Status.WorkerReplicas = 0 + f.expectUpdateMPIJobStatusAction(mpiJobCopy) + + f.run(getKey(mpiJob, t)) +} + +func TestConfigMapNotControlledByUs(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(64)) + f.setUpMPIJob(mpiJob) + + configMap := newConfigMap(mpiJob, 7, 8) + configMap.OwnerReferences = nil + f.setUpConfigMap(configMap) + + f.runExpectError(getKey(mpiJob, t)) +} + 
+func TestServiceAccountNotControlledByUs(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(64)) + f.setUpMPIJob(mpiJob) + + f.setUpConfigMap(newConfigMap(mpiJob, 7, 8)) + + serviceAccount := newLauncherServiceAccount(mpiJob) + serviceAccount.OwnerReferences = nil + f.setUpServiceAccount(serviceAccount) + + f.runExpectError(getKey(mpiJob, t)) +} + +func TestRoleNotControlledByUs(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(64)) + f.setUpMPIJob(mpiJob) + + f.setUpConfigMap(newConfigMap(mpiJob, 7, 8)) + f.setUpServiceAccount(newLauncherServiceAccount(mpiJob)) + + role := newLauncherRole(mpiJob, 7) + role.OwnerReferences = nil + f.setUpRole(role) + + f.runExpectError(getKey(mpiJob, t)) +} + +func TestRoleBindingNotControlledByUs(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(64)) + f.setUpMPIJob(mpiJob) + + f.setUpConfigMap(newConfigMap(mpiJob, 7, 8)) + f.setUpServiceAccount(newLauncherServiceAccount(mpiJob)) + f.setUpRole(newLauncherRole(mpiJob, 7)) + + roleBinding := newLauncherRoleBinding(mpiJob) + roleBinding.OwnerReferences = nil + f.setUpRoleBinding(roleBinding) + + f.runExpectError(getKey(mpiJob, t)) +} + +func TestShutdownWorker(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(64)) + f.setUpMPIJob(mpiJob) + + launcher := newLauncher(mpiJob, 64, "kubectl-delivery") + launcher.Status.Succeeded = 1 + f.setUpLauncher(launcher) + + worker := newWorker(mpiJob, 7, 8) + f.setUpWorker(worker) + + expWorker := newWorker(mpiJob, 0, 8) + f.expectUpdateStatefulSetAction(expWorker) + + mpiJobCopy := mpiJob.DeepCopy() + mpiJobCopy.Status.WorkerReplicas = 0 + mpiJobCopy.Status.LauncherStatus = kubeflow.LauncherSucceeded + f.expectUpdateMPIJobStatusAction(mpiJobCopy) + + f.run(getKey(mpiJob, t)) +} + +func TestWorkerNotControlledByUs(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(64)) + f.setUpMPIJob(mpiJob) + + 
f.setUpConfigMap(newConfigMap(mpiJob, 7, 8)) + f.setUpRbac(mpiJob, 7) + + worker := newWorker(mpiJob, 7, 8) + worker.OwnerReferences = nil + f.setUpWorker(worker) + + f.runExpectError(getKey(mpiJob, t)) +} + +func TestWorkerNotNeeded(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(8)) + f.setUpMPIJob(mpiJob) + + f.setUpConfigMap(newConfigMap(mpiJob, 0, 8)) + f.setUpRbac(mpiJob, 0) + + expLauncher := newLauncher(mpiJob, 8, "kubectl-delivery") + f.expectCreateJobAction(expLauncher) + + f.expectUpdateMPIJobStatusAction(mpiJob) + + f.run(getKey(mpiJob, t)) +} + +func TestLauncherActive(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(8)) + f.setUpMPIJob(mpiJob) + + f.setUpConfigMap(newConfigMap(mpiJob, 0, 8)) + f.setUpRbac(mpiJob, 0) + + launcher := newLauncher(mpiJob, 64, "kubectl-delivery") + launcher.Status.Active = 1 + f.setUpLauncher(launcher) + + mpiJobCopy := mpiJob.DeepCopy() + mpiJobCopy.Status.LauncherStatus = kubeflow.LauncherActive + f.expectUpdateMPIJobStatusAction(mpiJobCopy) + + f.run(getKey(mpiJob, t)) +} + +func TestWorkerReady(t *testing.T) { + f := newFixture(t) + + mpiJob := newMPIJob("test", int32Ptr(16)) + f.setUpMPIJob(mpiJob) + + f.setUpConfigMap(newConfigMap(mpiJob, 1, 8)) + f.setUpRbac(mpiJob, 1) + + worker := newWorker(mpiJob, 1, 8) + worker.Status.ReadyReplicas = 1 + f.setUpWorker(worker) + + expLauncher := newLauncher(mpiJob, 8, "kubectl-delivery") + f.expectCreateJobAction(expLauncher) + + mpiJobCopy := mpiJob.DeepCopy() + mpiJobCopy.Status.WorkerReplicas = 1 + f.expectUpdateMPIJobStatusAction(mpiJobCopy) + + f.run(getKey(mpiJob, t)) +} + +func int32Ptr(i int32) *int32 { return &i }