Continuously build the docker images used by mnist. (#462)

* This is the first step in adding E2E tests for the mnist example.

* Add a Makefile and .jsonnet file to build the Docker images using GCB

* Define an Argo workflow to trigger the image builds on presubmit & postsubmit.

Related to: #460
Jeremy Lewi 2019-01-08 15:21:49 -08:00 committed by Kubernetes Prow Robot
parent 1cc4550b7d
commit d28ba7c4db
6 changed files with 498 additions and 0 deletions

1
mnist/.gitignore vendored Normal file

@@ -0,0 +1 @@
build/**

63
mnist/Makefile Executable file

@@ -0,0 +1,63 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Requirements:
# https://github.com/mattrobenolt/jinja2-cli
# pip install jinja2-cli
#
# To override variables, do
# make ${TARGET} ${VAR}=${VALUE}
#
# IMG is the base path for images.
# Individual images will be
# $(IMG)/$(NAME):$(TAG)
IMG ?= gcr.io/kubeflow-examples/mnist
# List any changed files. We only include files in the mnist directory
# because that is the code that goes into the Docker images.
# In particular we exclude changes to the ksonnet configs.
CHANGED_FILES := $(shell git diff-files --relative=mnist/)
# Whether to use cached images with GCB
USE_IMAGE_CACHE ?= true
ifeq ($(strip $(CHANGED_FILES)),)
# Changed files is empty; the mnist tree is not dirty.
# Don't include --dirty because the repo could be dirty only because of
# files outside the ones we care about.
GIT_VERSION := $(shell git describe --always)
else
GIT_VERSION := $(shell git describe --always)-dirty-$(shell git diff | shasum -a256 | cut -c -6)
endif
TAG := $(shell date +v%Y%m%d)-$(GIT_VERSION)
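# For example (values illustrative): a clean tree on 2019-01-08 would yield
# TAG=v20190108-d28ba7c, and a dirty mnist/ tree something like
# TAG=v20190108-d28ba7c-dirty-3f9a1b.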
all: build
# Build the GCB workflow
build-gcb-spec:
	rm -rf ./build
	mkdir -p build
	jsonnet ./image_build.jsonnet --ext-str imageBase=$(IMG) \
	  --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
	  --ext-str useImageCache=$(USE_IMAGE_CACHE) \
	  > ./build/image_build.json
# Build using GCB. This is useful if we are on a slow internet connection
# and don't want to pull images locally.
# It's also used to build from our CI system.
build-gcb: build-gcb-spec
	gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci \
	  --config=./build/image_build.json \
	  --timeout=3600 .
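For reference, the CI workflow below invokes this target with variable overrides, along these lines (values illustrative):

	make build-gcb IMG=gcr.io/kubeflow-ci/mnist TAG=build-997a USE_IMAGE_CACHE=false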

93
mnist/image_build.jsonnet Normal file

@@ -0,0 +1,93 @@
// TODO(jlewi): We should tag the image latest and then
// use latest as a cache so that rebuilds are fast
// https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image
{
// Convert non-boolean types like string and number to a boolean.
// This is primarily intended for dealing with parameters that should be booleans.
local toBool = function(x) {
result::
if std.type(x) == "boolean" then
x
else if std.type(x) == "string" then
std.asciiUpper(x) == "TRUE"
else if std.type(x) == "number" then
x != 0
else
false,
}.result,
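// For example: toBool("TRUE") and toBool("true") yield true, toBool("false")
// yields false, toBool(1) yields true, and toBool(0) yields false.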
local useImageCache = toBool(std.extVar("useImageCache")),
// A template defining the steps for building each image.
//
// TODO(jlewi): This logic is reused across a lot of examples; can we put it
// in a shared location and just import it?
local subGraphTemplate = {
// The following variables must be set:
name: null,
dockerFile: null,
buildArg: null,
contextDir: ".",
local template = self,
local pullStep = if useImageCache then [
{
id: "pull-" + template.name,
name: "gcr.io/cloud-builders/docker",
args: ["pull", std.extVar("imageBase") + "/" + template.name + ":latest"],
waitFor: ["-"],
},
] else [],
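// Note: in GCB, waitFor: ["-"] makes a step start immediately, with no
// dependencies; when the cache is enabled, the build step below instead
// waits on this pull step.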
local image = std.extVar("imageBase") + "/" + template.name + ":" + std.extVar("tag"),
local imageLatest = std.extVar("imageBase") + "/" + template.name + ":latest",
images: [image, imageLatest],
steps: pullStep +
[
{
local buildArgList = if template.buildArg != null then ["--build-arg", template.buildArg] else [],
local cacheList = if useImageCache then ["--cache-from=" + imageLatest] else [],
id: "build-" + template.name,
name: "gcr.io/cloud-builders/docker",
args: [
"build",
"-t",
image,
"--label=git-versions=" + std.extVar("gitVersion"),
]
+ buildArgList
+ [
"--file=" + template.dockerFile,
]
+ cacheList + [template.contextDir],
waitFor: if useImageCache then ["pull-" + template.name] else ["-"],
},
{
id: "tag-" + template.name,
name: "gcr.io/cloud-builders/docker",
args: ["tag", image, imageLatest],
waitFor: ["build-" + template.name],
},
],
},
local modelSteps = subGraphTemplate {
name: "model",
dockerFile: "./Dockerfile.model",
contextDir: "."
},
local ksonnetSteps = subGraphTemplate {
name: "ksonnet",
dockerFile: "./Dockerfile.ksonnet",
contextDir: "."
},
steps: modelSteps.steps + ksonnetSteps.steps,
images: modelSteps.images + ksonnetSteps.images,
}
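For the "model" image with the cache enabled, the spec generated into ./build/image_build.json comes out roughly like this (an abridged, illustrative sketch using the Makefile's default IMG and a made-up tag):

{
  "steps": [
    {
      "id": "pull-model",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["pull", "gcr.io/kubeflow-examples/mnist/model:latest"],
      "waitFor": ["-"]
    },
    {
      "id": "build-model",
      "name": "gcr.io/cloud-builders/docker",
      "args": [
        "build", "-t", "gcr.io/kubeflow-examples/mnist/model:v20190108-d28ba7c",
        "--label=git-versions=d28ba7c",
        "--file=./Dockerfile.model",
        "--cache-from=gcr.io/kubeflow-examples/mnist/model:latest",
        "."
      ],
      "waitFor": ["pull-model"]
    },
    {
      "id": "tag-model",
      "name": "gcr.io/cloud-builders/docker",
      "args": ["tag", "gcr.io/kubeflow-examples/mnist/model:v20190108-d28ba7c",
               "gcr.io/kubeflow-examples/mnist/model:latest"],
      "waitFor": ["build-model"]
    }
  ],
  "images": [
    "gcr.io/kubeflow-examples/mnist/model:v20190108-d28ba7c",
    "gcr.io/kubeflow-examples/mnist/model:latest"
  ]
}

The "ksonnet" image gets the same three steps, and the two images lists are concatenated.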


@@ -17,6 +17,16 @@ workflows:
include_dirs:
- code_search/*
# E2E test for mnist example
- app_dir: kubeflow/examples/test/workflows
component: mnist
name: mnist
job_types:
- presubmit
- postsubmit
include_dirs:
- mnist/*
# E2E test for github issue summarization example
- app_dir: kubeflow/examples/test/workflows
component: gis


@@ -0,0 +1,325 @@
// Test workflow for the mnist example.
//
local env = std.extVar("__ksonnet/environments");
local overrides = std.extVar("__ksonnet/params").components.mnist;
local k = import "k.libsonnet";
local util = import "util.libsonnet";
// Define default params and then combine them with any overrides
local defaultParams = {
// local nfsVolumeClaim: "kubeflow-testing",
nfsVolumeClaim: "nfs-external",
// The name to use for the volume to use to contain test data.
dataVolume: "kubeflow-test-volume",
// Default step image:
stepImage: "gcr.io/kubeflow-ci/test-worker:v20181017-bfeaaf5-dirty-4adcd0",
};
local params = defaultParams + overrides;
local prowEnv = util.parseEnv(params.prow_env);
// Create a dictionary of the different prow variables so we can refer to them in the workflow.
//
// Important: We want to initialize all variables we reference to some value. If we don't,
// and we reference a variable which doesn't get set, we get failure messages that are very
// hard to debug. In particular, we've seen problems where, if we add a new environment and
// evaluate one component, e.g. "workflows", and another component, e.g. "code_search.jsonnet",
// doesn't have a default value for BUILD_ID, then ksonnet fails because BUILD_ID is undefined.
local prowDict = {
BUILD_ID: "notset",
BUILD_NUMBER: "notset",
REPO_OWNER: "notset",
REPO_NAME: "notset",
JOB_NAME: "notset",
JOB_TYPE: "notset",
PULL_NUMBER: "notset",
} + util.listOfDictToMap(prowEnv);
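// For example (assuming util.parseEnv splits the comma/equals-delimited string
// into a list of {name, value} dicts, and util.listOfDictToMap folds that list
// into an object): a prow_env of "BUILD_ID=997a,PULL_NUMBER=374" becomes
//   [{ name: "BUILD_ID", value: "997a" }, { name: "PULL_NUMBER", value: "374" }]
// and then { BUILD_ID: "997a", PULL_NUMBER: "374" }, overriding the "notset"
// defaults above.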
local bucket = params.bucket;
// mountPath is the directory where the volume to store the test data
// should be mounted.
local mountPath = "/mnt/" + "test-data-volume";
// testDir is the root directory for all data for a particular test run.
local testDir = mountPath + "/" + params.name;
// outputDir is the directory to sync to GCS to contain the output for this job.
local outputDir = testDir + "/output";
local artifactsDir = outputDir + "/artifacts";
// Source directory where all repos should be checked out
local srcRootDir = testDir + "/src";
// The directory containing the kubeflow/kubeflow repo
local srcDir = srcRootDir + "/" + prowDict.REPO_OWNER + "/" + prowDict.REPO_NAME;
// These variables control where the docker images get pushed and what
// tag to use
local imageBase = "gcr.io/kubeflow-ci/mnist";
local imageTag = "build-" + prowDict["BUILD_ID"];
// Build template is a template for constructing Argo step templates.
//
// name: Name for the template
// command: List to pass as the container command.
//
// We customize the defaults for each step in the workflow by modifying
// buildTemplate.argoTemplate
local buildTemplate = {
// name & command variables should be overwritten for every test.
// Other variables can be changed per step as needed.
// They are hidden because they shouldn't be included in the Argo template
name: "",
command:: "",
image: params.stepImage,
workingDir:: null,
env_vars:: [],
side_cars: [],
activeDeadlineSeconds: 1800, // Set 30 minute timeout for each template
local template = self,
// The directory within the kubeflow_testing submodule containing
// py scripts to use.
local kubeflowTestingPy = srcRootDir + "/kubeflow/testing/py",
// Actual template for Argo
argoTemplate: {
name: template.name,
container: {
command: template.command,
name: template.name,
image: template.image,
workingDir: template.workingDir,
env: [
{
// Add the source directories to the python path.
name: "PYTHONPATH",
value: kubeflowTestingPy,
},
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/key.json",
},
{
name: "GITHUB_TOKEN",
valueFrom: {
secretKeyRef: {
name: "github-token",
key: "github_token",
},
},
},
] + prowEnv + template.env_vars,
volumeMounts: [
{
name: params.dataVolume,
mountPath: mountPath,
},
{
name: "github-token",
mountPath: "/secret/github-token",
},
{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
},
],
},
},
}; // buildTemplate
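// For example (illustrative), a step is declared by overriding the hidden
// fields:
//   buildTemplate { name: "lint", command: ["make", "lint"] }
// whose .argoTemplate manifests as an Argo template named "lint" running
// `make lint` in params.stepImage; hidden (::) fields like command are folded
// into argoTemplate rather than emitted directly.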
// Create a list of dictionaries.
// Each item is a dictionary describing one step in the graph.
local dagTemplates = [
{
template: buildTemplate {
name: "checkout",
command:
["/usr/local/bin/checkout.sh", srcRootDir],
env_vars: [{
name: "EXTRA_REPOS",
value: "kubeflow/testing@HEAD",
}],
},
dependencies: null,
}, // checkout
{
// TODO(https://github.com/kubeflow/testing/issues/257): Create-pr-symlink
// should be done by run_e2e_workflow.py
template: buildTemplate {
name: "create-pr-symlink",
command: [
"python",
"-m",
"kubeflow.testing.prow_artifacts",
"--artifacts_dir=" + outputDir,
"create_pr_symlink",
"--bucket=" + params.bucket,
],
}, // create-pr-symlink
dependencies: ["checkout"],
}, // create-pr-symlink
{
// Submit a GCB job to build the images
template: buildTemplate {
name: "build-images",
command: util.buildCommand([
[
"gcloud",
"auth",
"activate-service-account",
"--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
],
[
"make",
"build-gcb",
"IMG=" + imageBase,
"TAG=" + imageTag,
]]
),
workingDir: srcDir + "/mnist",
},
dependencies: ["checkout"],
}, // build-images
{
// Run the python test to train the model
template: buildTemplate {
name: "train-test",
command: [
"python",
"train_test.py",
],
// Use the newly built image.
image: imageBase + "/trainer-estimator:" + imageTag,
workingDir: "/issues",
},
dependencies: ["build-images"],
}, // train-test
];
// Dag defines the tasks in the graph
local dag = {
name: "e2e",
// Construct tasks from the templates;
// each step gets the same name as its template.
dag: {
tasks: util.toArgoTaskList(dagTemplates),
},
}; // dag
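// Assuming util.toArgoTaskList maps each entry to a task of the form
// { name: <template name>, template: <template name>, dependencies: [...] },
// the dag above expands to tasks like
//   { name: "build-images", template: "build-images", dependencies: ["checkout"] }.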
// Define templates for the steps to be performed when the
// test exits
local exitTemplates =
[
{
// Copy artifacts to GCS for gubernator.
// TODO(https://github.com/kubeflow/testing/issues/257): Create-pr-symlink
// should be done by run_e2e_workflow.py
template: buildTemplate {
name: "copy-artifacts",
command: [
"python",
"-m",
"kubeflow.testing.prow_artifacts",
"--artifacts_dir=" + outputDir,
"copy_artifacts",
"--bucket=" + bucket,
],
}, // copy-artifacts,
},
{
// Delete the test directory in NFS.
// TODO(https://github.com/kubeflow/testing/issues/256): Use an external process to do this.
template:
buildTemplate {
name: "test-dir-delete",
command: [
"rm",
"-rf",
testDir,
],
argoTemplate+: {
retryStrategy: {
limit: 3,
},
},
}, // test-dir-delete
dependencies: ["copy-artifacts"],
},
];
// Create a DAG representing the set of steps to execute on exit
local exitDag = {
name: "exit-handler",
// Construct tasks from the templates;
// each step gets the same name as its template.
dag: {
tasks: util.toArgoTaskList(exitTemplates),
},
};
// A list of templates for the actual steps
local stepTemplates = std.map(function(i) i.template.argoTemplate, dagTemplates) +
                      std.map(function(i) i.template.argoTemplate, exitTemplates);
// Define the Argo Workflow.
local workflow = {
apiVersion: "argoproj.io/v1alpha1",
kind: "Workflow",
metadata: {
name: params.name,
namespace: env.namespace,
labels: {
org: prowDict.REPO_OWNER,
repo: prowDict.REPO_NAME,
workflow: "gis",
[if std.objectHas(prowDict, "PULL_NUMBER") then "pr"]: prowDict.PULL_NUMBER,
},
},
spec: {
entrypoint: "e2e",
volumes: [
{
name: "github-token",
secret: {
secretName: "github-token",
},
},
{
name: "gcp-credentials",
secret: {
secretName: "kubeflow-testing-credentials",
},
},
{
name: params.dataVolume,
persistentVolumeClaim: {
claimName: params.nfsVolumeClaim,
},
},
], // volumes
// onExit specifies the template that should always run when the workflow completes.
onExit: "exit-handler",
// The templates will be a combination of the templates
// defining the dags executed by Argo as well as the templates
// for the individual steps.
templates: [dag, exitDag] + stepTemplates, // templates
}, // spec
}; // workflow
std.prune(k.core.v1.list.new([workflow]))
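The "pr" label above relies on jsonnet's computed, conditional field syntax: a field written [if cond then "key"]: value is emitted only when the condition holds. A minimal self-contained sketch (hypothetical values):

local prowDict = { PULL_NUMBER: "374" };
{
  labels: {
    org: "kubeflow",
    // Present only when prowDict has a PULL_NUMBER key.
    [if std.objectHas(prowDict, "PULL_NUMBER") then "pr"]: prowDict.PULL_NUMBER,
  },
}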


@@ -18,6 +18,12 @@
namespace: "kubeflow-test-infra",
prow_env: "BUILD_NUMBER=997a,BUILD_ID=997a,JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=374,REPO_NAME=examples,REPO_OWNER=kubeflow",
},
mnist: {
bucket: "kubeflow-ci_temp",
name: "kubeflow-mnist",
namespace: "kubeflow-test-infra",
prow_env: "BUILD_NUMBER=997a,BUILD_ID=997a,JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=374,REPO_NAME=examples,REPO_OWNER=kubeflow",
},
workflows: {
bucket: "kubeflow-ci_temp",
name: "kubeflow-examples-presubmit-test-374-6e32",