add testing for xgboost_synthetic (#633)

Jin Chi He 2019-09-17 06:28:24 +08:00 committed by Kubernetes Prow Robot
parent e37a9d7acd
commit 4f8cf87d4f
13 changed files with 735 additions and 2 deletions

View File

@@ -61,3 +61,14 @@ workflows:
- postsubmit
include_dirs:
- pytorch_mnist/*
# E2E test for xgboost-synthetic
- app_dir: kubeflow/examples/test/workflows
component: xgboost_synthetic
name: xgboost2
job_types:
- periodic
- presubmit
- postsubmit
include_dirs:
- xgboost_synthetic/*

View File

@@ -36,6 +36,12 @@
namespace: "kubeflow-test-infra",
prow_env: "BUILD_NUMBER=997a,BUILD_ID=997a,JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=374,REPO_NAME=examples,REPO_OWNER=kubeflow",
},
xgboost_synthetic: {
bucket: "kubeflow-ci_temp",
name: "kubeflow-xgboost_synthetic",
namespace: "kubeflow-test-infra",
prow_env: "BUILD_NUMBER=997a,BUILD_ID=997a,JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=374,REPO_NAME=examples,REPO_OWNER=kubeflow",
},
workflows: {
bucket: "kubeflow-ci_temp",
name: "kubeflow-examples-presubmit-test-374-6e32",

View File

@@ -0,0 +1,439 @@
// Test workflow for the XGBoost synthetic example.
//
local env = std.extVar("__ksonnet/environments");
local overrides = std.extVar("__ksonnet/params").components.xgboost_synthetic;
local k = import "k.libsonnet";
local util = import "util.libsonnet";
// Define default params and then combine them with any overrides
local defaultParams = {
// local nfsVolumeClaim: "kubeflow-testing",
nfsVolumeClaim: "nfs-external",
// The name to use for the volume to use to contain test data.
dataVolume: "kubeflow-test-volume",
// Default step image:
stepImage: "gcr.io/kubeflow-ci/test-worker:v20190802-c6f9140-e3b0c4",
// Which Kubeflow cluster to use for running the notebook test job on.
kfProject: "kubeflow-ci-deployment",
kfZone: "us-east1-b",
kfCluster: "kf-vmaster-n00",
// The bucket where the model should be written
// This needs to be writable by the GCP service account in the Kubeflow cluster (not the test cluster)
modelBucket: "kubeflow-ci_temp",
// Whether to delete the namespace at the end.
// Leaving the namespace around can be useful for debugging.
//
// TODO(jlewi): We should consider running a cronjob to GC namespaces.
// But if we leave namespaces up, we end up leaving the servers up, which uses up CPU.
//
deleteNamespace: true,
};
local params = defaultParams + overrides;
local prowEnv = util.parseEnv(params.prow_env);
// Create a dictionary of the different prow variables so we can refer to them in the workflow.
//
// Important: We want to initialize every variable we reference to some value. If we reference
// a variable that never gets set, the resulting failure messages are very hard to debug.
// In particular, we've seen problems where, after adding a new environment, evaluating one
// component (e.g. "workflows") fails because another component (e.g. "code_search.jsonnet")
// doesn't provide a default value for BUILD_ID, so ksonnet fails because BUILD_ID is undefined.
local prowDict = {
BUILD_ID: "notset",
BUILD_NUMBER: "notset",
REPO_OWNER: "notset",
REPO_NAME: "notset",
JOB_NAME: "notset",
JOB_TYPE: "notset",
PULL_NUMBER: "notset",
PULL_BASE_SHA: "notset",
} + util.listOfDictToMap(prowEnv);
local bucket = params.bucket;
// mountPath is the directory where the volume to store the test data
// should be mounted.
local mountPath = "/mnt/" + "test-data-volume";
// testDir is the root directory for all data for a particular test run.
local testDir = mountPath + "/" + params.name;
// outputDir is the directory to sync to GCS to contain the output for this job.
local outputDir = testDir + "/output";
local artifactsDir = outputDir + "/artifacts";
// Source directory where all repos should be checked out
local srcRootDir = testDir + "/src";
// The directory containing the kubeflow/kubeflow repo
local srcDir = srcRootDir + "/" + prowDict.REPO_OWNER + "/" + prowDict.REPO_NAME;
// The notebook image used to execute the test notebook.
local executeImage = "gcr.io/kubeflow-images-public/tensorflow-1.13.1-notebook-cpu:v0.5.0";
// Value of the KUBECONFIG environment variable. This should be a full path.
local kubeConfig = testDir + "/.kube/kubeconfig";
// Namespace where tests should run
local testNamespace = "xgboost-synthetic-" + prowDict["BUILD_ID"];
// The directory within the kubeflow_testing submodule containing
// py scripts to use.
local kubeflowTestingPy = srcRootDir + "/kubeflow/testing/py";
// Workflow template is the name of the workflow template; typically the name of the ks component.
// This is used as a label to make it easy to identify all Argo workflows created from a given
// template.
local workflow_template = "xgboost_synthetic";
// Build template is a template for constructing Argo step templates.
//
// step_name: Name for the template
// command: List to pass as the container command.
//
// We customize the defaults for each step in the workflow by modifying
// buildTemplate.argoTemplate
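// For example (a sketch, not an actual step in this workflow), a step is declared roughly as
//   buildTemplate { name: "my-step", command: ["echo", "hello"] }
// and it is the resulting .argoTemplate field that ends up in the workflow spec.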
local buildTemplate = {
// name & command variables should be overwritten for every test.
// Other variables can be changed per step as needed.
// They are hidden because they shouldn't be included in the Argo template
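// (In jsonnet, a field declared with "::" is hidden, so it is omitted when the object is rendered.)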
name: "",
command:: "",
image: params.stepImage,
workingDir:: null,
env_vars:: [],
side_cars: [],
pythonPath: kubeflowTestingPy,
activeDeadlineSeconds: 1800, // Set 30 minute timeout for each template
local template = self,
// Actual template for Argo
argoTemplate: {
name: template.name,
metadata: {
labels: prowDict + {
workflow: params.name,
workflow_template: workflow_template,
step_name: template.name,
},
},
container: {
command: template.command,
name: template.name,
image: template.image,
workingDir: template.workingDir,
env: [
{
// Add the source directories to the python path.
name: "PYTHONPATH",
value: template.pythonPath,
},
{
name: "GOOGLE_APPLICATION_CREDENTIALS",
value: "/secret/gcp-credentials/key.json",
},
{
name: "GITHUB_TOKEN",
valueFrom: {
secretKeyRef: {
name: "github-token",
key: "github_token",
},
},
},
{
// We use a directory in our NFS share to store our kube config.
// This way we can configure it on a single step and reuse it on subsequent steps.
name: "KUBECONFIG",
value: kubeConfig,
},
] + prowEnv + template.env_vars,
volumeMounts: [
{
name: params.dataVolume,
mountPath: mountPath,
},
{
name: "github-token",
mountPath: "/secret/github-token",
},
{
name: "gcp-credentials",
mountPath: "/secret/gcp-credentials",
},
],
},
},
}; // buildTemplate
// Create a list of dictionaries.
// Each item is a dictionary describing one step in the graph.
local dagTemplates = [
{
template: buildTemplate {
name: "checkout",
command:
["/usr/local/bin/checkout.sh", srcRootDir],
env_vars: [{
name: "EXTRA_REPOS",
// TODO(jlewi): Pin to commit on master when #281 is checked in.
value: "kubeflow/testing@HEAD:281",
}],
},
dependencies: null,
}, // checkout
{
// TODO(https://github.com/kubeflow/testing/issues/257): Create-pr-symlink
// should be done by run_e2e_workflow.py
template: buildTemplate {
name: "create-pr-symlink",
command: [
"python",
"-m",
"kubeflow.testing.prow_artifacts",
"--artifacts_dir=" + outputDir,
"create_pr_symlink",
"--bucket=" + params.bucket,
],
}, // create-pr-symlink
dependencies: ["checkout"],
}, // create-pr-symlink
{
// Configure KUBECONFIG
template: buildTemplate {
name: "get-kubeconfig",
command: util.buildCommand([
[
"gcloud",
"auth",
"activate-service-account",
"--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
],
[
"gcloud",
"--project=" + params.kfProject,
"container",
"clusters",
"get-credentials",
"--zone=" + params.kfZone,
params.kfCluster,
]]
),
},
dependencies: ["checkout"],
}, // get-kubeconfig
{
// Create the namespace
// TODO(jlewi): We should add some sort of retry.
template: buildTemplate {
name: "create-namespace",
command: util.buildCommand([
[
"echo",
"KUBECONFIG=",
"${KUBECONFIG}",
],
[
"gcloud",
"auth",
"activate-service-account",
"--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
],
[
"kubectl",
"config" ,
"current-context",
],
[
"kubectl",
"create",
"namespace",
testNamespace,
],
# Copy the GCP secret from the kubeflow namespace to the test namespace
[
srcDir + "/test/copy_secret.sh",
"kubeflow",
testNamespace,
"user-gcp-sa",
]]
),
},
dependencies: ["get-kubeconfig"],
}, // create-namespace
{
template: buildTemplate {
name: "execute-notebook",
command: [
"python3",
"xgboost_test.py",
"--name=" + "xgboost-test-" + prowDict["BUILD_ID"],
"--namespace=" + testNamespace,
"--image=" + executeImage,
"--jobType=" + prowDict["JOB_TYPE"],
"--pullNumber=" + prowDict["PULL_NUMBER"],
"--pullBaseSHA=" + prowDict["PULL_BASE_SHA"],
"--cluster=" + params.kfCluster,
],
pythonPath: kubeflowTestingPy,
workingDir: srcDir + "/xgboost_synthetic/testing",
},
dependencies: ["create-namespace"],
}, // execute-notebook
];
// Dag defines the tasks in the graph
local dag = {
name: "e2e",
// Construct tasks from the templates
// we will give the steps the same name as the template
dag: {
tasks: util.toArgoTaskList(dagTemplates),
},
}; // dag
// Define templates for the steps to be performed when the
// test exits
local deleteTemplates = if params.deleteNamespace then
[
{
// Delete the namespace
// TODO(jlewi): We should add some sort of retry.
template: buildTemplate {
name: "delete-namespace",
command: util.buildCommand([
[
"gcloud",
"auth",
"activate-service-account",
"--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
],
[
"kubectl",
"delete",
"namespace",
testNamespace,
]]
),
},
}, // delete-namespace
] else [];
local exitTemplates =
deleteTemplates +
[
{
// Copy artifacts to GCS for gubernator.
// TODO(https://github.com/kubeflow/testing/issues/257): Create-pr-symlink
// should be done by run_e2e_workflow.py
template: buildTemplate {
name: "copy-artifacts",
command: [
"python",
"-m",
"kubeflow.testing.prow_artifacts",
"--artifacts_dir=" + outputDir,
"copy_artifacts",
"--bucket=" + bucket,
],
}, // copy-artifacts
},
{
// Delete the test directory in NFS.
// TODO(https://github.com/kubeflow/testing/issues/256): Use an external process to do this.
template:
buildTemplate {
name: "test-dir-delete",
command: [
"rm",
"-rf",
testDir,
],
argoTemplate+: {
retryStrategy: {
limit: 3,
},
},
}, // test-dir-delete
dependencies: ["copy-artifacts"] + if params.deleteNamespace then ["delete-namespace"] else [],
},
];
// Create a DAG representing the set of steps to execute on exit
local exitDag = {
name: "exit-handler",
// Construct tasks from the templates
// we will give the steps the same name as the template
dag: {
tasks: util.toArgoTaskList(exitTemplates),
},
};
// A list of templates for the actual steps
local stepTemplates = std.map(function(i) i.template.argoTemplate, dagTemplates) +
  std.map(function(i) i.template.argoTemplate, exitTemplates);
// Define the Argo Workflow.
local workflow = {
apiVersion: "argoproj.io/v1alpha1",
kind: "Workflow",
metadata: {
name: params.name,
namespace: env.namespace,
labels: prowDict + {
workflow: params.name,
workflow_template: workflow_template,
},
},
spec: {
entrypoint: "e2e",
// Have Argo garbage collect old workflows; otherwise we overload the API server.
ttlSecondsAfterFinished: 7 * 24 * 60 * 60,
volumes: [
{
name: "github-token",
secret: {
secretName: "github-token",
},
},
{
name: "gcp-credentials",
secret: {
secretName: "kubeflow-testing-credentials",
},
},
{
name: params.dataVolume,
persistentVolumeClaim: {
claimName: params.nfsVolumeClaim,
},
},
], // volumes
// onExit specifies the template that should always run when the workflow completes.
onExit: "exit-handler",
// The templates will be a combination of the templates
// defining the dags executed by Argo as well as the templates
// for the individual steps.
templates: [dag, exitDag] + stepTemplates, // templates
}, // spec
}; // workflow
std.prune(k.core.v1.list.new([workflow]))
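// To eyeball the rendered workflow locally, something like
// `ks show <environment> -c xgboost_synthetic` should work, assuming this component
// lives in a configured ksonnet app.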

View File

@@ -44,7 +44,7 @@ build: build-dir
build-gcb: build-dir
gcloud builds submit --machine-type=n1-highcpu-32 --project=$(PROJECT) --tag=$(IMG):$(TAG) \
--timeout=3600 ./build
--timeout=3600 ./.build
@echo Built $(IMG):$(TAG)
# Build but don't attach the latest tag. This allows manual testing/inspection of the image

View File

@@ -692,7 +692,7 @@
],
"source": [
"from fairing.builders import cluster\n",
"preprocessor = ConvertNotebookPreprocessorWithFire(\"ModelServe\")\n",
"preprocessor = ConvertNotebookPreprocessorWithFire(class_name='ModelServe', notebook_file='build-train-deploy.ipynb')\n",
"\n",
"if not preprocessor.input_files:\n",
" preprocessor.input_files = set()\n",

View File

@@ -10,3 +10,4 @@ retrying
seldon-core
sklearn
xgboost
tornado>=6.0.3

View File

@@ -0,0 +1,43 @@
import tempfile
import logging
import os
import subprocess
logger = logging.getLogger(__name__)
def prepare_env():
subprocess.check_call(["pip3", "install", "-U", "papermill"])
subprocess.check_call(["pip3", "install", "-r", "../requirements.txt"])
def execute_notebook(notebook_path, parameters=None):
temp_dir = tempfile.mkdtemp()
notebook_output_path = os.path.join(temp_dir, "out.ipynb")
papermill.execute_notebook(notebook_path, notebook_output_path,
cwd=os.path.dirname(notebook_path),
parameters=parameters,
log_output=True)
return notebook_output_path
def run_notebook_test(notebook_path, expected_messages, parameters=None):
  output_path = execute_notebook(notebook_path, parameters=parameters)
  with open(output_path, 'r') as f:
    actual_output = f.read()
  for expected_message in expected_messages:
    if expected_message not in actual_output:
      logger.error(actual_output)
      assert False, "Unable to find in output: " + expected_message
if __name__ == "__main__":
prepare_env()
import papermill #pylint: disable=import-error
FILE_DIR = os.path.dirname(__file__)
NOTEBOOK_REL_PATH = "../build-train-deploy.ipynb"
NOTEBOOK_ABS_PATH = os.path.normpath(os.path.join(FILE_DIR, NOTEBOOK_REL_PATH))
  EXPECTED_MSGS = [
"Finished upload of",
"Model export success: mockup-model.dat",
"Pod started running True",
"Cluster endpoint: http:",
]
  run_notebook_test(NOTEBOOK_ABS_PATH, EXPECTED_MSGS)

View File

@@ -0,0 +1,51 @@
apiVersion: batch/v1
kind: Job
metadata:
name: $(job_name)
spec:
template:
metadata:
labels:
app: xgboost-synthetics-testing
spec:
restartPolicy: Never
securityContext:
runAsUser: 0
initContainers:
# This init container checks out the source code.
- command:
- /usr/local/bin/checkout_repos.sh
- --repos=kubeflow/examples@$(CHECK_TAG)
- --src_dir=/src
- --depth=all
name: checkout
image: gcr.io/kubeflow-ci/test-worker:v20190802-c6f9140-e3b0c4
volumeMounts:
- mountPath: /src
name: src
env:
- name: CHECK_TAG
valueFrom:
configMapKeyRef:
name: xgb-notebooks-tests
key: checkTag
containers:
- name: executing-notebooks
image: execute-image
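        # "execute-image" is a placeholder; xgboost_test.py substitutes the real notebook
        # image via `kustomize edit set image execute-image=<image>`.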
command: ["python3", "execute_notebook.py"]
workingDir: /src/kubeflow/examples/xgboost_synthetic/testing
volumeMounts:
- mountPath: /var/secrets
name: user-gcp-sa
readOnly: true
- mountPath: /src
name: src
env:
- name: GOOGLE_APPLICATION_CREDENTIALS
value: /var/secrets/user-gcp-sa.json
volumes:
- name: user-gcp-sa
secret:
secretName: user-gcp-sa
- name: src
emptyDir: {}

View File

@@ -0,0 +1,28 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kubeflow
generatorOptions:
disableNameSuffixHash: true
resources:
- job.yaml
- role.yaml
- rolebinding.yaml
configurations:
- params.yaml
configMapGenerator:
- name: xgb-notebooks-tests
literals:
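  # Intentionally left empty; xgboost_test.py fills in the job name and checkout tag
  # at test time via `kustomize edit add configmap`.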
vars:
- fieldref:
fieldPath: data.name
name: job_name
objref:
apiVersion: v1
kind: ConfigMap
name: xgb-notebooks-tests

View File

@@ -0,0 +1,3 @@
varReference:
- path: metadata/name
kind: Job

View File

@@ -0,0 +1,37 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app: xgboost-synthetics-testing
name: xgboost-synthetics-testing-role
rules:
- apiGroups:
- ""
resources:
- pods
- pods/log
- secrets
- services
verbs:
- '*'
- apiGroups:
- ""
- apps
- extensions
resources:
- deployments
- replicasets
verbs:
- '*'
- apiGroups:
- kubeflow.org
resources:
- '*'
verbs:
- '*'
- apiGroups:
- batch
resources:
- jobs
verbs:
- '*'

View File

@@ -0,0 +1,14 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app: xgboost-synthetics-testing
name: xgboost-synthetics-testing-role-binding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: xgboost-synthetics-testing-role
subjects:
- kind: Group
name: system:serviceaccounts
apiGroup: rbac.authorization.k8s.io

View File

@@ -0,0 +1,100 @@
import argparse
import logging
import os
import subprocess
import time
from kubeflow.testing import util
def create_job(args, app_dir): #pylint:disable=redefined-outer-name
  '''Generate the Job and submit it.'''
util.run(['gcloud', 'auth', 'activate-service-account',
"--key-file=/secret/gcp-credentials/key.json"], cwd=app_dir)
util.run(['gcloud', '--project=kubeflow-ci-deployment', 'container',
"clusters", "get-credentials", "--zone=us-east1-b", args.cluster], cwd=app_dir)
configmap = 'xgb-notebooks-tests'
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
'--from-literal=name=' + args.name], cwd=app_dir)
  # For presubmit jobs, check out HEAD:<PULL_NUMBER>; otherwise check out PULL_BASE_SHA.
if args.jobType == 'presubmit':
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
'--from-literal=checkTag=HEAD:' + args.pullNumber], cwd=app_dir)
else:
util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
'--from-literal=checkTag=' + args.pullBaseSHA], cwd=app_dir)
util.run(['kustomize', 'edit', 'set', 'namespace', args.namespace], cwd=app_dir)
util.run(['kustomize', 'edit', 'set', 'image', 'execute-image=' + args.image], cwd=app_dir)
util.run(['kustomize', 'build', app_dir, '-o', 'generated.yaml'], cwd=app_dir)
util.run(['kubectl', 'apply', '-f', 'generated.yaml'], cwd=app_dir)
logging.info("Created job %s in namespaces %s", args.name, args.namespace)
def get_pod_logs(name, namespace, app_dir): #pylint:disable=redefined-outer-name
  '''Get pod logs with kubectl (the k8s Python API cannot be used here).'''
logging.info("Getting pod %s logs...", name)
util.run(['kubectl', 'logs', name, '-n', namespace], cwd=app_dir)
def check_job_status(namespace, app_dir): #pylint:disable=redefined-outer-name
  '''Check the job status with kubectl (the k8s Python API cannot be used here).'''
  succeeded = False
pod_info, pod_name, pod_status = '', '', ''
for _ in range(0, 30):
time.sleep(60)
subCmd = "kubectl get pod -n " + namespace + " | grep -m1 xgboost-test"
pod_info = subprocess.run(subCmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True,
universal_newlines=True)
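    # `kubectl get pod` lines look like: NAME READY STATUS RESTARTS AGE, so the
    # first field is the pod name and the third is the pod status.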
    fields = pod_info.stdout.split()
    if len(fields) >= 3:
      pod_name = fields[0]
      pod_status = fields[2]
if pod_name:
if pod_status == "Pending":
logging.info("Pod %s is Pending.", pod_name)
elif pod_status == "Running":
logging.info("Pod %s is Running.", pod_name)
elif pod_status == "Completed":
logging.info("Pod %s is Completed.", pod_name)
get_pod_logs(pod_name, namespace, app_dir)
        succeeded = True
break
elif pod_status == "Error":
get_pod_logs(pod_name, namespace, app_dir)
raise RuntimeError("Failed to execute notebook.")
else:
logging.warning("Pod %s status %s.", pod_name, pod_status)
else:
logging.warning("Cannot get the pod name, retry after 60 seconds.")
  if not succeeded:
    raise RuntimeError("Timed out waiting for the notebook execution pod after 30 minutes.")
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument(
"--name", help="deploy application name", type=str, required=True)
parser.add_argument(
"--namespace", help="The namespace for the application", type=str, required=True)
parser.add_argument(
"--image", help="Image name for the application", type=str, required=True)
parser.add_argument(
"--pullNumber", help="The PR number", type=str, required=True)
parser.add_argument(
"--pullBaseSHA", help="The pull base SHA", type=str, required=True)
parser.add_argument(
"--jobType", help="The job type such as presubmit or postsubmit", type=str, required=True)
parser.add_argument(
"--cluster", help="The cluster which the applition running in", type=str, required=True)
app_dir = os.path.dirname(__file__)
app_dir = os.path.abspath(app_dir)
args = parser.parse_args()
create_job(args, app_dir)
check_job_status(args.namespace, app_dir)