mirror of https://github.com/kubeflow/examples.git
add testing for xgboost_synthetic (#633)
This commit is contained in:
parent e37a9d7acd
commit 4f8cf87d4f
@@ -61,3 +61,14 @@ workflows:
     - postsubmit
     include_dirs:
     - pytorch_mnist/*
+
+  # E2E test for xgboost-synthetic
+  - app_dir: kubeflow/examples/test/workflows
+    component: xgboost_synthetic
+    name: xgboost2
+    job_types:
+    - periodic
+    - presubmit
+    - postsubmit
+    include_dirs:
+    - xgboost_synthetic/*

@@ -36,6 +36,12 @@
       namespace: "kubeflow-test-infra",
       prow_env: "BUILD_NUMBER=997a,BUILD_ID=997a,JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=374,REPO_NAME=examples,REPO_OWNER=kubeflow",
     },
+    xgboost_synthetic: {
+      bucket: "kubeflow-ci_temp",
+      name: "kubeflow-xgboost_synthetic",
+      namespace: "kubeflow-test-infra",
+      prow_env: "BUILD_NUMBER=997a,BUILD_ID=997a,JOB_NAME=kubeflow-examples-presubmit-test,JOB_TYPE=presubmit,PULL_NUMBER=374,REPO_NAME=examples,REPO_OWNER=kubeflow",
+    },
     workflows: {
       bucket: "kubeflow-ci_temp",
       name: "kubeflow-examples-presubmit-test-374-6e32",

@@ -0,0 +1,439 @@
// Test workflow for the XGBoost synthetic example.
//
local env = std.extVar("__ksonnet/environments");
local overrides = std.extVar("__ksonnet/params").components.xgboost_synthetic;

local k = import "k.libsonnet";
local util = import "util.libsonnet";

// Define default params and then combine them with any overrides.
local defaultParams = {
  // local nfsVolumeClaim: "kubeflow-testing",
  nfsVolumeClaim: "nfs-external",

  // The name to use for the volume that contains the test data.
  dataVolume: "kubeflow-test-volume",

  // Default step image:
  stepImage: "gcr.io/kubeflow-ci/test-worker:v20190802-c6f9140-e3b0c4",

  // Which Kubeflow cluster to use for running the test job on.
  kfProject: "kubeflow-ci-deployment",
  kfZone: "us-east1-b",
  kfCluster: "kf-vmaster-n00",

  // The bucket where the model should be written.
  // This needs to be writable by the GCP service account in the Kubeflow cluster (not the test cluster).
  modelBucket: "kubeflow-ci_temp",

  // Whether to delete the namespace at the end.
  // Leaving the namespace around can be useful for debugging.
  //
  // TODO(jlewi): We should consider running a cronjob to GC namespaces.
  // But if we leave namespaces up, then we end up leaving the servers up, which
  // uses up CPU.
  //
  deleteNamespace: true,
};

local params = defaultParams + overrides;

local prowEnv = util.parseEnv(params.prow_env);

// Create a dictionary of the different prow variables so we can refer to them in the workflow.
//
// Important: We want to initialize all variables we reference to some value. If we don't,
// and we reference a variable which doesn't get set, then we get very hard-to-debug failure messages.
// In particular, we've seen problems where, if we add a new environment and evaluate one component
// (e.g. "workflows") and another component (e.g. "code_search.jsonnet") doesn't have a default value
// for BUILD_ID, then ksonnet fails because BUILD_ID is undefined.
local prowDict = {
  BUILD_ID: "notset",
  BUILD_NUMBER: "notset",
  REPO_OWNER: "notset",
  REPO_NAME: "notset",
  JOB_NAME: "notset",
  JOB_TYPE: "notset",
  PULL_NUMBER: "notset",
  PULL_BASE_SHA: "notset",
} + util.listOfDictToMap(prowEnv);

local bucket = params.bucket;

// mountPath is the directory where the volume to store the test data
// should be mounted.
local mountPath = "/mnt/" + "test-data-volume";
// testDir is the root directory for all data for a particular test run.
local testDir = mountPath + "/" + params.name;
// outputDir is the directory to sync to GCS to contain the output for this job.
local outputDir = testDir + "/output";
local artifactsDir = outputDir + "/artifacts";

// Source directory where all repos should be checked out.
local srcRootDir = testDir + "/src";

// The directory containing the kubeflow/examples repo.
local srcDir = srcRootDir + "/" + prowDict.REPO_OWNER + "/" + prowDict.REPO_NAME;

// The image used by the execute-notebook step.
local executeImage = "gcr.io/kubeflow-images-public/tensorflow-1.13.1-notebook-cpu:v0.5.0";

// Value of the KUBECONFIG environment variable. This should be a full path.
local kubeConfig = testDir + "/.kube/kubeconfig";

// Namespace where tests should run.
local testNamespace = "xgboost-synthetic-" + prowDict["BUILD_ID"];

// The directory within the kubeflow_testing submodule containing
// py scripts to use.
local kubeflowTestingPy = srcRootDir + "/kubeflow/testing/py";

// workflow_template is the name of the workflow template; typically the name of the ks component.
// This is used as a label to make it easy to identify all Argo workflows created from a given
// template.
local workflow_template = "xgboost_synthetic";

// buildTemplate is a template for constructing Argo step templates.
//
// step_name: Name for the template
// command: List to pass as the container command.
//
// We customize the defaults for each step in the workflow by modifying
// buildTemplate.argoTemplate.
local buildTemplate = {
  // name & command variables should be overwritten for every test.
  // Other variables can be changed per step as needed.
  // They are hidden because they shouldn't be included in the Argo template.
  name: "",
  command:: "",
  image: params.stepImage,
  workingDir:: null,
  env_vars:: [],
  side_cars: [],
  pythonPath: kubeflowTestingPy,

  activeDeadlineSeconds: 1800,  // Set a 30 minute timeout for each template.

  local template = self,

  // Actual template for Argo.
  argoTemplate: {
    name: template.name,
    metadata: {
      labels: prowDict + {
        workflow: params.name,
        workflow_template: workflow_template,
        step_name: template.name,
      },
    },
    container: {
      command: template.command,
      name: template.name,
      image: template.image,
      workingDir: template.workingDir,
      env: [
        {
          // Add the source directories to the python path.
          name: "PYTHONPATH",
          value: template.pythonPath,
        },
        {
          name: "GOOGLE_APPLICATION_CREDENTIALS",
          value: "/secret/gcp-credentials/key.json",
        },
        {
          name: "GITHUB_TOKEN",
          valueFrom: {
            secretKeyRef: {
              name: "github-token",
              key: "github_token",
            },
          },
        },
        {
          // We use a directory in our NFS share to store our kube config.
          // This way we can configure it on a single step and reuse it on subsequent steps.
          name: "KUBECONFIG",
          value: kubeConfig,
        },
      ] + prowEnv + template.env_vars,
      volumeMounts: [
        {
          name: params.dataVolume,
          mountPath: mountPath,
        },
        {
          name: "github-token",
          mountPath: "/secret/github-token",
        },
        {
          name: "gcp-credentials",
          mountPath: "/secret/gcp-credentials",
        },
      ],
    },
  },
};  // buildTemplate


// Create a list of dictionaries.
// Each item is a dictionary describing one step in the graph.
local dagTemplates = [
  {
    template: buildTemplate {
      name: "checkout",
      command:
        ["/usr/local/bin/checkout.sh", srcRootDir],

      env_vars: [{
        name: "EXTRA_REPOS",
        // TODO(jlewi): Pin to a commit on master when #281 is checked in.
        value: "kubeflow/testing@HEAD:281",
      }],
    },
    dependencies: null,
  },  // checkout
  {
    // TODO(https://github.com/kubeflow/testing/issues/257): create-pr-symlink
    // should be done by run_e2e_workflow.py.
    template: buildTemplate {
      name: "create-pr-symlink",
      command: [
        "python",
        "-m",
        "kubeflow.testing.prow_artifacts",
        "--artifacts_dir=" + outputDir,
        "create_pr_symlink",
        "--bucket=" + params.bucket,
      ],
    },  // create-pr-symlink
    dependencies: ["checkout"],
  },  // create-pr-symlink
  {
    // Configure KUBECONFIG.
    template: buildTemplate {
      name: "get-kubeconfig",
      command: util.buildCommand([
        [
          "gcloud",
          "auth",
          "activate-service-account",
          "--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
        ],
        [
          "gcloud",
          "--project=" + params.kfProject,
          "container",
          "clusters",
          "get-credentials",
          "--zone=" + params.kfZone,
          params.kfCluster,
        ]]
      ),
    },
    dependencies: ["checkout"],
  },  // get-kubeconfig
  {
    // Create the namespace.
    // TODO(jlewi): We should add some sort of retry.
    template: buildTemplate {
      name: "create-namespace",
      command: util.buildCommand([
        [
          "echo",
          "KUBECONFIG=",
          "${KUBECONFIG}",
        ],
        [
          "gcloud",
          "auth",
          "activate-service-account",
          "--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
        ],
        [
          "kubectl",
          "config",
          "current-context",
        ],
        [
          "kubectl",
          "create",
          "namespace",
          testNamespace,
        ],
        // Copy the GCP secret from the kubeflow namespace to the test namespace.
        [
          srcDir + "/test/copy_secret.sh",
          "kubeflow",
          testNamespace,
          "user-gcp-sa",
        ]]
      ),
    },
    dependencies: ["get-kubeconfig"],
  },  // create-namespace
  {
    template: buildTemplate {
      name: "execute-notebook",
      command: [
        "python3",
        "xgboost_test.py",
        "--name=" + "xgboost-test-" + prowDict["BUILD_ID"],
        "--namespace=" + testNamespace,
        "--image=" + executeImage,
        "--jobType=" + prowDict["JOB_TYPE"],
        "--pullNumber=" + prowDict["PULL_NUMBER"],
        "--pullBaseSHA=" + prowDict["PULL_BASE_SHA"],
        "--cluster=" + params.kfCluster,
      ],
      pythonPath: kubeflowTestingPy,
      workingDir: srcDir + "/xgboost_synthetic/testing",
    },
    dependencies: ["create-namespace"],
  },  // execute-notebook
];

// dag defines the tasks in the graph.
local dag = {
  name: "e2e",
  // Construct tasks from the templates;
  // we will give the steps the same name as the template.
  dag: {
    tasks: util.toArgoTaskList(dagTemplates),
  },
};  // dag

// Define templates for the steps to be performed when the
// test exits.

local deleteTemplates = if params.deleteNamespace then
  [
    {
      // Delete the namespace.
      // TODO(jlewi): We should add some sort of retry.
      template: buildTemplate {
        name: "delete-namespace",
        command: util.buildCommand([
          [
            "gcloud",
            "auth",
            "activate-service-account",
            "--key-file=${GOOGLE_APPLICATION_CREDENTIALS}",
          ],
          [
            "kubectl",
            "delete",
            "namespace",
            testNamespace,
          ]]
        ),
      },
    },  // delete-namespace
  ] else [];

local exitTemplates =
  deleteTemplates +
  [
    {
      // Copy artifacts to GCS for gubernator.
      // TODO(https://github.com/kubeflow/testing/issues/257): create-pr-symlink
      // should be done by run_e2e_workflow.py.
      template: buildTemplate {
        name: "copy-artifacts",
        command: [
          "python",
          "-m",
          "kubeflow.testing.prow_artifacts",
          "--artifacts_dir=" + outputDir,
          "copy_artifacts",
          "--bucket=" + bucket,
        ],
      },  // copy-artifacts
    },
    {
      // Delete the test directory in NFS.
      // TODO(https://github.com/kubeflow/testing/issues/256): Use an external process to do this.
      template:
        buildTemplate {
          name: "test-dir-delete",
          command: [
            "rm",
            "-rf",
            testDir,
          ],

          argoTemplate+: {
            retryStrategy: {
              limit: 3,
            },
          },
        },  // test-dir-delete
      dependencies: ["copy-artifacts"] + if params.deleteNamespace then ["delete-namespace"] else [],
    },
  ];

// Create a DAG representing the set of steps to execute on exit.
local exitDag = {
  name: "exit-handler",
  // Construct tasks from the templates;
  // we will give the steps the same name as the template.
  dag: {
    tasks: util.toArgoTaskList(exitTemplates),
  },
};

// A list of templates for the actual steps.
local stepTemplates = std.map(function(i) i.template.argoTemplate,
                              dagTemplates) +
                      std.map(function(i) i.template.argoTemplate,
                              exitTemplates);

// Define the Argo Workflow.
local workflow = {
  apiVersion: "argoproj.io/v1alpha1",
  kind: "Workflow",
  metadata: {
    name: params.name,
    namespace: env.namespace,
    labels: prowDict + {
      workflow: params.name,
      workflow_template: workflow_template,
    },
  },
  spec: {
    entrypoint: "e2e",
    // Have Argo garbage collect old workflows; otherwise we overload the API server.
    ttlSecondsAfterFinished: 7 * 24 * 60 * 60,
    volumes: [
      {
        name: "github-token",
        secret: {
          secretName: "github-token",
        },
      },
      {
        name: "gcp-credentials",
        secret: {
          secretName: "kubeflow-testing-credentials",
        },
      },
      {
        name: params.dataVolume,
        persistentVolumeClaim: {
          claimName: params.nfsVolumeClaim,
        },
      },
    ],  // volumes

    // onExit specifies the template that should always run when the workflow completes.
    onExit: "exit-handler",

    // The templates will be a combination of the templates
    // defining the dags executed by Argo as well as the templates
    // for the individual steps.
    templates: [dag, exitDag] + stepTemplates,  // templates
  },  // spec
};  // workflow

std.prune(k.core.v1.list.new([workflow]))
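
A note on the prowEnv / prowDict plumbing in the workflow above: params.prow_env is the comma-separated KEY=VALUE string shown in the params hunk earlier, and util.parseEnv / util.listOfDictToMap come from util.libsonnet in kubeflow/testing, so their exact behavior is inferred from how they are used here. In Python terms, the combined transformation looks roughly like this sketch:

def parse_prow_env(prow_env):
    """Turn "BUILD_NUMBER=997a,BUILD_ID=997a,..." into a dict.

    Mirrors (by assumption) util.parseEnv + util.listOfDictToMap: split on
    commas, then on the first '=', and overlay the result on the "notset"
    defaults so every key the workflow references has *some* value.
    """
    defaults = {k: "notset" for k in [
        "BUILD_ID", "BUILD_NUMBER", "REPO_OWNER", "REPO_NAME",
        "JOB_NAME", "JOB_TYPE", "PULL_NUMBER", "PULL_BASE_SHA"]}
    pairs = (item.split("=", 1) for item in prow_env.split(",") if item)
    return {**defaults, **{k: v for k, v in pairs}}

# Example with a value from the params hunk above:
env = parse_prow_env("BUILD_NUMBER=997a,BUILD_ID=997a,JOB_TYPE=presubmit,PULL_NUMBER=374")
assert env["PULL_NUMBER"] == "374" and env["PULL_BASE_SHA"] == "notset"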
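Similarly, util.toArgoTaskList is what turns the dagTemplates and exitTemplates lists into Argo DAG tasks. Its behavior is assumed from the fields used above; conceptually it does something like:

def to_argo_task_list(dag_templates):
    # Assumed behavior of util.toArgoTaskList: one Argo DAG task per entry,
    # named after its step template and carrying the entry's dependencies.
    tasks = []
    for entry in dag_templates:
        name = entry["template"]["name"]
        task = {"name": name, "template": name}
        if entry.get("dependencies"):
            task["dependencies"] = entry["dependencies"]
        tasks.append(task)
    return tasks

# E.g. the "execute-notebook" entry would become:
# {"name": "execute-notebook", "template": "execute-notebook",
#  "dependencies": ["create-namespace"]}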

@@ -44,7 +44,7 @@ build: build-dir
 
 build-gcb: build-dir
 	gcloud builds submit --machine-type=n1-highcpu-32 --project=$(PROJECT) --tag=$(IMG):$(TAG) \
-		--timeout=3600 ./build
+		--timeout=3600 ./.build
 	@echo Built $(IMG):$(TAG)
 
 # Build but don't attach the latest tag. This allows manual testing/inspection of the image

@@ -692,7 +692,7 @@
    ],
    "source": [
     "from fairing.builders import cluster\n",
-    "preprocessor = ConvertNotebookPreprocessorWithFire(\"ModelServe\")\n",
+    "preprocessor = ConvertNotebookPreprocessorWithFire(class_name='ModelServe', notebook_file='build-train-deploy.ipynb')\n",
     "\n",
     "if not preprocessor.input_files:\n",
     "    preprocessor.input_files = set()\n",

@@ -10,3 +10,4 @@ retrying
 seldon-core
 sklearn
 xgboost
+tornado>=6.0.3

@@ -0,0 +1,43 @@
import logging
import os
import subprocess
import tempfile


logger = logging.getLogger(__name__)

def prepare_env():
  subprocess.check_call(["pip3", "install", "-U", "papermill"])
  subprocess.check_call(["pip3", "install", "-r", "../requirements.txt"])


def execute_notebook(notebook_path, parameters=None):
  temp_dir = tempfile.mkdtemp()
  notebook_output_path = os.path.join(temp_dir, "out.ipynb")
  papermill.execute_notebook(notebook_path, notebook_output_path,
                             cwd=os.path.dirname(notebook_path),
                             parameters=parameters,
                             log_output=True)
  return notebook_output_path

def run_notebook_test(notebook_path, expected_messages, parameters=None):
  output_path = execute_notebook(notebook_path, parameters=parameters)
  actual_output = open(output_path, 'r').read()
  for expected_message in expected_messages:
    if not expected_message in actual_output:
      logger.error(actual_output)
      assert False, "Unable to find expected message in output: " + expected_message

if __name__ == "__main__":
  prepare_env()
  # papermill is installed by prepare_env() above, so it can only be imported
  # after that call; the import still binds a module-level name that
  # execute_notebook() can see.
  import papermill #pylint: disable=import-error
  FILE_DIR = os.path.dirname(__file__)
  NOTEBOOK_REL_PATH = "../build-train-deploy.ipynb"
  NOTEBOOK_ABS_PATH = os.path.normpath(os.path.join(FILE_DIR, NOTEBOOK_REL_PATH))
  EXPECTED_MGS = [
    "Finished upload of",
    "Model export success: mockup-model.dat",
    "Pod started running True",
    "Cluster endpoint: http:",
  ]
  run_notebook_test(NOTEBOOK_ABS_PATH, EXPECTED_MGS)
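
The helper above can also be pointed at the notebook for a local debugging run. A minimal sketch, assuming papermill is already installed and paths resolve the same way the __main__ block's do; the parameters key below is hypothetical and only takes effect if the notebook has a cell tagged "parameters":

# Hypothetical local invocation of run_notebook_test from the file above.
run_notebook_test(
    "../build-train-deploy.ipynb",
    expected_messages=["Model export success: mockup-model.dat"],
    parameters={"num_iterations": 2},  # hypothetical parameter name
)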

@@ -0,0 +1,51 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: $(job_name)
spec:
  template:
    metadata:
      labels:
        app: xgboost-synthetics-testing
    spec:
      restartPolicy: Never
      securityContext:
        runAsUser: 0
      initContainers:
      # This init container checks out the source code.
      - command:
        - /usr/local/bin/checkout_repos.sh
        - --repos=kubeflow/examples@$(CHECK_TAG)
        - --src_dir=/src
        - --depth=all
        name: checkout
        image: gcr.io/kubeflow-ci/test-worker:v20190802-c6f9140-e3b0c4
        volumeMounts:
        - mountPath: /src
          name: src
        env:
        - name: CHECK_TAG
          valueFrom:
            configMapKeyRef:
              name: xgb-notebooks-tests
              key: checkTag
      containers:
      - name: executing-notebooks
        image: execute-image
        command: ["python3", "execute_notebook.py"]
        workingDir: /src/kubeflow/examples/xgboost_synthetic/testing
        volumeMounts:
        - mountPath: /var/secrets
          name: user-gcp-sa
          readOnly: true
        - mountPath: /src
          name: src
        env:
        - name: GOOGLE_APPLICATION_CREDENTIALS
          value: /var/secrets/user-gcp-sa.json
      volumes:
      - name: user-gcp-sa
        secret:
          secretName: user-gcp-sa
      - name: src
        emptyDir: {}

@@ -0,0 +1,28 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: kubeflow

generatorOptions:
  disableNameSuffixHash: true

resources:
- job.yaml
- role.yaml
- rolebinding.yaml

configurations:
- params.yaml

configMapGenerator:
- name: xgb-notebooks-tests
  literals:

vars:
- fieldref:
    fieldPath: data.name
  name: job_name
  objref:
    apiVersion: v1
    kind: ConfigMap
    name: xgb-notebooks-tests
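Note that the configMapGenerator above deliberately leaves literals empty: the test driver fills in name and checkTag at run time with kustomize edit, and the vars entry then substitutes data.name into the Job's metadata/name (wired up by params.yaml below). A rough sketch of that edit sequence with placeholder values; the real values come from prow environment variables in xgboost_test.py at the end of this commit:

import subprocess

APP_DIR = "xgboost_synthetic/testing"  # directory containing this kustomization.yaml

# Fill the empty configMapGenerator literals, then render the overlay.
for literal in ["name=xgboost-test-997a", "checkTag=HEAD:374"]:  # placeholder values
    subprocess.check_call(
        ["kustomize", "edit", "add", "configmap", "xgb-notebooks-tests",
         "--from-literal=" + literal], cwd=APP_DIR)
subprocess.check_call(["kustomize", "build", ".", "-o", "generated.yaml"], cwd=APP_DIR)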

@@ -0,0 +1,3 @@
varReference:
- path: metadata/name
  kind: Job

@@ -0,0 +1,37 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  labels:
    app: xgboost-synthetics-testing
  name: xgboost-synthetics-testing-role
rules:
- apiGroups:
  - ""
  resources:
  - pods
  - pods/log
  - secrets
  - services
  verbs:
  - '*'
- apiGroups:
  - ""
  - apps
  - extensions
  resources:
  - deployments
  - replicasets
  verbs:
  - '*'
- apiGroups:
  - kubeflow.org
  resources:
  - '*'
  verbs:
  - '*'
- apiGroups:
  - batch
  resources:
  - jobs
  verbs:
  - '*'

@@ -0,0 +1,14 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  labels:
    app: xgboost-synthetics-testing
  name: xgboost-synthetics-testing-role-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: xgboost-synthetics-testing-role
subjects:
- kind: Group
  name: system:serviceaccounts
  apiGroup: rbac.authorization.k8s.io

@@ -0,0 +1,100 @@
import argparse
import logging
import os
import subprocess
import time

from kubeflow.testing import util

def create_job(args, app_dir): #pylint:disable=redefined-outer-name
  '''Generate the Job and submit it.'''
  util.run(['gcloud', 'auth', 'activate-service-account',
            "--key-file=/secret/gcp-credentials/key.json"], cwd=app_dir)
  util.run(['gcloud', '--project=kubeflow-ci-deployment', 'container',
            "clusters", "get-credentials", "--zone=us-east1-b", args.cluster], cwd=app_dir)

  configmap = 'xgb-notebooks-tests'
  util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
            '--from-literal=name=' + args.name], cwd=app_dir)
  # For presubmit, set the checkout tag to HEAD:$(PULL_NUMBER); otherwise set it to PULL_BASE_SHA.
  if args.jobType == 'presubmit':
    util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
              '--from-literal=checkTag=HEAD:' + args.pullNumber], cwd=app_dir)
  else:
    util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
              '--from-literal=checkTag=' + args.pullBaseSHA], cwd=app_dir)
  util.run(['kustomize', 'edit', 'set', 'namespace', args.namespace], cwd=app_dir)
  util.run(['kustomize', 'edit', 'set', 'image', 'execute-image=' + args.image], cwd=app_dir)
  util.run(['kustomize', 'build', app_dir, '-o', 'generated.yaml'], cwd=app_dir)
  util.run(['kubectl', 'apply', '-f', 'generated.yaml'], cwd=app_dir)
  logging.info("Created job %s in namespace %s", args.name, args.namespace)

def get_pod_logs(name, namespace, app_dir): #pylint:disable=redefined-outer-name
  '''The k8s Python API cannot fetch the logs here, so use the kubectl command instead.'''
  logging.info("Getting pod %s logs...", name)
  util.run(['kubectl', 'logs', name, '-n', namespace], cwd=app_dir)

def check_job_status(namespace, app_dir): #pylint:disable=redefined-outer-name
  '''The k8s Python API cannot fetch the job here, so poll its pod status with kubectl.'''
  is_successed = False
  pod_info, pod_name, pod_status = '', '', ''
  for _ in range(0, 30):
    time.sleep(60)
    subCmd = "kubectl get pod -n " + namespace + " | grep -m1 xgboost-test"
    pod_info = subprocess.run(subCmd,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              shell=True,
                              universal_newlines=True)
    # kubectl's columns are NAME READY STATUS ...; we need at least three
    # fields to read the status safely.
    if len(str(pod_info.stdout).split()) >= 3:
      pod_name = str(pod_info.stdout).split()[0]
      pod_status = str(pod_info.stdout).split()[2]

    if pod_name:
      if pod_status == "Pending":
        logging.info("Pod %s is Pending.", pod_name)
      elif pod_status == "Running":
        logging.info("Pod %s is Running.", pod_name)
      elif pod_status == "Completed":
        logging.info("Pod %s is Completed.", pod_name)
        get_pod_logs(pod_name, namespace, app_dir)
        is_successed = True
        break
      elif pod_status == "Error":
        get_pod_logs(pod_name, namespace, app_dir)
        raise RuntimeError("Failed to execute notebook.")
      else:
        logging.warning("Pod %s status %s.", pod_name, pod_status)
    else:
      logging.warning("Cannot get the pod name; retrying after 60 seconds.")

  if not is_successed:
    raise RuntimeError("Timed out waiting for the notebook-execution pod after 30 minutes.")


if __name__ == "__main__":

  logging.basicConfig(level=logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument(
    "--name", help="The name of the deployed application", type=str, required=True)
  parser.add_argument(
    "--namespace", help="The namespace for the application", type=str, required=True)
  parser.add_argument(
    "--image", help="Image name for the application", type=str, required=True)
  parser.add_argument(
    "--pullNumber", help="The PR number", type=str, required=True)
  parser.add_argument(
    "--pullBaseSHA", help="The pull base SHA", type=str, required=True)
  parser.add_argument(
    "--jobType", help="The job type, such as presubmit or postsubmit", type=str, required=True)
  parser.add_argument(
    "--cluster", help="The cluster the application runs in", type=str, required=True)

  app_dir = os.path.dirname(__file__)
  app_dir = os.path.abspath(app_dir)

  args = parser.parse_args()
  create_job(args, app_dir)
  check_job_status(args.namespace, app_dir)
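
The column-grepping in check_job_status works but is sensitive to kubectl's human-readable output format. A sturdier variant (a sketch, not part of this commit) would ask for the pod phase directly via jsonpath, keyed off the app label that job.yaml sets:

import subprocess
import time

def wait_for_test_pod(namespace, selector="app=xgboost-synthetics-testing",
                      attempts=30, interval=60):
    """Poll the pod phase via jsonpath instead of grepping column output."""
    for _ in range(attempts):
        time.sleep(interval)
        phase = subprocess.run(
            ["kubectl", "get", "pod", "-n", namespace, "-l", selector,
             "-o", "jsonpath={.items[0].status.phase}"],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            universal_newlines=True).stdout
        if phase == "Succeeded":
            return
        if phase == "Failed":
            raise RuntimeError("Notebook-execution pod failed.")
    raise RuntimeError("Timed out waiting for the notebook-execution pod.")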