mirror of https://github.com/kubeflow/examples.git
Fix the xgboost_synthetic test so it actually runs and produces signal (#674)
* Fix the xgboost_synthetic test so it actually runs and produces signal
* The test wasn't actually running because we were passing arguments that were unknown to pytest
* Remove the old role.yaml; we don't use it anymore
* Wait for the Job to finish and properly report status; kubeflow/testing#514 contains the new routine
* The test still isn't passing because of kubeflow/examples#673
* In addition we need to fix the auto deployments kubeflow/testing#444
Related to kubeflow/examples#665
* Fix lint.
This commit is contained in:
parent
452aa428b6
commit
e2198ce1e8
|
|
@ -238,11 +238,9 @@ class Builder:
|
||||||
# Test xgboost
|
# Test xgboost
|
||||||
step_name = "xgboost-synthetic"
|
step_name = "xgboost-synthetic"
|
||||||
command = ["pytest", "xgboost_test.py",
|
command = ["pytest", "xgboost_test.py",
|
||||||
# I think -s mean stdout/stderr will print out to aid in debugging.
|
|
||||||
# Failures still appear to be captured and stored in the junit file.
|
|
||||||
"-s",
|
|
||||||
# Increase the log level so that info level log statements show up.
|
# Increase the log level so that info level log statements show up.
|
||||||
"--log-cli-level=info",
|
"--log-cli-level=info",
|
||||||
|
"--log-cli-format='%(levelname)s|%(asctime)s|%(pathname)s|%(lineno)d| %(message)s'",
|
||||||
# Test timeout in seconds.
|
# Test timeout in seconds.
|
||||||
"--timeout=1800",
|
"--timeout=1800",
|
||||||
"--junitxml=" + self.artifacts_dir + "/junit_xgboost-synthetic-test.xml",
|
"--junitxml=" + self.artifacts_dir + "/junit_xgboost-synthetic-test.xml",
|
||||||
|
|
|
||||||
|
|
@ -14,9 +14,6 @@ def pytest_addoption(parser):
|
||||||
parser.addoption(
|
parser.addoption(
|
||||||
"--repos", help="The repos to checkout; leave blank to use defaults",
|
"--repos", help="The repos to checkout; leave blank to use defaults",
|
||||||
type=str, default="")
|
type=str, default="")
|
||||||
parser.addoption(
|
|
||||||
"--cluster", help="The cluster which the applition running in", type=str,
|
|
||||||
default="")
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def name(request):
|
def name(request):
|
||||||
|
|
@ -33,7 +30,3 @@ def image(request):
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def repos(request):
|
def repos(request):
|
||||||
return request.config.getoption("--repos")
|
return request.config.getoption("--repos")
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def cluster(request):
|
|
||||||
return request.config.getoption("--cluster")
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,10 @@ apiVersion: batch/v1
|
||||||
kind: Job
|
kind: Job
|
||||||
metadata:
|
metadata:
|
||||||
name: xgboost-test
|
name: xgboost-test
|
||||||
|
labels:
|
||||||
|
app: xgboost-synthetic-test
|
||||||
spec:
|
spec:
|
||||||
|
backoffLimit: 1
|
||||||
template:
|
template:
|
||||||
metadata:
|
metadata:
|
||||||
annotations:
|
annotations:
|
||||||
|
|
@ -11,7 +14,7 @@ spec:
|
||||||
# in notebooks?
|
# in notebooks?
|
||||||
sidecar.istio.io/inject: "false"
|
sidecar.istio.io/inject: "false"
|
||||||
labels:
|
labels:
|
||||||
app: xgboost-synthetics-testing
|
app: xgboost-synthetic-test
|
||||||
spec:
|
spec:
|
||||||
restartPolicy: Never
|
restartPolicy: Never
|
||||||
securityContext:
|
securityContext:
|
||||||
|
|
@ -22,20 +25,11 @@ spec:
|
||||||
- /usr/local/bin/checkout_repos.sh
|
- /usr/local/bin/checkout_repos.sh
|
||||||
- --repos=kubeflow/examples@$(CHECK_TAG)
|
- --repos=kubeflow/examples@$(CHECK_TAG)
|
||||||
- --src_dir=/src
|
- --src_dir=/src
|
||||||
# TODO(jlewi): Do we need to do depth all here?
|
|
||||||
- --depth=all
|
|
||||||
name: checkout
|
name: checkout
|
||||||
# TODO(jlewi): Set in kustomization.yaml?
|
|
||||||
image: gcr.io/kubeflow-ci/test-worker:v20190802-c6f9140-e3b0c4
|
image: gcr.io/kubeflow-ci/test-worker:v20190802-c6f9140-e3b0c4
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- mountPath: /src
|
- mountPath: /src
|
||||||
name: src
|
name: src
|
||||||
env:
|
|
||||||
- name: CHECK_TAG
|
|
||||||
valueFrom:
|
|
||||||
configMapKeyRef:
|
|
||||||
name: xgb-notebooks-tests
|
|
||||||
key: checkTag
|
|
||||||
containers:
|
containers:
|
||||||
- name: executing-notebooks
|
- name: executing-notebooks
|
||||||
image: execute-image
|
image: execute-image
|
||||||
|
|
|
||||||
|
|
@ -1,37 +0,0 @@
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: Role
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: xgboost-synthetics-testing
|
|
||||||
name: xgboost-synthetics-testing-role
|
|
||||||
rules:
|
|
||||||
- apiGroups:
|
|
||||||
- ""
|
|
||||||
resources:
|
|
||||||
- pods
|
|
||||||
- pods/log
|
|
||||||
- secrets
|
|
||||||
- services
|
|
||||||
verbs:
|
|
||||||
- '*'
|
|
||||||
- apiGroups:
|
|
||||||
- ""
|
|
||||||
- apps
|
|
||||||
- extensions
|
|
||||||
resources:
|
|
||||||
- deployments
|
|
||||||
- replicasets
|
|
||||||
verbs:
|
|
||||||
- '*'
|
|
||||||
- apiGroups:
|
|
||||||
- kubeflow.org
|
|
||||||
resources:
|
|
||||||
- '*'
|
|
||||||
verbs:
|
|
||||||
- '*'
|
|
||||||
- apiGroups:
|
|
||||||
- batch
|
|
||||||
resources:
|
|
||||||
- jobs
|
|
||||||
verbs:
|
|
||||||
- '*'
|
|
||||||
|
|
@ -1,36 +1,27 @@
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import time
|
|
||||||
import uuid
|
import uuid
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from kubernetes import client as k8s_client
|
from kubernetes import client as k8s_client
|
||||||
from kubernetes.client import rest
|
from kubeflow.testing import argo_build_util
|
||||||
from kubeflow.testing import util
|
from kubeflow.testing import util
|
||||||
|
|
||||||
# TODO(jlewi): This test is currently failing because various things
|
# TODO(jlewi): This test is currently failing because various things
|
||||||
# need to be updated to work with 0.7.0. Until that's fixed we mark it
|
# need to be updated to work with 0.7.0. Until that's fixed we mark it
|
||||||
# as expected to fail so we can begin to get signal.
|
# as expected to fail on presubmits. We only mark it as expected to fail
|
||||||
@pytest.mark.xfail
|
# on presubmits because expected failures don't show up in test grid
|
||||||
def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pylint: disable=too-many-branches,too-many-statements
|
# and we want signal in postsubmits and periodics
|
||||||
repos, image, app_dir):
|
@pytest.mark.xfail(os.getenv("JOB_TYPE") == "presubmit", reason="Flaky")
|
||||||
|
def test_xgboost_synthetic(record_xml_attribute, name, namespace, # pylint: disable=too-many-branches,too-many-statements
|
||||||
|
repos, image):
|
||||||
'''Generate Job and submit.'''
|
'''Generate Job and submit.'''
|
||||||
util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")
|
util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")
|
||||||
app_dir = os.path.abspath(app_dir)
|
|
||||||
|
|
||||||
if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
|
util.maybe_activate_service_account()
|
||||||
util.run(['gcloud', 'auth', 'activate-service-account',
|
|
||||||
"--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS")],
|
|
||||||
cwd=app_dir)
|
|
||||||
|
|
||||||
# TODO(jlewi): We should just assume that kubeconfig has been set.
|
|
||||||
if cluster:
|
|
||||||
util.run(['gcloud', '--project=kubeflow-ci-deployment', 'container',
|
|
||||||
"clusters", "get-credentials", "--zone=us-east1-b", cluster],
|
|
||||||
cwd=app_dir)
|
|
||||||
|
|
||||||
with open("job.yaml") as hf:
|
with open("job.yaml") as hf:
|
||||||
job = yaml.load(hf)
|
job = yaml.load(hf)
|
||||||
|
|
@ -42,16 +33,7 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pyl
|
||||||
# See
|
# See
|
||||||
# https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
|
# https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
|
||||||
if not repos:
|
if not repos:
|
||||||
version = "@HEAD"
|
repos = argo_build_util.get_repo_from_prow_env()
|
||||||
if os.getenv("PULL_NUMBER"):
|
|
||||||
version = "@{0}:{1}".format(os.getenv("PULL_PULL_SHA"),
|
|
||||||
os.getenv("PULL_NUMBER"))
|
|
||||||
|
|
||||||
else:
|
|
||||||
if os.getenv("PULL_BASE_SHA"):
|
|
||||||
version = "@{0}".format(os.getenv("PULL_BASE_SHA"))
|
|
||||||
|
|
||||||
repos = "kubeflow/examples" + version
|
|
||||||
|
|
||||||
logging.info("Repos set to %s", repos)
|
logging.info("Repos set to %s", repos)
|
||||||
job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
|
job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
|
||||||
|
|
@ -69,6 +51,7 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pyl
|
||||||
job["metadata"]["name"] = ("xgboost-test-" +
|
job["metadata"]["name"] = ("xgboost-test-" +
|
||||||
datetime.datetime.now().strftime("%H%M%S")
|
datetime.datetime.now().strftime("%H%M%S")
|
||||||
+ "-" + uuid.uuid4().hex[0:3])
|
+ "-" + uuid.uuid4().hex[0:3])
|
||||||
|
name = job["metadata"]["name"]
|
||||||
|
|
||||||
job["metadata"]["namespace"] = namespace
|
job["metadata"]["namespace"] = namespace
|
||||||
|
|
||||||
|
|
@ -77,43 +60,24 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pyl
|
||||||
batch_api = k8s_client.BatchV1Api(api_client)
|
batch_api = k8s_client.BatchV1Api(api_client)
|
||||||
|
|
||||||
logging.info("Creating job:\n%s", yaml.dump(job))
|
logging.info("Creating job:\n%s", yaml.dump(job))
|
||||||
batch_api.create_namespaced_job(job["metadata"]["namespace"], job)
|
actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
|
||||||
logging.info("Created job %s in namespaces %s", name, namespace)
|
job)
|
||||||
|
logging.info("Created job %s.%s:\n%s", namespace, name,
|
||||||
|
yaml.safe_dump(actual_job))
|
||||||
|
|
||||||
# Wait for job.
|
final_job = util.wait_for_job(api_client, namespace, name,
|
||||||
end_time = datetime.datetime.now() + datetime.timedelta(
|
timeout=datetime.timedelta(minutes=30))
|
||||||
minutes=15)
|
|
||||||
|
|
||||||
last_condition = None
|
logging.info("Final job:\n%s", yaml.safe_dump(final_job))
|
||||||
while datetime.datetime.now() < end_time:
|
|
||||||
try:
|
|
||||||
job = batch_api.read_namespaced_job(name, namespace)
|
|
||||||
except rest.ApiException as e:
|
|
||||||
logging.error("There was a problem getting job %s.%s; %s",
|
|
||||||
namespace, name, e)
|
|
||||||
time.sleep(10)
|
|
||||||
continue
|
|
||||||
# ready_replicas could be None
|
|
||||||
if not job.conditions:
|
|
||||||
logging.info("Job missing condition")
|
|
||||||
time.sleep(10)
|
|
||||||
continue
|
|
||||||
|
|
||||||
last_condition = job.conditions[-1]
|
if not job.status.conditions:
|
||||||
if last_condition["type"] in ["Failed", "Complete"]:
|
raise RuntimeError("Job {0}.{1}; did not complete".format(namespace, name))
|
||||||
break
|
|
||||||
logging.info("Waiting for job %s.%s", namespace, name)
|
|
||||||
time.sleep(10)
|
|
||||||
|
|
||||||
logging.info("Final Job spec:\n%s", yaml.safe_dump(job))
|
last_condition = job.status.conditions[-1]
|
||||||
util.run(["kubectl", "describe", "job", "-n", namespace, name])
|
|
||||||
|
|
||||||
if not last_condition or last_condition["type"] not in ["Failed", "Complete"]:
|
if last_condition.type not in ["Complete"]:
|
||||||
logging.error("Timeout waiting for job %s.%s to finish.", namespace, name)
|
logging.error("Job didn't complete successfully")
|
||||||
raise RuntimeError("Job {0}.{1} has last condition {2} which is not "
|
raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
|
||||||
"Complete".format(namespace, name,
|
|
||||||
last_condition["type"] in ["Failed", "Complete"]))
|
|
||||||
assert last_condition["type"] == "Complete"
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logging.basicConfig(level=logging.INFO,
|
logging.basicConfig(level=logging.INFO,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue