Fix the xgboost_synthetic test so it actually runs and produces signal (#674)

* Fix the xgboost_synthetic test so it actually runs and produces signal

* The test wasn't actually running because we were passing arguments that
  were unknown to pytest

* Remove the old role.yaml; we don't use it anymore

* Wait for the Job to finish and properly report status; kubeflow/testing#514
  contains the new routine

* The test still isn't passing because of kubeflow/examples#673

* In addition we need to fix the auto deployments kubeflow/testing#444

Related to kubeflow/examples#665

* Fix lint.
This commit is contained in:
Jeremy Lewi 2019-11-04 21:56:38 -08:00 committed by Kubernetes Prow Robot
parent 452aa428b6
commit e2198ce1e8
5 changed files with 28 additions and 116 deletions

View File

@ -238,11 +238,9 @@ class Builder:
# Test xgboost # Test xgboost
step_name = "xgboost-synthetic" step_name = "xgboost-synthetic"
command = ["pytest", "xgboost_test.py", command = ["pytest", "xgboost_test.py",
# I think -s mean stdout/stderr will print out to aid in debugging.
# Failures still appear to be captured and stored in the junit file.
"-s",
# Increase the log level so that info level log statements show up. # Increase the log level so that info level log statements show up.
"--log-cli-level=info", "--log-cli-level=info",
"--log-cli-format='%(levelname)s|%(asctime)s|%(pathname)s|%(lineno)d| %(message)'",
# Test timeout in seconds. # Test timeout in seconds.
"--timeout=1800", "--timeout=1800",
"--junitxml=" + self.artifacts_dir + "/junit_xgboost-synthetic-test.xml", "--junitxml=" + self.artifacts_dir + "/junit_xgboost-synthetic-test.xml",

View File

@ -14,9 +14,6 @@ def pytest_addoption(parser):
parser.addoption( parser.addoption(
"--repos", help="The repos to checkout; leave blank to use defaults", "--repos", help="The repos to checkout; leave blank to use defaults",
type=str, default="") type=str, default="")
parser.addoption(
"--cluster", help="The cluster which the application is running in", type=str,
default="")
@pytest.fixture @pytest.fixture
def name(request): def name(request):
@ -33,7 +30,3 @@ def image(request):
@pytest.fixture @pytest.fixture
def repos(request): def repos(request):
return request.config.getoption("--repos") return request.config.getoption("--repos")
@pytest.fixture
def cluster(request):
return request.config.getoption("--cluster")

View File

@ -2,7 +2,10 @@ apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: xgboost-test name: xgboost-test
labels:
app: xgboost-synthetic-test
spec: spec:
backoffLimit: 1
template: template:
metadata: metadata:
annotations: annotations:
@ -11,7 +14,7 @@ spec:
# in notebooks? # in notebooks?
sidecar.istio.io/inject: "false" sidecar.istio.io/inject: "false"
labels: labels:
app: xgboost-synthetics-testing app: xgboost-synthetic-test
spec: spec:
restartPolicy: Never restartPolicy: Never
securityContext: securityContext:
@ -22,20 +25,11 @@ spec:
- /usr/local/bin/checkout_repos.sh - /usr/local/bin/checkout_repos.sh
- --repos=kubeflow/examples@$(CHECK_TAG) - --repos=kubeflow/examples@$(CHECK_TAG)
- --src_dir=/src - --src_dir=/src
# TODO(jlewi): Do we need to do depth all here?
- --depth=all
name: checkout name: checkout
# TODO(jlewi): Set in kustomization.yaml?
image: gcr.io/kubeflow-ci/test-worker:v20190802-c6f9140-e3b0c4 image: gcr.io/kubeflow-ci/test-worker:v20190802-c6f9140-e3b0c4
volumeMounts: volumeMounts:
- mountPath: /src - mountPath: /src
name: src name: src
env:
- name: CHECK_TAG
valueFrom:
configMapKeyRef:
name: xgb-notebooks-tests
key: checkTag
containers: containers:
- name: executing-notebooks - name: executing-notebooks
image: execute-image image: execute-image

View File

@ -1,37 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app: xgboost-synthetics-testing
name: xgboost-synthetics-testing-role
rules:
- apiGroups:
- ""
resources:
- pods
- pods/log
- secrets
- services
verbs:
- '*'
- apiGroups:
- ""
- apps
- extensions
resources:
- deployments
- replicasets
verbs:
- '*'
- apiGroups:
- kubeflow.org
resources:
- '*'
verbs:
- '*'
- apiGroups:
- batch
resources:
- jobs
verbs:
- '*'

View File

@ -1,36 +1,27 @@
import datetime import datetime
import logging import logging
import os import os
import time
import uuid import uuid
import yaml import yaml
import pytest import pytest
from kubernetes import client as k8s_client from kubernetes import client as k8s_client
from kubernetes.client import rest from kubeflow.testing import argo_build_util
from kubeflow.testing import util from kubeflow.testing import util
# TODO(jlewi): This test is currently failing because various things # TODO(jlewi): This test is currently failing because various things
# need to be updated to work with 0.7.0. Until that's fixed we mark it # need to be updated to work with 0.7.0. Until that's fixed we mark it
# as expected to fail so we can begin to get signal. # as expected to fail on presubmits. We only mark it as expected to fail
@pytest.mark.xfail # on presubmits because if expected failures don't show up in test grid
def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pylint: disable=too-many-branches,too-many-statements # and we want signal in postsubmits and periodics
repos, image, app_dir): @pytest.mark.xfail(os.getenv("JOB_TYPE") == "presubmit", reason="Flaky")
def test_xgboost_synthetic(record_xml_attribute, name, namespace, # pylint: disable=too-many-branches,too-many-statements
repos, image):
'''Generate Job and submit.''' '''Generate Job and submit.'''
util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic") util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")
app_dir = os.path.abspath(app_dir)
if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): util.maybe_activate_service_account()
util.run(['gcloud', 'auth', 'activate-service-account',
"--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS")],
cwd=app_dir)
# TODO(jlewi): We should just assume that kubeconfig has been set.
if cluster:
util.run(['gcloud', '--project=kubeflow-ci-deployment', 'container',
"clusters", "get-credentials", "--zone=us-east1-b", cluster],
cwd=app_dir)
with open("job.yaml") as hf: with open("job.yaml") as hf:
job = yaml.load(hf) job = yaml.load(hf)
@ -42,16 +33,7 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pyl
# See # See
# https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
if not repos: if not repos:
version = "@HEAD" repos = argo_build_util.get_repo_from_prow_env()
if os.getenv("PULL_NUMBER"):
version = "@{0}:{1}".format(os.getenv("PULL_PULL_SHA"),
os.getenv("PULL_NUMBER"))
else:
if os.getenv("PULL_BASE_SHA"):
version = "@{0}".format(os.getenv("PULL_BASE_SHA"))
repos = "kubeflow/examples" + version
logging.info("Repos set to %s", repos) logging.info("Repos set to %s", repos)
job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [ job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
@ -69,6 +51,7 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pyl
job["metadata"]["name"] = ("xgboost-test-" + job["metadata"]["name"] = ("xgboost-test-" +
datetime.datetime.now().strftime("%H%M%S") datetime.datetime.now().strftime("%H%M%S")
+ "-" + uuid.uuid4().hex[0:3]) + "-" + uuid.uuid4().hex[0:3])
name = job["metadata"]["name"]
job["metadata"]["namespace"] = namespace job["metadata"]["namespace"] = namespace
@ -77,43 +60,24 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pyl
batch_api = k8s_client.BatchV1Api(api_client) batch_api = k8s_client.BatchV1Api(api_client)
logging.info("Creating job:\n%s", yaml.dump(job)) logging.info("Creating job:\n%s", yaml.dump(job))
batch_api.create_namespaced_job(job["metadata"]["namespace"], job) actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
logging.info("Created job %s in namespaces %s", name, namespace) job)
logging.info("Created job %s.%s:\n%s", namespace, name,
yaml.safe_dump(actual_job))
# Wait for job. final_job = util.wait_for_job(api_client, namespace, name,
end_time = datetime.datetime.now() + datetime.timedelta( timeout=datetime.timedelta(minutes=30))
minutes=15)
last_condition = None logging.info("Final job:\n%s", yaml.safe_dump(final_job))
while datetime.datetime.now() < end_time:
try:
job = batch_api.read_namespaced_job(name, namespace)
except rest.ApiException as e:
logging.error("There was a problem getting job %s.%s; %s",
namespace, name, e)
time.sleep(10)
continue
# ready_replicas could be None
if not job.conditions:
logging.info("Job missing condition")
time.sleep(10)
continue
last_condition = job.conditions[-1] if not job.status.conditions:
if last_condition["type"] in ["Failed", "Complete"]: raise RuntimeError("Job {0}.{1}; did not complete".format(namespace, name))
break
logging.info("Waiting for job %s.%s", namespace, name)
time.sleep(10)
logging.info("Final Job spec:\n%s", yaml.safe_dump(job)) last_condition = job.status.conditions[-1]
util.run(["kubectl", "describe", "job", "-n", namespace, name])
if not last_condition or last_condition["type"] not in ["Failed", "Complete"]: if last_condition.type not in ["Complete"]:
logging.error("Timeout waiting for job %s.%s to finish.", namespace, name) logging.error("Job didn't complete successfully")
raise RuntimeError("Job {0}.{1} has last condition {2} which is not " raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
"Complete".format(namespace, name,
last_condition["type"] in ["Failed", "Complete"]))
assert last_condition["type"] == "Complete"
if __name__ == "__main__": if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, logging.basicConfig(level=logging.INFO,