mirror of https://github.com/kubeflow/examples.git
Fix the xgboost_synthetic test so it actually runs and produces signal (#674)
* Fix the xgboost_synthetic test so it actually runs and produces signal.
* The test wasn't actually running because we were passing arguments that were unknown to pytest.
* Remove the old role.yaml; we don't use it anymore.
* Wait for the Job to finish and properly report status; kubeflow/testing#514 contains the new routine.
* The test still isn't passing because of kubeflow/examples#673.
* In addition we need to fix the auto deployments; kubeflow/testing#444.

Related to kubeflow/examples#665

* Fix lint.
This commit is contained in:
parent
452aa428b6
commit
e2198ce1e8
@@ -238,11 +238,9 @@ class Builder:
    # Test xgboost
    step_name = "xgboost-synthetic"
    command = ["pytest", "xgboost_test.py",
               # I think -s means stdout/stderr will print out to aid in debugging.
               # Failures still appear to be captured and stored in the junit file.
               "-s",
               # Increase the log level so that info level log statements show up.
               "--log-cli-level=info",
               "--log-cli-format='%(levelname)s|%(asctime)s|%(pathname)s|%(lineno)d| %(message)s'",
               # Test timeout in seconds.
               "--timeout=1800",
               "--junitxml=" + self.artifacts_dir + "/junit_xgboost-synthetic-test.xml",
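For reference, these flags map onto a plain command-line run. A minimal sketch, assuming the pytest-timeout plugin is installed and with an illustrative artifacts path:

    # Minimal sketch of an equivalent standalone run; the artifacts path is
    # illustrative and --timeout requires the pytest-timeout plugin.
    import subprocess

    subprocess.run([
        "pytest", "xgboost_test.py",
        "-s",                    # don't capture stdout/stderr; output streams live
        "--log-cli-level=info",  # surface info-level log statements in the console
        "--timeout=1800",        # per-test timeout in seconds
        "--junitxml=/tmp/artifacts/junit_xgboost-synthetic-test.xml",
    ], check=True)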
@@ -14,9 +14,6 @@ def pytest_addoption(parser):
  parser.addoption(
    "--repos", help="The repos to checkout; leave blank to use defaults",
    type=str, default="")
-  parser.addoption(
-    "--cluster", help="The cluster which the application is running in", type=str,
-    default="")

@pytest.fixture
def name(request):
@@ -33,7 +30,3 @@ def image(request):
@pytest.fixture
def repos(request):
  return request.config.getoption("--repos")
-
-@pytest.fixture
-def cluster(request):
-  return request.config.getoption("--cluster")
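The --cluster option and its fixture have to be dropped together because pytest wires them as a pair. A minimal sketch of that pattern, using the --repos option that stays:

    # conftest.py sketch: a CLI option and the fixture that exposes it to tests.
    import pytest

    def pytest_addoption(parser):
      parser.addoption(
        "--repos", help="The repos to checkout; leave blank to use defaults",
        type=str, default="")

    @pytest.fixture
    def repos(request):
      # Any test function that declares a `repos` parameter gets the CLI value.
      return request.config.getoption("--repos")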
@@ -2,7 +2,10 @@ apiVersion: batch/v1
kind: Job
metadata:
  name: xgboost-test
+  labels:
+    app: xgboost-synthetic-test
spec:
+  backoffLimit: 1
  template:
    metadata:
      annotations:
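The added app label is what makes the test Job's pods discoverable from tooling. A small sketch of looking them up, assuming a configured kubeconfig and an illustrative namespace:

    # Sketch: find the test's pods via the app label added above.
    # Assumes a reachable cluster; the namespace is illustrative.
    from kubernetes import client, config

    config.load_kube_config()
    pods = client.CoreV1Api().list_namespaced_pod(
        "kubeflow-test-infra", label_selector="app=xgboost-synthetic-test")
    for pod in pods.items:
      print(pod.metadata.name, pod.status.phase)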
@@ -11,7 +14,7 @@ spec:
        # in notebooks?
        sidecar.istio.io/inject: "false"
      labels:
-        app: xgboost-synthetics-testing
+        app: xgboost-synthetic-test
    spec:
      restartPolicy: Never
      securityContext:
@@ -22,20 +25,11 @@ spec:
        - /usr/local/bin/checkout_repos.sh
-        - --repos=kubeflow/examples@$(CHECK_TAG)
        - --src_dir=/src
        # TODO(jlewi): Do we need to do depth all here?
        - --depth=all
        name: checkout
        # TODO(jlewi): Set in kustomization.yaml?
        image: gcr.io/kubeflow-ci/test-worker:v20190802-c6f9140-e3b0c4
        volumeMounts:
        - mountPath: /src
          name: src
-        env:
-        - name: CHECK_TAG
-          valueFrom:
-            configMapKeyRef:
-              name: xgb-notebooks-tests
-              key: checkTag
      containers:
      - name: executing-notebooks
        image: execute-image
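The CHECK_TAG ConfigMap plumbing can go because the test rewrites the checkout command in-process before submitting the Job. A sketch of that flow; the @HEAD value is illustrative, since the real value comes from the Prow environment:

    # Sketch: load the manifest and rewrite the checkout init container's command.
    import yaml

    with open("job.yaml") as hf:
      job = yaml.safe_load(hf)

    job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
        "/usr/local/bin/checkout_repos.sh",
        "--repos=kubeflow/examples@HEAD",  # illustrative; derived from Prow env vars
        "--src_dir=/src",
        "--depth=all",
    ]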
@@ -1,37 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
-metadata:
-  labels:
-    app: xgboost-synthetics-testing
-  name: xgboost-synthetics-testing-role
-rules:
-- apiGroups:
-  - ""
-  resources:
-  - pods
-  - pods/log
-  - secrets
-  - services
-  verbs:
-  - '*'
-- apiGroups:
-  - ""
-  - apps
-  - extensions
-  resources:
-  - deployments
-  - replicasets
-  verbs:
-  - '*'
-- apiGroups:
-  - kubeflow.org
-  resources:
-  - '*'
-  verbs:
-  - '*'
-- apiGroups:
-  - batch
-  resources:
-  - jobs
-  verbs:
-  - '*'
@@ -1,36 +1,27 @@
import datetime
import logging
import os
-import time
import uuid
import yaml

import pytest

from kubernetes import client as k8s_client
-from kubernetes.client import rest
+from kubeflow.testing import argo_build_util
from kubeflow.testing import util

# TODO(jlewi): This test is currently failing because various things
# need to be updated to work with 0.7.0. Until that's fixed we mark it
-# as expected to fail so we can begin to get signal.
-@pytest.mark.xfail
-def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pylint: disable=too-many-branches,too-many-statements
-                           repos, image, app_dir):
+# as expected to fail on presubmits. We only mark it as expected to fail
+# on presubmits because expected failures don't show up in test grid
+# and we want signal in postsubmits and periodics.
+@pytest.mark.xfail(os.getenv("JOB_TYPE") == "presubmit", reason="Flaky")
+def test_xgboost_synthetic(record_xml_attribute, name, namespace, # pylint: disable=too-many-branches,too-many-statements
+                           repos, image):
  '''Generate Job and submit.'''
  util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")
-  app_dir = os.path.abspath(app_dir)

-  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
-    util.run(['gcloud', 'auth', 'activate-service-account',
-              "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS")],
-             cwd=app_dir)
-
-  # TODO(jlewi): We should just assume that kubeconfig has been set.
-  if cluster:
-    util.run(['gcloud', '--project=kubeflow-ci-deployment', 'container',
-              "clusters", "get-credentials", "--zone=us-east1-b", cluster],
-             cwd=app_dir)
+  util.maybe_activate_service_account()

  with open("job.yaml") as hf:
    job = yaml.load(hf)
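The conditional form of xfail is what restores signal outside presubmits. A minimal sketch of the semantics, assuming Prow's JOB_TYPE environment variable:

    import os
    import pytest

    # xfail applies only when the condition is true: presubmit runs report a
    # failure as "expected" (soft signal), while postsubmits and periodics
    # fail loudly and therefore show up in test grid.
    @pytest.mark.xfail(os.getenv("JOB_TYPE") == "presubmit", reason="Flaky")
    def test_example():
      assert True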
@@ -42,16 +33,7 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pylint: disable=too-many-branches,too-many-statements
  # See
  # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
  if not repos:
-    version = "@HEAD"
-    if os.getenv("PULL_NUMBER"):
-      version = "@{0}:{1}".format(os.getenv("PULL_PULL_SHA"),
-                                  os.getenv("PULL_NUMBER"))
-
-    else:
-      if os.getenv("PULL_BASE_SHA"):
-        version = "@{0}".format(os.getenv("PULL_BASE_SHA"))
-
-    repos = "kubeflow/examples" + version
+    repos = argo_build_util.get_repo_from_prow_env()

  logging.info("Repos set to %s", repos)
  job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
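argo_build_util.get_repo_from_prow_env lives in kubeflow/testing, so the test no longer duplicates Prow environment parsing. A rough sketch of what such a helper computes, assuming it centralizes the inline logic removed above; the REPO_OWNER/REPO_NAME defaults are illustrative:

    # Rough sketch only; see kubeflow/testing for the real implementation.
    import os

    def get_repo_from_prow_env_sketch():
      repo = "{0}/{1}".format(os.getenv("REPO_OWNER", "kubeflow"),
                              os.getenv("REPO_NAME", "examples"))
      if os.getenv("PULL_NUMBER"):
        # Presubmit: check out the PR's head sha.
        return "{0}@{1}:{2}".format(repo, os.getenv("PULL_PULL_SHA"),
                                    os.getenv("PULL_NUMBER"))
      if os.getenv("PULL_BASE_SHA"):
        # Postsubmit: check out the sha that was merged.
        return "{0}@{1}".format(repo, os.getenv("PULL_BASE_SHA"))
      return repo + "@HEAD"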
@@ -69,6 +51,7 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pylint: disable=too-many-branches,too-many-statements
  job["metadata"]["name"] = ("xgboost-test-" +
                             datetime.datetime.now().strftime("%H%M%S")
                             + "-" + uuid.uuid4().hex[0:3])
+  name = job["metadata"]["name"]

  job["metadata"]["namespace"] = namespace
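The timestamp-plus-hex suffix keeps repeated runs from colliding on Job names. A quick illustration:

    import datetime
    import uuid

    # e.g. "xgboost-test-142503-9af": time of day plus three hex chars of a
    # uuid4, comfortably within Kubernetes name-length limits.
    name = ("xgboost-test-" + datetime.datetime.now().strftime("%H%M%S")
            + "-" + uuid.uuid4().hex[0:3])
    print(name)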
@@ -77,43 +60,24 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pylint: disable=too-many-branches,too-many-statements
  batch_api = k8s_client.BatchV1Api(api_client)

  logging.info("Creating job:\n%s", yaml.dump(job))
-  batch_api.create_namespaced_job(job["metadata"]["namespace"], job)
-  logging.info("Created job %s in namespaces %s", name, namespace)
+  actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
+                                               job)
+  logging.info("Created job %s.%s:\n%s", namespace, name,
+               yaml.safe_dump(actual_job))

  # Wait for job.
-  end_time = datetime.datetime.now() + datetime.timedelta(
-    minutes=15)
+  final_job = util.wait_for_job(api_client, namespace, name,
+                                timeout=datetime.timedelta(minutes=30))

-  last_condition = None
-  while datetime.datetime.now() < end_time:
-    try:
-      job = batch_api.read_namespaced_job(name, namespace)
-    except rest.ApiException as e:
-      logging.error("There was a problem getting job %s.%s; %s",
-                    namespace, name, e)
-      time.sleep(10)
-      continue
-    # ready_replicas could be None
-    if not job.conditions:
-      logging.info("Job missing condition")
-      time.sleep(10)
-      continue
+  logging.info("Final job:\n%s", yaml.safe_dump(final_job))

-    last_condition = job.conditions[-1]
-    if last_condition["type"] in ["Failed", "Complete"]:
-      break
-    logging.info("Waiting for job %s.%s", namespace, name)
-    time.sleep(10)
+  if not final_job.status.conditions:
+    raise RuntimeError("Job {0}.{1}; did not complete".format(namespace, name))

-  logging.info("Final Job spec:\n%s", yaml.safe_dump(job))
+  util.run(["kubectl", "describe", "job", "-n", namespace, name])
+  last_condition = final_job.status.conditions[-1]

-  if not last_condition or last_condition["type"] not in ["Failed", "Complete"]:
-    logging.error("Timeout waiting for job %s.%s to finish.", namespace, name)
-    raise RuntimeError("Job {0}.{1} has last condition {2} which is not "
-                       "Complete".format(namespace, name,
-                                         last_condition["type"]))
-  assert last_condition["type"] == "Complete"
+  if last_condition.type not in ["Complete"]:
+    logging.error("Job didn't complete successfully")
+    raise RuntimeError("Job {0}.{1} failed".format(namespace, name))

if __name__ == "__main__":
  logging.basicConfig(level=logging.INFO,
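util.wait_for_job comes from kubeflow/testing (the new routine from kubeflow/testing#514). A rough sketch of the polling it centralizes, assuming the same Batch API shapes used above; this is not the actual implementation:

    import datetime
    import time

    from kubernetes import client as k8s_client

    def wait_for_job_sketch(api_client, namespace, name,
                            timeout=datetime.timedelta(minutes=30)):
      """Poll a Job until it reports a Complete or Failed condition."""
      batch_api = k8s_client.BatchV1Api(api_client)
      end_time = datetime.datetime.now() + timeout
      while datetime.datetime.now() < end_time:
        job = batch_api.read_namespaced_job(name, namespace)
        conditions = job.status.conditions or []
        if any(c.type in ("Complete", "Failed") for c in conditions):
          return job
        time.sleep(10)
      raise RuntimeError("Timed out waiting for job {0}.{1}".format(namespace, name))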