Upgrade E2E tests for tkn CLI version 0.11 (#246)

* Now including logs for 'finally' tasks
* Add env var for SLEEP_BETWEEN_TEST_PHASES
* Add 'retry.yaml' to ignored tests
* Make node_selector more general
* Use pipelinerun status.conditions[].reason instead of .type
* Treat status "Completed" as "Succeeded"
2020-08-04 13:26:22 -07:00 · 2020-08-04 13:26:22 -07:00 · c37c600486
parent b28c2cbca6
commit c37c600486
16 changed files with 92 additions and 50 deletions
--- a/19
+++ b/19
@ -19,8 +19,8 @@ VENV ?= .venv
 export VIRTUAL_ENV := $(abspath ${VENV})
 export PATH := ${VIRTUAL_ENV}/bin:${PATH}

-TKN_PIPELINE_VERSION ?= "0.14"
-TKN_CLIENT_VERSION ?= "0.10"
+TKN_PIPELINE_VERSION ?= "0.14."
+TKN_CLIENT_VERSION ?= "0.11."

 .PHONY: help
 help: ## Display the Make targets
@ -30,6 +30,7 @@ help: ## Display the Make targets
 .PHONY: venv
 venv: $(VENV)/bin/activate ## Create and activate virtual environment
 $(VENV)/bin/activate: sdk/python/setup.py
+	@echo "VENV=$(VENV)"
 	@test -d $(VENV) || python3 -m venv $(VENV)
 	pip install -e sdk/python
 	@touch $(VENV)/bin/activate
@ -40,15 +41,23 @@ install: venv ## Install the kfp_tekton compiler in a virtual environment

 .PHONY: unit_test
 unit_test: venv ## Run compiler unit tests
+	@echo "=================================================================="
+	@echo "Optional environment variables to configure $@, examples:"
+	@sed -n -e 's/# *\(make $@ .*\)/  \1/p' sdk/python/tests/compiler/compiler_tests.py
+	@echo "=================================================================="
 	@sdk/python/tests/run_tests.sh
 	@echo "$@: OK"

 .PHONY: e2e_test
 e2e_test: venv ## Run compiler end-to-end tests (requires kubectl and tkn CLI)
-	@which kubectl || (echo "Missing kubectl CLI" && exit 1)
-	@test -z "${KUBECONFIG}" && echo "KUBECONFIG not set" && exit 1 || echo "${KUBECONFIG}"
+	@echo "=================================================================="
+	@echo "Optional environment variables to configure $@, examples:"
+	@sed -n -e 's/# *\(make $@ .*\)/  \1/p' sdk/python/tests/compiler/compiler_tests_e2e.py
+	@echo "=================================================================="
+	@which kubectl > /dev/null || (echo "Missing kubectl CLI" && exit 1)
+	@test -z "${KUBECONFIG}" && echo "KUBECONFIG not set" && exit 1 || echo "KUBECONFIG: ${KUBECONFIG}"
 	@kubectl version --short || (echo "Failed to access kubernetes cluster" && exit 1)
-	@which tkn && tkn version || (echo "Missing tkn CLI" && exit 1)
+	@which tkn > /dev/null || (echo "Missing tkn CLI" && exit 1)
 	@tkn version | grep "Pipeline version: v$${TKN_PIPELINE_VERSION}" || (echo "Required Tekton Pipeline version: $${TKN_PIPELINE_VERSION}" && exit 1)
 	@tkn version | grep "Client version: $${TKN_CLIENT_VERSION}" || (echo "Required tkn CLI version: $${TKN_CLIENT_VERSION}" && exit 1)
 	@sdk/python/tests/run_e2e_tests.sh
--- a/sdk/README.md
+++ b/sdk/README.md
@ -51,7 +51,7 @@ SDK provides a `TektonCompiler` and a `TektonClient`:

 - Python: `3.5.3` or later
 - Tekton: [`0.14.0`](https://github.com/tektoncd/pipeline/releases/tag/v0.14.0)
- - Tekton CLI: [`0.10.0`](https://github.com/tektoncd/cli/releases/tag/v0.10.0)
+ - Tekton CLI: [`0.11.0`](https://github.com/tektoncd/cli/releases/tag/v0.11.0)
 - Kubeflow Pipelines: [KFP with Tekton backend](/tekton_kfp_guide.md)

 Follow the instructions for [installing project prerequisites](/sdk/python/README.md#development-prerequisites)
--- a/sdk/python/tests/compiler/compiler_tests.py
+++ b/sdk/python/tests/compiler/compiler_tests.py
@ -31,9 +31,9 @@ from kfp_tekton import compiler
 # temporarily set this flag to True in order to (re)generate new "golden" YAML
 # files after making code changes that modify the expected YAML output.
 # to (re)generate all "golden" YAML files from the command line run:
-#   GENERATE_GOLDEN_YAML=True sdk/python/tests/run_tests.sh
+#    GENERATE_GOLDEN_YAML=True sdk/python/tests/run_tests.sh
 # or:
-#   make test GENERATE_GOLDEN_YAML=True
+#    make unit_test GENERATE_GOLDEN_YAML=True
 GENERATE_GOLDEN_YAML = env.get("GENERATE_GOLDEN_YAML", "False") == "True"

 if GENERATE_GOLDEN_YAML:
--- a/sdk/python/tests/compiler/compiler_tests_e2e.py
+++ b/sdk/python/tests/compiler/compiler_tests_e2e.py
@ -43,11 +43,11 @@ if env.get("TKN_PIPELINE_VERSION"):
    logging.warning("The environment variable 'TKN_PIPELINE_VERSION' was set to '{}'"
                    .format(TKN_PIPELINE_VERSION))

-# set or override th Tekton CLI version, default "0.10.x":
-#    TKN_CLIENT_VERSION=0.10 sdk/python/tests/run_e2e_tests.sh
+# set or override the Tekton CLI version, default "0.11.x":
+#    TKN_CLIENT_VERSION=0.11 sdk/python/tests/run_e2e_tests.sh
 # or:
-#    make e2e_test TKN_CLIENT_VERSION=0.10
-TKN_CLIENT_VERSION = env.get("TKN_CLIENT_VERSION", "0.10.")
+#    make e2e_test TKN_CLIENT_VERSION=0.11
+TKN_CLIENT_VERSION = env.get("TKN_CLIENT_VERSION", "0.11.")

 # let the user know the expected Tekton CLI version
 if env.get("TKN_CLIENT_VERSION"):
@ -115,6 +115,23 @@ if EXCLUDE_TESTS:
 # KEEP_FAILED_PIPELINERUNS = env.get("KEEP_FAILED_PIPELINERUNS", "False") == "True"


+# Set SLEEP_BETWEEN_TEST_PHASES=<seconds> (default: 5) to increase or decrease
+# the sleep time between the test stages of starting a pipelinerun, then first
+# attempting to get the pipelinerun status, and lastly to get the pipelinerun
+# logs. Increase the sleep for under-powered Kubernetes clusters. The minimal
+# recommended configuration for K8s clusters is 4 cores, 2 nodes, 16 GB RAM:
+#    SLEEP_BETWEEN_TEST_PHASES=10 sdk/python/tests/run_e2e_tests.sh
+# or:
+#    make e2e_test SLEEP_BETWEEN_TEST_PHASES=10
+SLEEP_BETWEEN_TEST_PHASES = int(env.get("SLEEP_BETWEEN_TEST_PHASES", "5"))
+
+# let the user know that the sleep time between test phases was overridden
+if env.get("SLEEP_BETWEEN_TEST_PHASES"):
+    logging.warning(
+        "The environment variable 'SLEEP_BETWEEN_TEST_PHASES' was set to '{}'. "
+        "Default is '5' seconds. Increasing this value should improve the test "
+        "success rate on a slow Kubernetes cluster.".format(SLEEP_BETWEEN_TEST_PHASES))
+
 # set RERUN_FAILED_TESTS_ONLY=True, to only re-run those E2E tests that failed in
 # the previous test run:
 #    RERUN_FAILED_TESTS_ONLY=True sdk/python/tests/run_e2e_tests.sh
@ -151,6 +168,7 @@ if RERUN_FAILED_TESTS_ONLY:
 ignored_yaml_files = [
    "big_data_passing.yaml",    # does not complete in a reasonable time frame
    "katib.yaml",               # service account needs Katib permission, takes too long doing 9 trail runs
+    "retry.yaml",               # designed to occasionally fail (randomly) if number of retries exceeded
    "timeout.yaml",             # random failure (by design) ... would need multiple golden log files to compare to
    "tolerations.yaml",         # designed to fail, test show only how to add the toleration to the pod
    "volume.yaml",              # need to rework the credentials part
@ -259,7 +277,7 @@ class TestCompilerE2E(unittest.TestCase):
        run(del_cmd.split(), capture_output=True, timeout=10, check=False)
        # TODO: find a better way than to sleep, but some PipelineRuns cannot
        #   be recreated right after the previous pipelineRun has been deleted
-        sleep(5)
+        sleep(SLEEP_BETWEEN_TEST_PHASES)

    def _start_pipelinerun(self, yaml_file):
        kube_cmd = "kubectl apply -f \"{}\" -n {}".format(yaml_file, namespace)
@ -268,12 +286,13 @@ class TestCompilerE2E(unittest.TestCase):
                         "Process returned non-zero exit code: {} -> {}".format(
                             kube_cmd, kube_proc.stderr))
        # TODO: find a better way than to sleep, but some PipelineRuns take longer
-        #   to be created and logs may not be available yet even with --follow
-        sleep(5)
+        #   to be created and logs may not be available yet even with --follow or
+        #   when attempting (and retrying) to get the pipelinerun status
+        sleep(SLEEP_BETWEEN_TEST_PHASES)

    def _get_pipelinerun_status(self, name, retries: int = 10) -> str:
        tkn_status_cmd = "tkn pipelinerun describe %s -n %s -o jsonpath=" \
-                         "'{.status.conditions[0].type}'" % (name, namespace)
+                         "'{.status.conditions[0].reason}'" % (name, namespace)
        status = "Unknown"
        for i in range(0, retries):
            try:
@ -281,9 +300,9 @@ class TestCompilerE2E(unittest.TestCase):
                                      timeout=10, check=False)
                if tkn_status_proc.returncode == 0:
                    status = tkn_status_proc.stdout.decode("utf-8").strip("'")
-                    if "Succeeded" in status or "Failed" in status:
+                    if status in ["Succeeded", "Completed", "Failed"]:
                        return status
-                    logging.warning("tkn pipeline '{}' {} ({}/{})".format(
+                    logging.debug("tkn pipeline '{}' status: {} ({}/{})".format(
                        name, status, i + 1, retries))
                else:
                    logging.error("Could not get pipelinerun status ({}/{}): {}".format(
@ -291,11 +310,11 @@ class TestCompilerE2E(unittest.TestCase):
            except SubprocessError:
                logging.exception("Error trying to get pipelinerun status ({}/{})".format(
                        i + 1, retries))
-            sleep(3)
+            sleep(SLEEP_BETWEEN_TEST_PHASES)
        return status

    def _get_pipelinerun_logs(self, name, timeout: int = 30) -> str:
-        sleep(10)  # if we don't wait, we often only get logs of some pipeline tasks
+        sleep(SLEEP_BETWEEN_TEST_PHASES * 2)  # if we don't wait, we often only get logs of some pipeline tasks
        tkn_logs_cmd = "tkn pipelinerun logs {} -n {}".format(name, namespace)
        tkn_logs_proc = run(tkn_logs_cmd.split(), capture_output=True, timeout=timeout, check=False)
        self.assertEqual(tkn_logs_proc.returncode, 0,
@ -311,9 +330,11 @@ class TestCompilerE2E(unittest.TestCase):
            try:
                with open(golden_log_file, 'r') as f:
                    golden_log = f.read()
+                sanitized_golden_log = self._sanitize_log(golden_log)
+                sanitized_test_log = self._sanitize_log(test_log)
                self.maxDiff = None
-                self.assertEqual(self._sanitize_log(golden_log),
-                                 self._sanitize_log(test_log),
+                self.assertEqual(sanitized_golden_log,
+                                 sanitized_test_log,
                                 msg="PipelineRun '{}' did not produce the expected "
                                     " log output: {}".format(name, golden_log_file))
            except FileNotFoundError:
@ -344,24 +365,26 @@ class TestCompilerE2E(unittest.TestCase):
        # server process receiving a termination signal
        lines_to_remove = [
            "Pipeline still running ...",
+            "Server is listening on",
            "Unknown signal terminated",
            r"Total: .+, Transferred: .+, Speed: .+",
            r"localhost:.*GET / HTTP",
        ]

-        # replacements are used on multi-line strings, so use '...\n' as opposed to '...$' to denote end of line
+        # replacements are used on multi-line strings, so '...\n' will be matched by '...$'
        replacements = [
            (r"(-[-0-9a-z]{3}-[-0-9a-z]{5})(?=[ -/\]\"]|$)", r"-XXX-XXXXX"),
            (r"uid:[0-9a-z]{8}(-[0-9a-z]{4}){3}-[0-9a-z]{12}",
             "uid:{}-{}-{}-{}-{}".format("X" * 8, "X" * 4, "X" * 4, "X" * 4, "X" * 12)),
+            (r"resourceVersion:[0-9]+ ", "resourceVersion:-------- "),
            (r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z", "DATETIME"),
            (r"{}".format("|".join(_MONTHNAMES)), "MONTH"),
            (r"{}".format("|".join(_DAYNAMES)), "DAY"),
            (r"\d", "-"),
+            (r" +$", ""),
+            (r" +\r", r"\n"),
            (r"^$\n", ""),
            (r"\n^$", ""),
-            (r" +\n", ""),
-            (r" +\r", ""),
        ]

        sanitized_log = log
@ -385,7 +408,7 @@ class TestCompilerE2E(unittest.TestCase):

    def _run_test__verify_pipelinerun_success(self, name):
        status = self._get_pipelinerun_status(name)
-        self.assertEqual("Succeeded", status)
+        self.assertIn(status, ["Succeeded", "Completed"])

    def _run_test__verify_pipelinerun_logs(self, name, log_file):
        test_log = self._get_pipelinerun_logs(name)
--- a/sdk/python/tests/compiler/testdata/affinity.py
+++ b/sdk/python/tests/compiler/testdata/affinity.py
@ -37,7 +37,7 @@ def affinity_pipeline(
            required_during_scheduling_ignored_during_execution=V1NodeSelector(
                node_selector_terms=[V1NodeSelectorTerm(
                    match_expressions=[V1NodeSelectorRequirement(
-                        key='beta.kubernetes.io/os',
+                        key='kubernetes.io/os',
                        operator='In',
                        values=['linux'])])])))
    echo_op().add_affinity(affinity)
--- a/sdk/python/tests/compiler/testdata/affinity.yaml
+++ b/sdk/python/tests/compiler/testdata/affinity.yaml
@ -43,7 +43,7 @@ spec:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
-              - key: beta.kubernetes.io/os
+              - key: kubernetes.io/os
                operator: In
                values:
                - linux
--- a/sdk/python/tests/compiler/testdata/basic_no_decorator.log
+++ b/sdk/python/tests/compiler/testdata/basic_no_decorator.log
@ -1,7 +1,8 @@
 [get-frequent : main] flies

 [save : main] Copying file:///tmp/results.txt...
-[save : main] / [0 files][    0.0 B/    7.0 B]                                                
-/ [1 files][    7.0 B/    7.0 B]                                                
+[save : main] / [0 files][    0.0 B/    7.0 B]                                                
/ [1 files][    7.0 B/    7.0 B]                                                
 [save : main] Operation completed over 1 objects/7.0 B.                                        

+[exiting : main] exit!
+
--- a/sdk/python/tests/compiler/testdata/basic_no_decorator.py
+++ b/sdk/python/tests/compiler/testdata/basic_no_decorator.py
@ -84,9 +84,7 @@ def save_most_frequent_word():
        output_path=output_path_param)
    saver.container.set_cpu_limit('0.5')
    # saver.container.set_gpu_limit('2')
-    saver.add_node_selector_constraint(
-        'failure-domain.beta.kubernetes.io/region',
-        'us-south')
+    saver.add_node_selector_constraint('kubernetes.io/os', 'linux')
    # saver.apply(gcp.use_tpu(tpu_cores=2, tpu_resource='v2', tf_version='1.12'))


--- a/sdk/python/tests/compiler/testdata/basic_no_decorator.yaml
+++ b/sdk/python/tests/compiler/testdata/basic_no_decorator.yaml
@ -96,4 +96,4 @@ spec:
  - pipelineTaskName: save
    taskPodTemplate:
      nodeSelector:
-        failure-domain.beta.kubernetes.io/region: us-south
+        kubernetes.io/os: linux
--- a/sdk/python/tests/compiler/testdata/compose.log
+++ b/sdk/python/tests/compiler/testdata/compose.log
@ -4,7 +4,7 @@
 [download : copy-artifacts] tar: removing leading '/' from member names
 [download : copy-artifacts] tekton/results/downloaded
 [download : copy-artifacts] `downloaded.tgz` -> `storage/mlpipeline/artifacts/download-and-save-most-frequent/download/downloaded.tgz`
-[download : copy-artifacts] Total: 0 B, Transferred: 200 B, Speed: 14.79 KiB/s
+[download : copy-artifacts] Total: 0 B, Transferred: 200 B, Speed: 14.17 KiB/s

 [get-frequent : main] your

@ -12,5 +12,9 @@
 [get-frequent : copy-artifacts] tar: removing leading '/' from member names
 [get-frequent : copy-artifacts] tekton/results/word
 [get-frequent : copy-artifacts] `word.tgz` -> `storage/mlpipeline/artifacts/download-and-save-most-frequent/get-frequent/word.tgz`
-[get-frequent : copy-artifacts] Total: 0 B, Transferred: 116 B, Speed: 9.30 KiB/s
+[get-frequent : copy-artifacts] Total: 0 B, Transferred: 117 B, Speed: 2.90 KiB/s
+
+[save : main] Copying file:///tmp/results.txt...
+[save : main] / [0 files][    0.0 B/    6.0 B]                                                
/ [1 files][    6.0 B/    6.0 B]                                                
+[save : main] Operation completed over 1 objects/6.0 B.                                        

--- a/sdk/python/tests/compiler/testdata/exit_handler.log
+++ b/sdk/python/tests/compiler/testdata/exit_handler.log
@ -4,8 +4,10 @@
 [gcs-download : copy-artifacts] tar: removing leading '/' from member names
 [gcs-download : copy-artifacts] tekton/results/data
 [gcs-download : copy-artifacts] `data.tgz` -> `storage/mlpipeline/artifacts/exit-handler/gcs-download/data.tgz`
-[gcs-download : copy-artifacts] Total: 0 B, Transferred: 195 B, Speed: 16.39 KiB/s
+[gcs-download : copy-artifacts] Total: 0 B, Transferred: 195 B, Speed: 14.59 KiB/s

 [echo-2 : main] With which he yoketh your rebellious necks Razeth your cities and subverts your towns And in a moment makes them desolate
 [echo-2 : main] 

+[echo : main] exit!
+
--- a/sdk/python/tests/compiler/testdata/node_selector.py
+++ b/sdk/python/tests/compiler/testdata/node_selector.py
@ -32,8 +32,8 @@ def node_selector_pipeline(
 ):
    """A pipeline with Node Selector"""
    echo_op().add_node_selector_constraint(
-        label_name='beta.kubernetes.io/instance-type',
-        value='b3c.4x16.encrypted')
+        label_name='kubernetes.io/os',
+        value='linux')


 if __name__ == '__main__':
--- a/sdk/python/tests/compiler/testdata/node_selector.yaml
+++ b/sdk/python/tests/compiler/testdata/node_selector.yaml
@ -39,4 +39,4 @@ spec:
  - pipelineTaskName: echo
    taskPodTemplate:
      nodeSelector:
-        beta.kubernetes.io/instance-type: b3c.4x16.encrypted
+        kubernetes.io/os: linux
--- a/sdk/python/tests/compiler/testdata/pipelineparams.log
+++ b/sdk/python/tests/compiler/testdata/pipelineparams.log
@ -4,12 +4,14 @@
 [download : main] '/tekton/results/downloaded-resultoutput' saved

 [download : copy-artifacts] Added `storage` successfully.
-[download : copy-artifacts] tar: removing leading '/' from member names
 [download : copy-artifacts] tekton/results/downloaded-resultoutput
+[download : copy-artifacts] tar: removing leading '/' from member names
 [download : copy-artifacts] `downloaded_resultOutput.tgz` -> `storage/mlpipeline/artifacts/pipelineparams/download/downloaded_resultOutput.tgz`
-[download : copy-artifacts] Total: 0 B, Transferred: 136 B, Speed: 12.10 KiB/s
+[download : copy-artifacts] Total: 0 B, Transferred: 138 B, Speed: 11.44 KiB/s

-[download : sidecar-echo] 2020/07/13 10:38:44 Server is listening on :5678
-[download : sidecar-echo] 2020/07/13 10:38:56 localhost:5678 127.0.0.1:55050 "GET / HTTP/1.1" 200 14 "Wget" 39.277µs
-[download : sidecar-echo] 2020/07/13 10:38:59 [ERR] Unknown signal terminated
+[download : sidecar-echo] 2020/07/31 09:29:31 Server is listening on :5678
+[download : sidecar-echo] 2020/07/31 09:29:43 localhost:5678 127.0.0.1:54892 "GET / HTTP/1.1" 200 14 "Wget" 15.199µs
+[download : sidecar-echo] 2020/07/31 09:29:45 [ERR] Unknown signal terminated
+
+[echo : main] pipelineParams: hello world

--- a/sdk/python/tests/compiler/testdata/retry.py
+++ b/sdk/python/tests/compiler/testdata/retry.py
@ -21,7 +21,8 @@ def random_failure_op(exit_codes):
        name='random_failure',
        image='python:alpine3.6',
        command=['python', '-c'],
-        arguments=['import random; import sys; exit_code = random.choice([int(i) for i in sys.argv[1].split(",")]); '
+        arguments=['import random; import sys; '
+                   'exit_code = random.choice([int(i) for i in sys.argv[1].split(",")]); '
                   'print(exit_code); sys.exit(exit_code)', exit_codes]
    )

--- a/sdk/python/tests/compiler/testdata/sidecar.log
+++ b/sdk/python/tests/compiler/testdata/sidecar.log
@ -7,9 +7,11 @@
 [download : copy-artifacts] tar: removing leading '/' from member names
 [download : copy-artifacts] tekton/results/downloaded
 [download : copy-artifacts] `downloaded.tgz` -> `storage/mlpipeline/artifacts/sidecar/download/downloaded.tgz`
-[download : copy-artifacts] Total: 0 B, Transferred: 128 B, Speed: 10.89 KiB/s
+[download : copy-artifacts] Total: 0 B, Transferred: 130 B, Speed: 12.63 KiB/s

-[download : sidecar-echo] 2020/07/13 10:40:23 Server is listening on :5678
-[download : sidecar-echo] 2020/07/13 10:40:35 localhost:5678 127.0.0.1:38196 "GET / HTTP/1.1" 200 14 "Wget" 38.547µs
-[download : sidecar-echo] 2020/07/13 10:40:37 [ERR] Unknown signal terminated
+[download : sidecar-echo] 2020/07/31 09:31:18 Server is listening on :5678
+[download : sidecar-echo] 2020/07/31 09:31:30 localhost:5678 127.0.0.1:38090 "GET / HTTP/1.1" 200 14 "Wget" 41.526µs
+[download : sidecar-echo] 2020/07/31 09:31:33 [ERR] Unknown signal terminated
+
+[echo : main] hello world