fix: add ready check, error log, and manual test (#87)

Signed-off-by: matttrach <matt.trachier@suse.com>
Author: Matt Trachier <matt.trachier@suse.com>, committed by GitHub
Date: 2025-04-11 09:27:58 -05:00
Commit: f00468fe00 (parent 8c1226f6b7)
GPG Key ID: B5690EEEBB952194
9 changed files with 190 additions and 14 deletions

.github/workflows/manual.yaml (new file)

@@ -0,0 +1,48 @@
name: manual

on: workflow_dispatch

env:
  AWS_REGION: us-west-2
  AWS_ROLE: arn:aws:iam::270074865685:role/terraform-module-ci-test
  GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
  ACME_SERVER_URL: https://acme-v02.api.letsencrypt.org/directory

permissions: write-all

jobs:
  test_TestOneBasic:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          token: ${{secrets.GITHUB_TOKEN}}
          fetch-depth: 0
      - id: aws-creds
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{env.AWS_ROLE}}
          role-session-name: ${{github.run_id}}
          aws-region: ${{env.AWS_REGION}}
          role-duration-seconds: 7200 # 2 hours
          output-credentials: true
      - name: install-nix
        run: |
          curl -L https://nixos.org/nix/install | sh
          source /home/runner/.nix-profile/etc/profile.d/nix.sh
          nix --version
          which nix
      - name: run_tests
        shell: '/home/runner/.nix-profile/bin/nix develop --ignore-environment --extra-experimental-features nix-command --extra-experimental-features flakes --keep HOME --keep SSH_AUTH_SOCK --keep IDENTIFIER --keep GITHUB_TOKEN --keep GITHUB_OWNER --keep ZONE --keep AWS_ROLE --keep AWS_REGION --keep AWS_DEFAULT_REGION --keep AWS_ACCESS_KEY_ID --keep AWS_SECRET_ACCESS_KEY --keep AWS_SESSION_TOKEN --keep UPDATECLI_GPGTOKEN --keep UPDATECLI_GITHUB_TOKEN --keep UPDATECLI_GITHUB_ACTOR --keep GPG_SIGNING_KEY --keep NIX_SSL_CERT_FILE --keep NIX_ENV_LOADED --keep TERM --command bash -e {0}'
        env:
          AWS_ACCESS_KEY_ID: ${{ steps.aws-creds.outputs.aws-access-key-id }}
          AWS_SECRET_ACCESS_KEY: ${{ steps.aws-creds.outputs.aws-secret-access-key }}
          AWS_SESSION_TOKEN: ${{ steps.aws-creds.outputs.aws-session-token }}
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
          GITHUB_OWNER: rancher
          IDENTIFIER: ${{github.run_id}}
          ZONE: ${{secrets.ZONE}}
          ACME_SERVER_URL: https://acme-v02.api.letsencrypt.org/directory
          RANCHER_INSECURE: false
        run: |
          ./run_tests.sh -t TestOneBasic
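This workflow only has a workflow_dispatch trigger, so it never runs on push or pull request; someone has to start it. A minimal sketch of kicking it off with the GitHub CLI (assuming gh is authenticated with rights on the repository carrying this workflow; the run id is a placeholder):

  gh workflow run manual.yaml --ref main
  gh run list --workflow=manual.yaml --limit 1   # find the new run's id
  gh run watch <run-id>                          # follow its progress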


@@ -224,6 +224,49 @@ jobs:
        run: |
          ./run_tests.sh -t TestDownstreamProd

  test_Cleanup:
    needs:
      - release
      - test_TestOneBasic
      - test_TestProdBasic
      - test_TestDownstreamBasic
      - test_TestDownstreamProd
    if: needs.release.outputs.release_pr
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          token: ${{secrets.GITHUB_TOKEN}}
          fetch-depth: 0
      - id: aws-creds
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{env.AWS_ROLE}}
          role-session-name: ${{github.run_id}}
          aws-region: ${{env.AWS_REGION}}
          role-duration-seconds: 7200 # 2 hours
          output-credentials: true
      - name: install-nix
        run: |
          curl -L https://nixos.org/nix/install | sh
          source /home/runner/.nix-profile/etc/profile.d/nix.sh
          nix --version
          which nix
      - name: cleanup
        shell: '/home/runner/.nix-profile/bin/nix develop --ignore-environment --extra-experimental-features nix-command --extra-experimental-features flakes --keep HOME --keep SSH_AUTH_SOCK --keep IDENTIFIER --keep GITHUB_TOKEN --keep GITHUB_OWNER --keep ZONE --keep AWS_ROLE --keep AWS_REGION --keep AWS_DEFAULT_REGION --keep AWS_ACCESS_KEY_ID --keep AWS_SECRET_ACCESS_KEY --keep AWS_SESSION_TOKEN --keep UPDATECLI_GPGTOKEN --keep UPDATECLI_GITHUB_TOKEN --keep UPDATECLI_GITHUB_ACTOR --keep GPG_SIGNING_KEY --keep NIX_SSL_CERT_FILE --keep NIX_ENV_LOADED --keep TERM --command bash -e {0}'
        env:
          AWS_ACCESS_KEY_ID: ${{ steps.aws-creds.outputs.aws-access-key-id }}
          AWS_SECRET_ACCESS_KEY: ${{ steps.aws-creds.outputs.aws-secret-access-key }}
          AWS_SESSION_TOKEN: ${{ steps.aws-creds.outputs.aws-session-token }}
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
          GITHUB_OWNER: rancher
          IDENTIFIER: ${{github.run_id}}
          ZONE: ${{secrets.ZONE}}
          ACME_SERVER_URL: https://acme-v02.api.letsencrypt.org/directory
          RANCHER_INSECURE: false
        run: |
          ./run_tests.sh -c $IDENTIFIER

  report:
    needs:
      - release
@@ -231,6 +274,7 @@ jobs:
      - test_TestProdBasic
      - test_TestDownstreamBasic
      - test_TestDownstreamProd
      - test_Cleanup
    if: success() && needs.release.outputs.release_pr # Ensure the test jobs succeeded, and that a release PR was created.
    runs-on: ubuntu-latest
    steps:


@@ -150,10 +150,10 @@ resource "helm_release" "rancher" {
   chart            = "${path.root}/rancher-${local.rancher_version}.tgz" # "${local.rancher_helm_repository}/${local.rancher_channel}/rancher-${local.rancher_version}.tgz"
   namespace        = "cattle-system"
   create_namespace = false
-  wait             = true
-  wait_for_jobs    = true
+  wait             = false
+  wait_for_jobs    = false
   force_update     = true
-  timeout          = 2400 # 40m
+  timeout          = 1800 # 30m

   set {
     name = "hostname"


@ -67,10 +67,10 @@ resource "helm_release" "rancher" {
chart = "${path.root}/rancher-${local.rancher_version}.tgz" #"${local.rancher_helm_repository}/${local.rancher_channel}/rancher-${local.rancher_version}.tgz"
namespace = "cattle-system"
create_namespace = false
wait = true
wait_for_jobs = true
wait = false
wait_for_jobs = false
force_update = true
timeout = 2400 # 40m
timeout = 1800 # 30m
set {
name = "hostname"
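With wait and wait_for_jobs switched off in both modules, terraform apply returns as soon as Helm has submitted the Rancher chart, and the reduced 30-minute timeout only bounds the install call itself; readiness is now verified afterwards from the test harness (CheckReady and the ready/log scripts further down). As a rough illustration of that kind of out-of-band check, not the script this commit ships, one could run against the generated kubeconfig (path illustrative, and the deployment name assumes the chart's default):

  KUBECONFIG=./kubeconfig kubectl wait --for=condition=Ready nodes --all --timeout=30m
  KUBECONFIG=./kubeconfig kubectl -n cattle-system rollout status deploy/rancher --timeout=30m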


@@ -3,21 +3,28 @@
 rerun_failed=false
 specific_test=""
 specific_package=""
+cleanup_id=""
-while getopts ":r:t:p:" opt; do
+while getopts ":r:t:p:c:" opt; do
   case $opt in
     r) rerun_failed=true ;;
     t) specific_test="$OPTARG" ;;
     p) specific_package="$OPTARG" ;;
+    c) cleanup_id="$OPTARG" ;;
     \?) cat <<EOT >&2 && exit 1 ;;
 Invalid option -$OPTARG, valid options are
 -r to re-run failed tests
 -t to specify a specific test (eg. TestBase)
 -p to specify a specific test package (eg. base)
+-c to run clean up only with the given id (eg. abc123)
 EOT
   esac
 done

+if [ -n "$cleanup_id" ]; then
+  export IDENTIFIER="$cleanup_id"
+fi
+
 run_tests() {
   local rerun=$1
   REPO_ROOT="$(git rev-parse --show-toplevel)"
@@ -99,13 +106,15 @@ if [ -z "$GITHUB_TOKEN" ]; then echo "GITHUB_TOKEN isn't set"; else echo "GITHUB_TOKEN is set"; fi
 if [ -z "$GITHUB_OWNER" ]; then echo "GITHUB_OWNER isn't set"; else echo "GITHUB_OWNER is set"; fi
 if [ -z "$ZONE" ]; then echo "ZONE isn't set"; else echo "ZONE is set"; fi

-# Run tests initially
-run_tests false
+if [ -z "$cleanup_id" ]; then
+  # Run tests initially
+  run_tests false

-# Check if we need to rerun failed tests
-if [ "$rerun_failed" = true ] && [ -f "/tmp/${IDENTIFIER}_failed_tests.txt" ]; then
-  echo "Rerunning failed tests..."
-  run_tests true
+  # Check if we need to rerun failed tests
+  if [ "$rerun_failed" = true ] && [ -f "/tmp/${IDENTIFIER}_failed_tests.txt" ]; then
+    echo "Rerunning failed tests..."
+    run_tests true
+  fi
 fi

 echo "Clearing leftovers with Id $IDENTIFIER in $AWS_REGION..."
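With the new -c flag the script can be run in cleanup-only mode: the test run is skipped, IDENTIFIER is overridden by the given id, and only the leftover-clearing step at the bottom executes. Illustrative invocations (the id abc123 comes from the script's own usage text):

  ./run_tests.sh -t TestOneBasic   # run a single named test, as the CI jobs do
  ./run_tests.sh -c abc123         # skip tests, only clear leftovers tagged with id abc123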

test/scripts/getLogs.sh (new executable file)

@@ -0,0 +1,10 @@
#!/bin/bash
kubectl get nodes || true
kubectl get all -A || true
kubectl get pods -A || true
sleep 10
kubectl get pods -A || true
sleep 10
kubectl get pods -A || true
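Every call in getLogs.sh is followed by || true so the dump keeps going on a half-broken cluster, and the repeated pod listings with sleeps give a crude view of pods changing state. If more detail were ever needed, a natural extension, sketched here and not part of this commit, would be to also dump events and recent Rancher pod logs (the app=rancher selector assumes the chart's default label):

  kubectl get events -A --sort-by=.lastTimestamp || true
  kubectl describe nodes || true
  kubectl logs -n cattle-system -l app=rancher --tail=50 || true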


@@ -1,4 +1,5 @@
 #!/bin/bash
+set -x
 JSONPATH="'{range .items[*]}
 {.metadata.name}{\"\\t\"} \
@@ -46,6 +47,13 @@ while notReady; do
   fi
 done
 echo "Nodes are ready..."
+echo "nodes..."
+kubectl get nodes || true
+echo "all..."
+kubectl get all -A || true
+echo "pods..."
+kubectl get pods -A || true
 exit 0
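set -x makes the polling loop trace every iteration into the job log, and the trailing kubectl listings run even when the loop exits normally, so the ready script now doubles as a diagnostic snapshot. For the node-readiness part alone, a compact alternative (assuming only the Ready condition matters) would be kubectl's built-in wait:

  kubectl wait --for=condition=Ready nodes --all --timeout=600s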


@@ -95,12 +95,19 @@ func TestOneBasic(t *testing.T) {
     _, err = terraform.InitAndApplyE(t, terraformOptions)
     if err != nil {
         t.Log("Test failed, tearing down...")
+        util.GetErrorLogs(t, testDir + "/kubeconfig")
         util.Teardown(t, testDir, terraformOptions, keyPair)
         os.Remove(exampleDir + ".terraform.lock.hcl")
         sshAgent.Stop()
         t.Fatalf("Error creating cluster: %s", err)
     }
-    t.Log("Test passed, tearing down...")
+    util.CheckReady(t, testDir + "/kubeconfig")
+    if t.Failed() {
+        t.Log("Test failed...")
+    } else {
+        t.Log("Test passed...")
+    }
     util.Teardown(t, testDir, terraformOptions, keyPair)
     os.Remove(exampleDir + ".terraform.lock.hcl")
     sshAgent.Stop()


@@ -16,6 +16,7 @@ import (
     aws "github.com/gruntwork-io/terratest/modules/aws"
     g "github.com/gruntwork-io/terratest/modules/git"
     "github.com/gruntwork-io/terratest/modules/random"
+    "github.com/gruntwork-io/terratest/modules/shell"
     "github.com/gruntwork-io/terratest/modules/terraform"
     "golang.org/x/oauth2"
 )
@@ -417,3 +418,52 @@ func Teardown(t *testing.T, directory string, options *terraform.Options, keyPai
     }
     aws.DeleteEC2KeyPair(t, keyPair)
 }
func GetErrorLogs(t *testing.T, kubeconfigPath string) {
    repoRoot, err := filepath.Abs(g.GetRepoRoot(t))
    if err != nil {
        t.Logf("Error getting git root directory: %v", err)
    }
    script, err := os.ReadFile(repoRoot + "/test/scripts/getLogs.sh")
    if err != nil {
        t.Logf("Error reading script: %v", err)
    }
    errorLogsScript := shell.Command{
        Command: "bash",
        Args:    []string{"-c", string(script)},
        Env: map[string]string{
            "KUBECONFIG": kubeconfigPath,
        },
    }
    out, err := shell.RunCommandAndGetOutputE(t, errorLogsScript)
    if err != nil {
        t.Logf("Error running script: %s", err)
    }
    t.Logf("Log script output: %s", out)
}

func CheckReady(t *testing.T, kubeconfigPath string) {
    repoRoot, err := filepath.Abs(g.GetRepoRoot(t))
    if err != nil {
        t.Logf("Error getting git root directory: %v", err)
        t.Fail()
    }
    script, err := os.ReadFile(repoRoot + "/test/scripts/readyNodes.sh")
    if err != nil {
        t.Logf("Error reading script: %v", err)
        t.Fail()
    }
    readyScript := shell.Command{
        Command: "bash",
        Args:    []string{"-c", string(script)},
        Env: map[string]string{
            "KUBECONFIG": kubeconfigPath,
        },
    }
    out, err := shell.RunCommandAndGetOutputE(t, readyScript)
    if err != nil {
        t.Logf("Error running script: %s", err)
        t.Fail()
    }
    t.Logf("Ready script output: %s", out)
}
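Both helpers simply run the repository scripts through bash with KUBECONFIG pointed at the test's kubeconfig and log the output, marking the test failed (CheckReady) or merely logging (GetErrorLogs) on error. The same checks can therefore be reproduced by hand against a cluster left behind by a failed run; <testDir> below is a placeholder for whatever directory the test created:

  KUBECONFIG=<testDir>/kubeconfig bash test/scripts/readyNodes.sh
  KUBECONFIG=<testDir>/kubeconfig bash test/scripts/getLogs.sh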