tests: add test for bare-metal with ipv6

IPv6 brings some new complexities, particularly around IPAM.
justinsb 2024-11-11 11:12:27 -05:00
parent 5f564fe1ab
commit 6b88da4376
4 changed files with 377 additions and 28 deletions

View File

@@ -46,3 +46,29 @@ jobs:
       with:
         name: tests-e2e-scenarios-bare-metal
         path: /tmp/artifacts/
+
+  tests-e2e-scenarios-bare-metal-ipv6:
+    runs-on: ubuntu-24.04
+    timeout-minutes: 70
+
+    steps:
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+      with:
+        path: ${{ env.GOPATH }}/src/k8s.io/kops
+    - name: Set up go
+      uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed
+      with:
+        go-version-file: '${{ env.GOPATH }}/src/k8s.io/kops/go.mod'
+    - name: tests/e2e/scenarios/bare-metal/scenario-ipv6
+      working-directory: ${{ env.GOPATH }}/src/k8s.io/kops
+      run: |
+        timeout 60m tests/e2e/scenarios/bare-metal/scenario-ipv6
+      env:
+        ARTIFACTS: /tmp/artifacts
+
+    - name: Archive production artifacts
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: tests-e2e-scenarios-bare-metal-ipv6
+        path: /tmp/artifacts/

View File

@@ -38,6 +38,8 @@ sudo ip link del dev tap-vm0 || true
 sudo ip link del dev tap-vm1 || true
 sudo ip link del dev tap-vm2 || true
 
+sudo ip link del dev br0 || true
+
 rm -rf .build/vm0
 rm -rf .build/vm1
 rm -rf .build/vm2

View File

@@ -40,30 +40,39 @@ function cleanup() {
   fi
 }
 
-if [[ -z "${SKIP_CLEANUP:-}" ]]; then
-  trap cleanup EXIT
-fi
+trap cleanup EXIT
 
 # Create the directory that will back our mock s3 storage
 rm -rf ${WORKDIR}/s3
 mkdir -p ${WORKDIR}/s3/
 
+IPV4_PREFIX=10.123.45.
+
+VM0_IP=${IPV4_PREFIX}10
+VM1_IP=${IPV4_PREFIX}11
+VM2_IP=${IPV4_PREFIX}12
+
 # Start our VMs
 ${REPO_ROOT}/tests/e2e/scenarios/bare-metal/start-vms
 
+# Start an SSH agent; enroll assumes SSH connectivity to the VMs with the key in the agent
+eval $(ssh-agent)
+ssh-add ${REPO_ROOT}/.build/.ssh/id_ed25519
+
 . hack/dev-build-metal.sh
 
 echo "Waiting 10 seconds for VMs to start"
 sleep 10
 
 # Remove from known-hosts in case of reuse
-ssh-keygen -f ~/.ssh/known_hosts -R 10.123.45.10 || true
-ssh-keygen -f ~/.ssh/known_hosts -R 10.123.45.11 || true
-ssh-keygen -f ~/.ssh/known_hosts -R 10.123.45.12 || true
+ssh-keygen -f ~/.ssh/known_hosts -R ${VM0_IP} || true
+ssh-keygen -f ~/.ssh/known_hosts -R ${VM1_IP} || true
+ssh-keygen -f ~/.ssh/known_hosts -R ${VM2_IP} || true
 
-ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 root@10.123.45.10 uptime
-ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 root@10.123.45.11 uptime
-ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 root@10.123.45.12 uptime
+# Check SSH is working and accept the host keys
+ssh -o StrictHostKeyChecking=accept-new root@${VM0_IP} uptime
+ssh -o StrictHostKeyChecking=accept-new root@${VM1_IP} uptime
+ssh -o StrictHostKeyChecking=accept-new root@${VM2_IP} uptime
 
 cd ${REPO_ROOT}
@@ -93,7 +102,7 @@ ${KOPS} create cluster --cloud=metal metal.k8s.local --zones main --networking c
 
 # Set the IP ingress, required for metal cloud
 # TODO: is this the best option?
-${KOPS} edit cluster metal.k8s.local --set spec.api.publicName=10.123.45.10
+${KOPS} edit cluster metal.k8s.local --set spec.api.publicName=${VM0_IP}
 
 # Use latest etcd-manager image (while we're adding features)
 #${KOPS} edit cluster metal.k8s.local --set 'spec.etcdClusters[*].manager.image=us-central1-docker.pkg.dev/k8s-staging-images/etcd-manager/etcd-manager-slim:v3.0.20250628-7-ga7be11fb'
@@ -114,28 +123,24 @@ ${KOPS} get ig --name metal.k8s.local -oyaml
 ${KOPS} update cluster metal.k8s.local
 ${KOPS} update cluster metal.k8s.local --yes --admin
 
-# Start an SSH agent; enroll assumes SSH connectivity to the VMs with the key in the agent
-eval $(ssh-agent)
-ssh-add ${REPO_ROOT}/.build/.ssh/id_ed25519
-
 # Enroll the control-plane VM
-${KOPS} toolbox enroll --cluster metal.k8s.local --instance-group control-plane-main --host 10.123.45.10 --v=2
+${KOPS} toolbox enroll --cluster metal.k8s.local --instance-group control-plane-main --host ${VM0_IP} --v=2
 
 # Manual creation of "volumes" for etcd, and setting up peer nodes
-cat <<EOF | ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 root@10.123.45.10 tee -a /etc/hosts
+cat <<EOF | ssh root@${VM0_IP} tee -a /etc/hosts
 # Hosts added for etcd discovery
-10.123.45.10 node0.main.metal.k8s.local
-10.123.45.10 node0.events.metal.k8s.local
+${VM0_IP} node0.main.metal.k8s.local
+${VM0_IP} node0.events.metal.k8s.local
 EOF
 
-ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 root@10.123.45.10 cat /etc/hosts
+ssh root@${VM0_IP} cat /etc/hosts
 
-ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 root@10.123.45.10 mkdir -p /mnt/disks/metal.k8s.local--main--0/mnt
-ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 root@10.123.45.10 touch /mnt/disks/metal.k8s.local--main--0/mnt/please-create-new-cluster
-ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 root@10.123.45.10 mkdir -p /mnt/disks/metal.k8s.local--events--0/mnt
-ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 root@10.123.45.10 touch /mnt/disks/metal.k8s.local--events--0/mnt/please-create-new-cluster
+ssh root@${VM0_IP} mkdir -p /mnt/disks/metal.k8s.local--main--0/mnt
+ssh root@${VM0_IP} touch /mnt/disks/metal.k8s.local--main--0/mnt/please-create-new-cluster
+ssh root@${VM0_IP} mkdir -p /mnt/disks/metal.k8s.local--events--0/mnt
+ssh root@${VM0_IP} touch /mnt/disks/metal.k8s.local--events--0/mnt/please-create-new-cluster
 
 echo "Waiting for kube to start"
@@ -204,18 +209,18 @@ function enroll_node() {
 
   # Manual "discovery" for control-plane endpoints
   # TODO: Replace with well-known IP
-  cat <<EOF | ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 root@${node_ip} tee -a /etc/hosts
+  cat <<EOF | ssh root@${node_ip} tee -a /etc/hosts
 # Hosts added for leader discovery
-10.123.45.10 kops-controller.internal.metal.k8s.local
-10.123.45.10 api.internal.metal.k8s.local
+${VM0_IP} kops-controller.internal.metal.k8s.local
+${VM0_IP} api.internal.metal.k8s.local
 EOF
 
   timeout 10m ${KOPS} toolbox enroll --cluster metal.k8s.local --instance-group nodes-main --host ${node_ip} --v=2
 }
 
-enroll_node 10.123.45.11
-enroll_node 10.123.45.12
+enroll_node ${VM1_IP}
+enroll_node ${VM2_IP}
 
 echo "Waiting 30 seconds for nodes to be ready"
 sleep 30

View File

@@ -0,0 +1,316 @@
#!/usr/bin/env bash
# Copyright 2024 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -o errexit
set -o nounset
set -o pipefail
set -o xtrace
REPO_ROOT=$(git rev-parse --show-toplevel)
cd ${REPO_ROOT}
WORKDIR=${REPO_ROOT}/.build/
BINDIR=${WORKDIR}/bin
mkdir -p "${BINDIR}"
go build -o ${BINDIR}/kops ./cmd/kops
export KOPS=${BINDIR}/kops
function cleanup() {
  echo "running dump-artifacts"
  ${REPO_ROOT}/tests/e2e/scenarios/bare-metal/dump-artifacts || true

  if [[ -z "${SKIP_CLEANUP:-}" ]]; then
    echo "running cleanup"
    ${REPO_ROOT}/tests/e2e/scenarios/bare-metal/cleanup || true
  fi
}

trap cleanup EXIT
# Create the directory that will back our mock s3 storage
rm -rf ${WORKDIR}/s3
mkdir -p ${WORKDIR}/s3/
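
# IPv6 address plan: each VM gets a /96 out of the ULA range fd00:10:123:45::/64,
# used both for the node's own address and as the pod CIDR it advertises via radvd.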
IPV6_PREFIX=fd00:10:123:45:
IPV4_PREFIX=10.123.45.
VM0_IP=${IPV4_PREFIX}10
VM1_IP=${IPV4_PREFIX}11
VM2_IP=${IPV4_PREFIX}12
VM0_IPV6=${IPV6_PREFIX}a::
VM1_IPV6=${IPV6_PREFIX}b::
VM2_IPV6=${IPV6_PREFIX}c::
VM0_POD_CIDR=${IPV6_PREFIX}a::/96
VM1_POD_CIDR=${IPV6_PREFIX}b::/96
VM2_POD_CIDR=${IPV6_PREFIX}c::/96
# Start our VMs
${REPO_ROOT}/tests/e2e/scenarios/bare-metal/start-vms
# Start an SSH agent; enroll assumes SSH connectivity to the VMs with the key in the agent
eval $(ssh-agent)
ssh-add ${REPO_ROOT}/.build/.ssh/id_ed25519
. hack/dev-build-metal.sh
echo "Waiting 10 seconds for VMs to start"
sleep 10
# Remove from known-hosts in case of reuse
ssh-keygen -f ~/.ssh/known_hosts -R 10.123.45.10 || true
ssh-keygen -f ~/.ssh/known_hosts -R 10.123.45.11 || true
ssh-keygen -f ~/.ssh/known_hosts -R 10.123.45.12 || true
# Check SSH is working and accept the host keys
ssh -o StrictHostKeyChecking=accept-new root@${VM0_IP} uptime
ssh -o StrictHostKeyChecking=accept-new root@${VM1_IP} uptime
ssh -o StrictHostKeyChecking=accept-new root@${VM2_IP} uptime
cd ${REPO_ROOT}
# Configure IPv6 networking
function configure_ipv6() {
  local hostname=$1
  local node_ip=$2
  local ipv6_ip=$3
  local ipv6_range=$4
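
  # Assign the node's address (its /96 pod range) to enp0s3 and send all other IPv6
  # traffic on-link over that interface, which connects to br0 on the host.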
  ssh root@${node_ip} ip link
  ssh root@${node_ip} ip -6 addr add ${ipv6_range} dev enp0s3
  ssh root@${node_ip} ip -6 route add default dev enp0s3

  cat <<EOF | ssh root@${node_ip} tee /etc/resolv.conf
nameserver 8.8.8.8
nameserver 8.8.4.4
nameserver 2001:4860:4860::8888
nameserver 2001:4860:4860::8844
EOF

  # Ensure /etc/hosts has an entry for the host
  cat <<EOF | ssh root@${node_ip} tee -a /etc/hosts
::1 ${hostname} localhost
EOF
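
  # Advertise this node's pod range to the other machines on the bridge via router
  # advertisements; AdvDefaultLifetime 0 stops peers from treating the node as a
  # default router, so they only learn the /96 route.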
  cat << EOF | ssh root@${node_ip} tee /etc/radvd.conf
interface enp0s3
{
  AdvSendAdvert on;
  AdvDefaultLifetime 0; # Not a default router

  route ${ipv6_range}
  {
  };
};
EOF

  ssh root@${node_ip} apt-get update
  ssh root@${node_ip} apt-get install -y radvd
  ssh root@${node_ip} systemctl restart radvd
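
  # accept_ra=2 lets the kernel process router advertisements even with forwarding
  # enabled; accept_ra_rt_info_max_plen=96 allows installing the /96 routes that the
  # other nodes advertise.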
  ssh root@${node_ip} sysctl net.ipv6.conf.enp0s3.accept_ra=2
  ssh root@${node_ip} sysctl net.ipv6.conf.enp0s3.accept_ra_rt_info_max_plen=96

  ssh root@${node_ip} ip -6 addr
  ssh root@${node_ip} ip -6 route

  #sudo ip -6 route add ${ipv6_range} dev br0 via ${ipv6_ip}
}
# Configure our IPv6 addresses on the bridge
sudo ip address add ${IPV6_PREFIX}0::/96 dev br0 || true
sudo sysctl net.ipv6.conf.br0.accept_ra=2
sudo sysctl net.ipv6.conf.br0.accept_ra_rt_info_max_plen=96
# Configure the VMs on the bridge
configure_ipv6 vm0 ${VM0_IP} ${VM0_IPV6} ${VM0_POD_CIDR}
configure_ipv6 vm1 ${VM1_IP} ${VM1_IPV6} ${VM1_POD_CIDR}
configure_ipv6 vm2 ${VM2_IP} ${VM2_IPV6} ${VM2_POD_CIDR}
ip -6 route
# Check the VMs are OK
ping6 -c 1 ${VM0_IPV6}
ping6 -c 1 ${VM1_IPV6}
ping6 -c 1 ${VM2_IPV6}
# Enable feature flag for bare metal
export KOPS_FEATURE_FLAGS=Metal
# Set up the AWS credentials
export AWS_SECRET_ACCESS_KEY=secret
export AWS_ACCESS_KEY_ID=accesskey
export AWS_ENDPOINT_URL=http://10.123.45.1:8443
export AWS_REGION=us-east-1
export S3_ENDPOINT=${AWS_ENDPOINT_URL}
export S3_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
export S3_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
# Create the state-store bucket in our mock s3 server
export KOPS_STATE_STORE=s3://kops-state-store/
aws --version
aws s3 ls s3://kops-state-store || aws s3 mb s3://kops-state-store
export CLUSTER_NAME=metalipv6.k8s.local
# List clusters (there should not be any yet)
${KOPS} get cluster || true
# Create a cluster
${KOPS} create cluster --cloud=metal ${CLUSTER_NAME} --zones main --networking cni --ipv6
# Set the IP ingress, required for metal cloud
# TODO: is this the best option?
${KOPS} edit cluster ${CLUSTER_NAME} --set spec.api.publicName=${VM0_IPV6}
# Use latest etcd-manager image (while we're adding features)
${KOPS} edit cluster ${CLUSTER_NAME} --set 'spec.etcdClusters[*].manager.image=us-central1-docker.pkg.dev/k8s-staging-images/etcd-manager/etcd-manager-static:latest'
# Use 1.31 kubernetes so we get kube-apiserver fixes
export KOPS_RUN_TOO_NEW_VERSION=1
"${KOPS}" edit cluster ${CLUSTER_NAME} "--set=cluster.spec.kubernetesVersion=1.31.0"
# List clusters
${KOPS} get cluster
${KOPS} get cluster -oyaml
# List instance groups
${KOPS} get ig --name ${CLUSTER_NAME}
${KOPS} get ig --name ${CLUSTER_NAME} -oyaml
# Apply basic configuration
${KOPS} update cluster ${CLUSTER_NAME}
${KOPS} update cluster ${CLUSTER_NAME} --yes --admin
# Enroll the control-plane VM
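# There is no cloud IPAM on bare metal, so each node's pod CIDR is assigned
# explicitly via --pod-cidr.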
${KOPS} toolbox enroll --cluster ${CLUSTER_NAME} --instance-group control-plane-main --host ${VM0_IP} --pod-cidr ${VM0_POD_CIDR} --v=2
# Manual creation of "volumes" for etcd, and setting up peer nodes
cat <<EOF | ssh root@${VM0_IP} tee -a /etc/hosts
# Hosts added for etcd discovery
10.123.45.10 node0.main.${CLUSTER_NAME}
10.123.45.10 node0.events.${CLUSTER_NAME}
EOF
ssh root@${VM0_IP} cat /etc/hosts
ssh root@${VM0_IP} mkdir -p /mnt/disks/${CLUSTER_NAME}--main--0/mnt
ssh root@${VM0_IP} touch /mnt/disks/${CLUSTER_NAME}--main--0/mnt/please-create-new-cluster
ssh root@${VM0_IP} mkdir -p /mnt/disks/${CLUSTER_NAME}--events--0/mnt
ssh root@${VM0_IP} touch /mnt/disks/${CLUSTER_NAME}--events--0/mnt/please-create-new-cluster
echo "Waiting for kube to start"
# Wait for kube-apiserver to be ready, timeout after 10 minutes
for i in {1..60}; do
  if kubectl get nodes; then
    break
  fi
  sleep 10
done
kubectl get nodes
kubectl get pods -A
# Install kindnet
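# kindnet routes pod traffic between nodes based on each node's pod CIDR.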
kubectl create -f https://raw.githubusercontent.com/aojea/kindnet/main/install-kindnet.yaml
echo "Waiting 10 seconds for kindnet to start"
sleep 10
kubectl get nodes
kubectl get pods -A
# For host records
kubectl create ns kops-system
kubectl apply -f ${REPO_ROOT}/k8s/crds/kops.k8s.io_hosts.yaml
# kops-controller extra permissions
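# Allows kops-controller to read Host objects (kops.k8s.io) and to patch node status,
# which it needs in order to set node addresses.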
kubectl apply --server-side -f - <<EOF
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kops-controller:pki-verifier
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kops-controller:pki-verifier
subjects:
- apiGroup: rbac.authorization.k8s.io
  kind: User
  name: system:serviceaccount:kube-system:kops-controller
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kops-controller:pki-verifier
rules:
- apiGroups:
  - "kops.k8s.io"
  resources:
  - hosts
  verbs:
  - get
  - list
  - watch
# Must be able to set node addresses
# TODO: Move out?
- apiGroups:
  - ""
  resources:
  - nodes/status
  verbs:
  - patch
EOF
function enroll_node() {
  local node_ip=$1
  local pod_cidr=$2

  # Manual "discovery" for control-plane endpoints
  # TODO: Replace with well-known IP
  cat <<EOF | ssh root@${node_ip} tee -a /etc/hosts
# Hosts added for leader discovery
10.123.45.10 kops-controller.internal.${CLUSTER_NAME}
10.123.45.10 api.internal.${CLUSTER_NAME}
EOF

  # Pass the pod CIDR through to enroll; there is no cloud IPAM on bare metal
  timeout 10m ${KOPS} toolbox enroll --cluster ${CLUSTER_NAME} --instance-group nodes-main --host ${node_ip} --pod-cidr ${pod_cidr} --v=2
}
enroll_node ${VM1_IP} ${VM1_POD_CIDR}
enroll_node ${VM2_IP} ${VM2_POD_CIDR}
echo "Waiting 30 seconds for nodes to be ready"
sleep 30
kubectl get nodes
kubectl get nodes -o yaml
kubectl get pods -A
# Ensure the cluster passes validation
${KOPS} validate cluster ${CLUSTER_NAME} --wait=10m
# Run a few bare-metal e2e tests
echo "running e2e tests"
cd ${REPO_ROOT}/tests/e2e/scenarios/bare-metal
go test -v .
echo "Test successful"