From 696f30021fa05901457b36e4bfc9efdc7390345a Mon Sep 17 00:00:00 2001 From: Alejandro Ruiz Date: Fri, 17 Jan 2025 17:33:37 +0100 Subject: [PATCH 01/13] Add support for specifying arbitrary helm values for installing Rancher --- cmd/dartboard/subcommands/deploy.go | 28 +++++++++++++++++++++++++--- darts/aws.yaml | 4 ++++ darts/azure.yaml | 4 ++++ darts/k3d.yaml | 5 +++++ internal/dart/recipe.go | 1 + internal/helm/helm.go | 3 ++- 6 files changed, 41 insertions(+), 4 deletions(-) diff --git a/cmd/dartboard/subcommands/deploy.go b/cmd/dartboard/subcommands/deploy.go index 95f669e..6e28570 100644 --- a/cmd/dartboard/subcommands/deploy.go +++ b/cmd/dartboard/subcommands/deploy.go @@ -114,7 +114,7 @@ func Deploy(cli *cli.Context) error { return GetAccess(cli) } -func chartInstall(kubeConf string, chart chart, vals map[string]any) error { +func chartInstall(kubeConf string, chart chart, vals map[string]any, extraArgs ...string) error { var err error name := chart.name @@ -126,7 +126,7 @@ func chartInstall(kubeConf string, chart chart, vals map[string]any) error { log.Printf("Installing chart %q (%s)\n", namespace+"/"+name, path) - if err = helm.Install(kubeConf, path, name, namespace, vals); err != nil { + if err = helm.Install(kubeConf, path, name, namespace, vals, extraArgs...); err != nil { return fmt.Errorf("chart %s: %w", name, err) } return nil @@ -190,7 +190,29 @@ func chartInstallRancher(r *dart.Dart, rancherImageTag string, cluster *tofu.Clu chartVals := getRancherValsJSON(r.ChartVariables.RancherImageOverride, rancherImageTag, r.ChartVariables.AdminPassword, rancherClusterName, rancherClusterURL, r.ChartVariables.RancherReplicas) - return chartInstall(cluster.Kubeconfig, chartRancher, chartVals) + var extraArgs []string + if r.ChartVariables.Values != "" { + p, err := writeValuesFile(r.ChartVariables.Values) + if err != nil { + return fmt.Errorf("writing extra values file: %w", err) + } + defer os.Remove(p) + + extraArgs = append(extraArgs, "-f", p) + } + + return chartInstall(cluster.Kubeconfig, chartRancher, chartVals, extraArgs...) 
+}
+
+// writeValuesFile saves the user-provided values YAML to a temporary file and
+// returns its path; on success the caller is responsible for removing the file
+func writeValuesFile(content string) (string, error) {
+	p, err := os.CreateTemp("", "values-*.yaml")
+	if err != nil {
+		return "", err
+	}
+	defer p.Close()
+	if _, err := io.WriteString(p, content); err != nil {
+		os.Remove(p.Name())
+		return "", err
+	}
+	return p.Name(), nil
}

func chartInstallRancherIngress(cluster *tofu.Cluster) error {
diff --git a/darts/aws.yaml b/darts/aws.yaml
index f8ae591..7be1ccf 100644
--- a/darts/aws.yaml
+++ b/darts/aws.yaml
@@ -77,6 +77,10 @@ chart_variables:
 #  rancher_image_override: rancher/rancher
 #  rancher_image_tag_override: v2.8.6-debug-1

+# Set arbitrary helm values (in yaml format) for installing Rancher
+# values: |
+#   features: "my-feature-flag=true"
+
 test_variables:
   test_config_maps: 2000
   test_secrets: 2000
diff --git a/darts/azure.yaml b/darts/azure.yaml
index 69a3d23..3b6863e 100644
--- a/darts/azure.yaml
+++ b/darts/azure.yaml
@@ -103,6 +103,10 @@ chart_variables:
 #  rancher_image_override: rancher/rancher
 #  rancher_image_tag_override: v2.8.6-debug-1

+# Set arbitrary helm values (in yaml format) for installing Rancher
+# values: |
+#   features: "my-feature-flag=true"
+
 test_variables:
   test_config_maps: 2000
   test_secrets: 2000
diff --git a/darts/k3d.yaml b/darts/k3d.yaml
index 451b63c..bc71cb6 100644
--- a/darts/k3d.yaml
+++ b/darts/k3d.yaml
@@ -59,6 +59,11 @@ chart_variables:
 #  rancher_image_override: rancher/rancher
 #  rancher_image_tag_override: v2.8.6-debug-1

+# Set arbitrary helm values (in yaml format) for installing Rancher
+# values: |
+#   features: "my-feature-flag=true"
+
+
 test_variables:
   test_config_maps: 2000
   test_secrets: 2000
diff --git a/internal/dart/recipe.go b/internal/dart/recipe.go
index aaa18f9..4637d98 100644
--- a/internal/dart/recipe.go
+++ b/internal/dart/recipe.go
@@ -32,6 +32,7 @@ type ChartVariables struct {
 	RancherMonitoringVersion string `yaml:"rancher_monitoring_version"`
 	CertManagerVersion       string `yaml:"cert_manager_version"`
 	TesterGrafanaVersion     string `yaml:"tester_grafana_version"`
+	Values                   string `yaml:"values"`
 }

 type TestVariables struct {
diff --git a/internal/helm/helm.go b/internal/helm/helm.go
index 0553798..e79f2f9 100644
--- a/internal/helm/helm.go
+++ b/internal/helm/helm.go
@@ -25,7 +25,7 @@ import (
 	"github.com/rancher/dartboard/internal/vendored"
 )

-func Install(kubecfg, chartLocation, releaseName, namespace string, vals map[string]any) error {
+func Install(kubecfg, chartLocation, releaseName, namespace string, vals map[string]any, extraArgs ...string) error {
 	args := []string{
 		"--kubeconfig=" + kubecfg,
 		"upgrade",
@@ -46,6 +46,7 @@ func Install(kubecfg, chartLocation, releaseName, namespace string, vals map[str
 	}
 	args = append(args, "--set-json="+valueString)
 	}
+	// note: per Helm's merge rules, --set-json values above take precedence over values files appended via extraArgs
+	args = append(args, extraArgs...)

 	cmd := vendored.Command("helm", args...)
var errStream strings.Builder From d8e1c3dbd72bcc636c7f59d632364cd2bca8a206 Mon Sep 17 00:00:00 2001 From: Alejandro Ruiz Date: Mon, 20 Jan 2025 12:10:46 +0100 Subject: [PATCH 02/13] Rename key to rancher_values --- cmd/dartboard/subcommands/deploy.go | 4 ++-- darts/aws.yaml | 2 +- darts/azure.yaml | 2 +- darts/k3d.yaml | 2 +- internal/dart/recipe.go | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cmd/dartboard/subcommands/deploy.go b/cmd/dartboard/subcommands/deploy.go index 6e28570..236b6c9 100644 --- a/cmd/dartboard/subcommands/deploy.go +++ b/cmd/dartboard/subcommands/deploy.go @@ -191,8 +191,8 @@ func chartInstallRancher(r *dart.Dart, rancherImageTag string, cluster *tofu.Clu chartVals := getRancherValsJSON(r.ChartVariables.RancherImageOverride, rancherImageTag, r.ChartVariables.AdminPassword, rancherClusterName, rancherClusterURL, r.ChartVariables.RancherReplicas) var extraArgs []string - if r.ChartVariables.Values != "" { - p, err := writeValuesFile(r.ChartVariables.Values) + if r.ChartVariables.RancherValues != "" { + p, err := writeValuesFile(r.ChartVariables.RancherValues) if err != nil { return fmt.Errorf("writing extra values file: %w", err) } diff --git a/darts/aws.yaml b/darts/aws.yaml index 7be1ccf..c151ca4 100644 --- a/darts/aws.yaml +++ b/darts/aws.yaml @@ -78,7 +78,7 @@ chart_variables: # rancher_image_tag_override: v2.8.6-debug-1 # Set arbitrary helm values (in yaml format) for installing Rancher -# values: | +# rancher_values: | # features: "my-feature-flag=true" test_variables: diff --git a/darts/azure.yaml b/darts/azure.yaml index 3b6863e..17a419a 100644 --- a/darts/azure.yaml +++ b/darts/azure.yaml @@ -104,7 +104,7 @@ chart_variables: # rancher_image_tag_override: v2.8.6-debug-1 # Set arbitrary helm values (in yaml format) for installing Rancher -# values: | +# rancher_values: | # features: "my-feature-flag=true" test_variables: diff --git a/darts/k3d.yaml b/darts/k3d.yaml index bc71cb6..1819145 100644 --- a/darts/k3d.yaml +++ b/darts/k3d.yaml @@ -60,7 +60,7 @@ chart_variables: # rancher_image_tag_override: v2.8.6-debug-1 # Set arbitrary helm values (in yaml format) for installing Rancher -# values: | +# rancher_values: | # features: "my-feature-flag=true" diff --git a/internal/dart/recipe.go b/internal/dart/recipe.go index 4637d98..3471339 100644 --- a/internal/dart/recipe.go +++ b/internal/dart/recipe.go @@ -32,7 +32,7 @@ type ChartVariables struct { RancherMonitoringVersion string `yaml:"rancher_monitoring_version"` CertManagerVersion string `yaml:"cert_manager_version"` TesterGrafanaVersion string `yaml:"tester_grafana_version"` - Values string `yaml:"values"` + RancherValues string `yaml:"rancher_values"` } type TestVariables struct { From f6b58e5c8677184e23fb5374ab52e2d32811acc8 Mon Sep 17 00:00:00 2001 From: Alejandro Ruiz Date: Tue, 21 Jan 2025 16:27:23 +0100 Subject: [PATCH 03/13] Use correct cluster name when importing images into k3d --- cmd/dartboard/subcommands/utils.go | 3 +-- internal/k3d/k3d.go | 5 ++--- internal/tofu/tofu.go | 1 + tofu/modules/k3d/k3s/main.tf | 13 +++++++------ tofu/modules/k3d/k3s/outputs.tf | 1 + 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cmd/dartboard/subcommands/utils.go b/cmd/dartboard/subcommands/utils.go index 3238ff5..65e5b11 100644 --- a/cmd/dartboard/subcommands/utils.go +++ b/cmd/dartboard/subcommands/utils.go @@ -162,8 +162,7 @@ func importImageIntoK3d(tf *tofu.Tofu, image string, cluster tofu.Cluster) error } if len(images) > 0 { - err = k3d.ImageImport(cluster, images[0]) - if 
err != nil { + if err := k3d.ImageImport(cluster.Name, images[0]); err != nil { return err } } diff --git a/internal/k3d/k3d.go b/internal/k3d/k3d.go index 885f826..0f754cb 100644 --- a/internal/k3d/k3d.go +++ b/internal/k3d/k3d.go @@ -21,12 +21,11 @@ import ( "os" "strings" - "github.com/rancher/dartboard/internal/tofu" "github.com/rancher/dartboard/internal/vendored" ) -func ImageImport(cluster tofu.Cluster, image string) error { - args := []string{"image", "import", "--cluster", strings.Replace(cluster.Context, "k3d-", "", -1), image} +func ImageImport(k3dClusterName string, image string) error { + args := []string{"image", "import", "--cluster", k3dClusterName, image} cmd := vendored.Command("k3d", args...) var errStream strings.Builder diff --git a/internal/tofu/tofu.go b/internal/tofu/tofu.go index 2e2e230..f7fe22c 100644 --- a/internal/tofu/tofu.go +++ b/internal/tofu/tofu.go @@ -52,6 +52,7 @@ type Addresses struct { type Cluster struct { AppAddresses ClusterAppAddresses `json:"app_addresses"` + Name string `json:"name"` Context string `json:"context"` IngressClassName string `json:"ingress_class_name"` Kubeconfig string `json:"kubeconfig"` diff --git a/tofu/modules/k3d/k3s/main.tf b/tofu/modules/k3d/k3s/main.tf index 425bcbd..6882f3d 100644 --- a/tofu/modules/k3d/k3s/main.tf +++ b/tofu/modules/k3d/k3s/main.tf @@ -325,6 +325,7 @@ resource "k3d_cluster" "cluster" { locals { local_kubernetes_api_url = nonsensitive(k3d_cluster.cluster[0].credentials[0].host) + k3d_cluster_name = "${var.project_name}-${var.name}" } resource "local_file" "kubeconfig" { @@ -337,19 +338,19 @@ resource "local_file" "kubeconfig" { certificate-authority-data = base64encode(k3d_cluster.cluster[0].credentials[0].cluster_ca_certificate) server = local.local_kubernetes_api_url } - name = "k3d-${var.project_name}-${var.name}" + name = "k3d-${local.k3d_cluster_name}" } ] contexts = [ { context = { - cluster = "k3d-${var.project_name}-${var.name}" - user : "admin@k3d-${var.project_name}-${var.name}" + cluster = "k3d-${local.k3d_cluster_name}" + user : "admin@k3d-${local.k3d_cluster_name}" } - name = "k3d-${var.project_name}-${var.name}" + name = "k3d-${local.k3d_cluster_name}" } ] - current-context = "k3d-${var.project_name}-${var.name}" + current-context = "k3d-${local.k3d_cluster_name}" kind = "Config" preferences = {} users = [ @@ -358,7 +359,7 @@ resource "local_file" "kubeconfig" { client-certificate-data : base64encode(k3d_cluster.cluster[0].credentials[0].client_certificate) client-key-data : base64encode(k3d_cluster.cluster[0].credentials[0].client_key) } - name : "admin@k3d-${var.project_name}-${var.name}" + name : "admin@k3d-${local.k3d_cluster_name}" } ] }) diff --git a/tofu/modules/k3d/k3s/outputs.tf b/tofu/modules/k3d/k3s/outputs.tf index c7d0b37..565ef1d 100644 --- a/tofu/modules/k3d/k3s/outputs.tf +++ b/tofu/modules/k3d/k3s/outputs.tf @@ -2,6 +2,7 @@ output "config" { value = { kubeconfig = var.server_count > 0 ? 
abspath(local_file.kubeconfig[0].filename) : null context = var.name + name = local.k3d_cluster_name // addresses of the Kubernetes API server kubernetes_addresses = { From 5608fcc69ae96a410736770056494c98bff052d6 Mon Sep 17 00:00:00 2001 From: Iramis Valentin <85186645+git-ival@users.noreply.github.com> Date: Wed, 22 Jan 2025 07:32:34 +0000 Subject: [PATCH 04/13] update .gitignore --- .gitignore | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 6bf284a..b15c108 100644 --- a/.gitignore +++ b/.gitignore @@ -2,10 +2,10 @@ .DS_Store # tofu related -tofu/main/*/.terraform -tofu/main/*/.terraform.lock.hcl -tofu/main/*/terraform.tfstate -tofu/main/*/terraform.tfstate.* +tofu/**/*/.terraform +tofu/**/*/.terraform.lock.hcl +tofu/**/*/terraform.tfstate +tofu/**/*/terraform.tfstate.* tofu/main/*/*config *.tfvars From dca6a122d3463293116b937ac143171e660e7ec4 Mon Sep 17 00:00:00 2001 From: Iramis Valentin <85186645+git-ival@users.noreply.github.com> Date: Wed, 22 Jan 2025 07:33:15 +0000 Subject: [PATCH 05/13] enable byo vpc functionality --- tofu/main/aws/main.tf | 1 + tofu/main/aws/variables.tf | 6 + tofu/modules/aws/network/data.tf | 84 ++++++++++++++ tofu/modules/aws/network/main.tf | 158 ++++++++++++++------------ tofu/modules/aws/network/outputs.tf | 6 +- tofu/modules/aws/network/variables.tf | 7 ++ 6 files changed, 185 insertions(+), 77 deletions(-) create mode 100644 tofu/modules/aws/network/data.tf diff --git a/tofu/main/aws/main.tf b/tofu/main/aws/main.tf index 90f6db1..bd74851 100644 --- a/tofu/main/aws/main.tf +++ b/tofu/main/aws/main.tf @@ -8,6 +8,7 @@ module "network" { project_name = var.project_name region = var.region availability_zone = var.availability_zone + existing_vpc_name = var.existing_vpc_name bastion_host_ami = length(var.bastion_host_ami) > 0 ? var.bastion_host_ami : null ssh_bastion_user = var.ssh_bastion_user ssh_public_key_path = var.ssh_public_key_path diff --git a/tofu/main/aws/variables.tf b/tofu/main/aws/variables.tf index fd28bac..42fa4ed 100644 --- a/tofu/main/aws/variables.tf +++ b/tofu/main/aws/variables.tf @@ -87,6 +87,12 @@ variable "availability_zone" { default = "us-east-1a" } +variable "existing_vpc_name" { + description = "Name of existing VPC to use. If null, a new VPC will be created" + type = string + default = null +} + variable "bastion_host_ami" { description = "AMI ID" default = "ami-0e55a8b472a265e3f" diff --git a/tofu/modules/aws/network/data.tf b/tofu/modules/aws/network/data.tf new file mode 100644 index 0000000..5ba1198 --- /dev/null +++ b/tofu/modules/aws/network/data.tf @@ -0,0 +1,84 @@ +# Data source to look up existing VPC +data "aws_vpc" "existing" { + count = local.create_vpc ? 0 : 1 + + filter { + name = "tag:Name" + values = [var.existing_vpc_name] + } +} + +data "aws_internet_gateway" "existing" { + count = local.create_vpc ? 0 : 1 + filter { + name = "attachment.vpc-id" + values = [local.vpc_id] + } +} + +# Data sources to look up existing subnets +data "aws_subnet" "public" { + count = local.create_vpc ? 0 : 1 + vpc_id = one(data.aws_vpc.existing[*].id) + availability_zone = var.availability_zone + + # filter { + # name = "vpc-id" + # values = [one(data.aws_vpc.existing[*].id)] + # } + + # filter { + # name = "availability-zone" + # values = [var.availability_zone] + # } + + tags = { + Name = "*public*", + Tier = "Public" + } +} + +data "aws_subnet" "private" { + count = local.create_vpc ? 
0 : 1 + vpc_id = one(data.aws_vpc.existing[*].id) + availability_zone = var.availability_zone + + # filter { + # name = "vpc-id" + # values = [one(data.aws_vpc.existing[*].id)] + # } + + # filter { + # name = "availability-zone" + # values = [var.availability_zone] + # } + + tags = { + Name = "*private*" + Tier = "Private" + } +} + +data "aws_subnet" "secondary_private" { + count = local.create_vpc && var.secondary_availability_zone != null ? 0 : 1 + vpc_id = one(data.aws_vpc.existing[*].id) + availability_zone = var.secondary_availability_zone + +# filter { +# name = "vpc-id" +# values = [one(data.aws_vpc.existing[*].id)] +# } + +# dynamic "filter" { +# for_each = var.secondary_availability_zone != null ? [1] : [] +# content { +# name = "availability-zone" +# values = [var.secondary_availability_zone] +# } +# } + + tags = { + Name = "*secondary*private*" + Tier = "SecondaryPrivate" + } +} diff --git a/tofu/modules/aws/network/main.tf b/tofu/modules/aws/network/main.tf index c99c843..278c513 100644 --- a/tofu/modules/aws/network/main.tf +++ b/tofu/modules/aws/network/main.tf @@ -1,11 +1,6 @@ -/* - This module sets up a class B VPC sliced into three subnets, one public and one or two private. - The public network has an Internet Gateway and accepts SSH connections only. - The private networks have Internet access but do not accept any connections. - A secondary private connection is optional, and is used to support RDS use cases. -*/ - +# VPC resource created only when existing_vpc_name is null resource "aws_vpc" "main" { + count = local.create_vpc ? 1 : 0 cidr_block = "172.16.0.0/16" enable_dns_support = true enable_dns_hostnames = true @@ -16,7 +11,21 @@ resource "aws_vpc" "main" { } } +# Update locals to use coalescing for resource selection +locals { + vpc_id = coalesce(one(aws_vpc.main[*].id), one(data.aws_vpc.existing[*].id)) + vpc_cidr_block = coalesce(one(aws_vpc.main[*].cidr_block), one(data.aws_vpc.existing[*].cidr_block)) + internet_gateway_id = coalesce(one(aws_internet_gateway.main[*].id), one(data.aws_internet_gateway.existing[*].id)) + + public_subnet_id = coalesce(one(aws_subnet.public[*].id), one(data.aws_subnet.public[*].id)) + private_subnet_id = coalesce(one(aws_subnet.private[*].id), one(data.aws_subnet.private[*].id)) + secondary_private_subnet_id = coalesce(one(aws_subnet.secondary_private[*].id), one(data.aws_subnet.secondary_private[*].id)) + + create_vpc = var.existing_vpc_name == null +} + resource "aws_internet_gateway" "main" { + count = local.create_vpc ? 1 : 0 vpc_id = local.vpc_id tags = { @@ -25,12 +34,8 @@ resource "aws_internet_gateway" "main" { } } -locals { - vpc_id = aws_vpc.main.id - vpc_cidr_block = aws_vpc.main.cidr_block -} - resource "aws_eip" "nat_eip" { + tags = { Project = var.project_name Name = "${var.project_name}-nat-eip" @@ -39,9 +44,9 @@ resource "aws_eip" "nat_eip" { resource "aws_nat_gateway" "nat" { allocation_id = aws_eip.nat_eip.id - subnet_id = aws_subnet.public.id + subnet_id = local.public_subnet_id - depends_on = [aws_internet_gateway.main] + depends_on = [data.aws_internet_gateway.existing, aws_internet_gateway.main] tags = { Project = var.project_name @@ -49,17 +54,52 @@ resource "aws_nat_gateway" "nat" { } } -resource "aws_route_table" "public" { - vpc_id = local.vpc_id - - route { - cidr_block = "0.0.0.0/0" - gateway_id = aws_internet_gateway.main.id - } +resource "aws_subnet" "public" { + count = local.create_vpc ? 
1 : 0 + availability_zone = var.availability_zone + vpc_id = local.vpc_id + cidr_block = "172.16.0.0/24" + map_public_ip_on_launch = true tags = { Project = var.project_name - Name = "${var.project_name}-public-route-table" + Name = "${var.project_name}-public-subnet" + } +} + +resource "aws_subnet" "private" { + count = local.create_vpc ? 1 : 0 + availability_zone = var.availability_zone + vpc_id = local.vpc_id + cidr_block = "172.16.1.0/24" + map_public_ip_on_launch = false + + tags = { + Project = var.project_name + Name = "${var.project_name}-private-subnet" + } +} + +resource "aws_subnet" "secondary_private" { + count = local.create_vpc && var.secondary_availability_zone != null ? 1 : 0 + availability_zone = var.secondary_availability_zone + vpc_id = local.vpc_id + cidr_block = "172.16.2.0/24" + map_public_ip_on_launch = false + + tags = { + Project = var.project_name + Name = "${var.project_name}-secondary-private-subnet" + } +} + +resource "aws_key_pair" "key_pair" { + key_name = "${var.project_name}-key-pair" + public_key = file(var.ssh_public_key_path) + + tags = { + Project = var.project_name + Name = "${var.project_name}-ssh-key-pair" } } @@ -68,6 +108,20 @@ resource "aws_main_route_table_association" "vpc_internet" { route_table_id = aws_route_table.public.id } +resource "aws_route_table" "public" { + vpc_id = local.vpc_id + + route { + cidr_block = "0.0.0.0/0" + gateway_id = local.internet_gateway_id + } + + tags = { + Project = var.project_name + Name = "${var.project_name}-public-route-table" + } +} + resource "aws_route_table" "private" { vpc_id = local.vpc_id @@ -82,53 +136,16 @@ resource "aws_route_table" "private" { } } -resource "aws_subnet" "public" { - availability_zone = var.availability_zone - vpc_id = local.vpc_id - cidr_block = "172.16.0.0/24" - map_public_ip_on_launch = true - - tags = { - Project = var.project_name - Name = "${var.project_name}-public-subnet" - } -} - resource "aws_route_table_association" "public" { - subnet_id = aws_subnet.public.id + subnet_id = local.public_subnet_id route_table_id = aws_route_table.public.id } -resource "aws_subnet" "private" { - availability_zone = var.availability_zone - vpc_id = local.vpc_id - cidr_block = "172.16.1.0/24" - map_public_ip_on_launch = false - - tags = { - Project = var.project_name - Name = "${var.project_name}-private-subnet" - } -} - resource "aws_route_table_association" "private" { - subnet_id = aws_subnet.private.id + subnet_id = local.private_subnet_id route_table_id = aws_route_table.private.id } -resource "aws_subnet" "secondary_private" { - count = var.secondary_availability_zone != null ? 1 : 0 - availability_zone = var.secondary_availability_zone - vpc_id = local.vpc_id - cidr_block = "172.16.2.0/24" - map_public_ip_on_launch = false - - tags = { - Project = var.project_name - Name = "${var.project_name}-secondary-private-subnet" - } -} - resource "aws_route_table_association" "secondary_private" { count = var.secondary_availability_zone != null ? 1 : 0 subnet_id = aws_subnet.secondary_private[0].id @@ -136,6 +153,7 @@ resource "aws_route_table_association" "secondary_private" { } resource "aws_vpc_dhcp_options" "dhcp_options" { + count = local.create_vpc ? 1 : 0 domain_name = var.region == "us-east-1" ? "ec2.internal" : "${var.region}.compute.internal" domain_name_servers = ["AmazonProvidedDNS"] @@ -146,8 +164,9 @@ resource "aws_vpc_dhcp_options" "dhcp_options" { } resource "aws_vpc_dhcp_options_association" "vpc_dhcp_options" { + count = local.create_vpc ? 
1 : 0 vpc_id = local.vpc_id - dhcp_options_id = aws_vpc_dhcp_options.dhcp_options.id + dhcp_options_id = aws_vpc_dhcp_options.dhcp_options[0].id } resource "aws_security_group" "public" { @@ -222,16 +241,7 @@ resource "aws_security_group" "private" { } } -resource "aws_key_pair" "key_pair" { - key_name = "${var.project_name}-key-pair" - public_key = file(var.ssh_public_key_path) - - tags = { - Project = var.project_name - Name = "${var.project_name}-ssh-key-pair" - } -} - +# Update the bastion module configuration module "bastion" { source = "../node" project_name = var.project_name @@ -247,8 +257,8 @@ module "bastion" { } network_config = { availability_zone : var.availability_zone, - public_subnet_id : aws_subnet.public.id - private_subnet_id : aws_subnet.private.id + public_subnet_id : local.public_subnet_id + private_subnet_id : local.private_subnet_id secondary_private_subnet_id : var.secondary_availability_zone != null ? aws_subnet.secondary_private[0].id : null public_security_group_id : aws_security_group.public.id private_security_group_id : aws_security_group.private.id diff --git a/tofu/modules/aws/network/outputs.tf b/tofu/modules/aws/network/outputs.tf index d44c3e5..a6318f6 100644 --- a/tofu/modules/aws/network/outputs.tf +++ b/tofu/modules/aws/network/outputs.tf @@ -1,9 +1,9 @@ output "config" { value = { availability_zone : var.availability_zone, - public_subnet_id : aws_subnet.public.id, - private_subnet_id : aws_subnet.private.id, - secondary_private_subnet_id : var.secondary_availability_zone != null ? aws_subnet.secondary_private[0].id : null, + public_subnet_id : local.public_subnet_id, + private_subnet_id : local.private_subnet_id, + secondary_private_subnet_id : var.secondary_availability_zone != null ? local.secondary_private_subnet_id : null, public_security_group_id : aws_security_group.public.id, private_security_group_id : aws_security_group.private.id, ssh_key_name : aws_key_pair.key_pair.key_name, diff --git a/tofu/modules/aws/network/variables.tf b/tofu/modules/aws/network/variables.tf index f4da6c6..32ddca9 100644 --- a/tofu/modules/aws/network/variables.tf +++ b/tofu/modules/aws/network/variables.tf @@ -45,3 +45,10 @@ variable "bastion_host_instance_type" { description = "EC2 instance type" default = "t4g.small" } + +# Variables for existing VPC configuration +variable "existing_vpc_name" { + description = "Name of existing VPC to use. 
If null, a new VPC will be created" + type = string + default = null +} From e8422c2e8347d142c21b03a930df79169b17bda4 Mon Sep 17 00:00:00 2001 From: Iramis Valentin <85186645+git-ival@users.noreply.github.com> Date: Wed, 22 Jan 2025 18:13:46 +0000 Subject: [PATCH 06/13] add type hints to variables to ensure opentofu parses them as the intended types --- tofu/modules/generic/k3s/variables.tf | 7 +++++++ tofu/modules/generic/rke2/variables.tf | 7 +++++++ tofu/modules/generic/test_environment/variables.tf | 3 +++ 3 files changed, 17 insertions(+) diff --git a/tofu/modules/generic/k3s/variables.tf b/tofu/modules/generic/k3s/variables.tf index 2a0a36e..606fad9 100644 --- a/tofu/modules/generic/k3s/variables.tf +++ b/tofu/modules/generic/k3s/variables.tf @@ -15,11 +15,13 @@ variable "distro_version" { variable "server_count" { description = "Number of server nodes in this cluster" + type = number default = 1 } variable "agent_count" { description = "Number of agent nodes in this cluster" + type = number default = 0 } @@ -41,16 +43,19 @@ variable "ssh_user" { variable "local_kubernetes_api_port" { description = "Local port this cluster's Kubernetes API will be published to (via SSH tunnel)" + type = number default = 6445 } variable "tunnel_app_http_port" { description = "Local port this cluster's http endpoints will be published to (via SSH tunnel)" + type = number default = 8080 } variable "tunnel_app_https_port" { description = "Local port this cluster's https endpoints will be published to (via SSH tunnel)" + type = number default = 8443 } @@ -62,11 +67,13 @@ variable "sans" { variable "max_pods" { description = "Maximum number of pods per node" + type = number default = 110 } variable "node_cidr_mask_size" { description = "Size of the CIDR mask for nodes. Increase when increasing max_pods so that 2^(32-node_cidr_max_size) > 2 * max_pods" + type = number default = 24 } diff --git a/tofu/modules/generic/rke2/variables.tf b/tofu/modules/generic/rke2/variables.tf index 20520d4..6076161 100644 --- a/tofu/modules/generic/rke2/variables.tf +++ b/tofu/modules/generic/rke2/variables.tf @@ -16,11 +16,13 @@ variable "distro_version" { variable "server_count" { description = "Number of server nodes in this cluster" + type = number default = 1 } variable "agent_count" { description = "Number of agent nodes in this cluster" + type = number default = 0 } @@ -42,16 +44,19 @@ variable "ssh_user" { variable "local_kubernetes_api_port" { description = "Port this cluster's Kubernetes API will be published to (for inclusion in kubeconfig)" + type = number default = 6443 } variable "tunnel_app_http_port" { description = "Local port this cluster's http endpoints will be published to (via SSH tunnel)" + type = number default = 8080 } variable "tunnel_app_https_port" { description = "Local port this cluster's https endpoints will be published to (via SSH tunnel)" + type = number default = 8443 } @@ -63,11 +68,13 @@ variable "sans" { variable "max_pods" { description = "Maximum number of pods per node" + type = number default = 110 } variable "node_cidr_mask_size" { description = "Size of the CIDR mask for nodes. 
Increase when increasing max_pods so that 2^(32-node_cidr_max_size) > 2 * max_pods"
+  type        = number
   default     = 24
 }

diff --git a/tofu/modules/generic/test_environment/variables.tf b/tofu/modules/generic/test_environment/variables.tf
index a0d2272..5b674d6 100644
--- a/tofu/modules/generic/test_environment/variables.tf
+++ b/tofu/modules/generic/test_environment/variables.tf
@@ -98,15 +98,18 @@ variable "project_name" {

 variable "first_kubernetes_api_port" {
   description = "Port number where the Kubernetes API of the first cluster is published locally. Other clusters' ports are published in successive ports"
+  type        = number
   default     = 7445
 }

 variable "first_app_http_port" {
   description = "Port number where the first server's port 80 is published locally. Other clusters' ports are published in successive ports"
+  type        = number
   default     = 9080
 }

 variable "first_app_https_port" {
   description = "Port number where the first server's port 443 is published locally. Other clusters' ports are published in successive ports"
+  type        = number
   default     = 9443
 }

From c033f3a3dba1d53c7e25038cd46604e1eaf110e8 Mon Sep 17 00:00:00 2001
From: Iramis Valentin <85186645+git-ival@users.noreply.github.com>
Date: Wed, 22 Jan 2025 18:29:13 +0000
Subject: [PATCH 07/13] add setup required for BYO VPC, remove dead commented
 code

---
 README.md                        | 14 +++++++++++++-
 tofu/modules/aws/network/data.tf | 33 --------------------------------
 2 files changed, 13 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index 7275de6..8b1ea67 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,18 @@ To recreate environments:
 - `dartboard reapply` runs `destroy` and then `apply`, tearing down and recreating test configuration infrastructure without any software (Rancher, load generation, monitoring...)
 - `dartboard redeploy` runs `destroy` and then `deploy`, tearing down and recreating the full environment, infrastructure and software (use this if unsure)

+### "Bring Your Own" AWS VPC
+Some manual configuration is required in order to use an existing AWS VPC instead of having the tofu modules create a full set of networking resources.
+
+1. Have an existing VPC with a DHCP options set configured so that DNS = "AmazonProvidedDNS".
+2. Create three subnets, with the following requirements:
+   1. One subnet's name should contain the substring "public" (case-sensitive), and it should be tagged with `Tier = Public` (case-sensitive)
+   2. One subnet's name should contain the substring "private" (case-sensitive), and it should be tagged with `Tier = Private` (case-sensitive)
+   3. One subnet's name should contain the substring "secondary-private" (case-sensitive), and it should be tagged with `Tier = SecondaryPrivate` (case-sensitive)
+   4. Each subnet should be assigned to the VPC you intend to use
+
+Once these resources are manually set up, you can set the `existing_vpc_name` tofu variable in your Dart file and deploy as you normally would (see the sketch below).
+
 ## Installation

Download and unpack a [release](https://github.com/rancher/dartboard/releases/); it is a self-contained binary.
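For illustration, here is a minimal sketch of the dart-file setting described in the "Bring Your Own" AWS VPC section above — assuming, per the Dart files notes in DEVELOPING.md, that entries under `tofu_variables` are passed as-is to the AWS main module; the VPC name is hypothetical:

```yaml
# dart file excerpt (hypothetical values)
tofu_main_directory: ./tofu/main/aws
tofu_variables:
  # Name tag of the manually created VPC from the BYO VPC section;
  # leave it unset (default null) to have a new VPC created instead
  existing_vpc_name: my-preexisting-vpc
```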
@@ -80,7 +92,7 @@ pkill -f 'ssh .*-o IgnoreUnknown=TofuCreatedThisTunnel.*'

 If an Azure VM is not accessible via SSH, try the following:
 - add the `boot_diagnostics = true` option in `inputs.tf`
 - apply or re-deploy
 - in the Azure Portal, click on Home -> Virtual Machines -> *your VM* -> Help -> Reset Password
 - then Home -> Virtual Machines -> *your VM* -> Help -> Serial Console

 That should give you access to the VM's console, where you can log in with the new password and troubleshoot.
diff --git a/tofu/modules/aws/network/data.tf b/tofu/modules/aws/network/data.tf
index 5ba1198..cfacf13 100644
--- a/tofu/modules/aws/network/data.tf
+++ b/tofu/modules/aws/network/data.tf
@@ -22,16 +22,6 @@ data "aws_subnet" "public" {
   vpc_id            = one(data.aws_vpc.existing[*].id)
   availability_zone = var.availability_zone

-  # filter {
-  #   name   = "vpc-id"
-  #   values = [one(data.aws_vpc.existing[*].id)]
-  # }
-
-  # filter {
-  #   name   = "availability-zone"
-  #   values = [var.availability_zone]
-  # }
-
   tags = {
     Name = "*public*",
     Tier = "Public"
@@ -43,16 +33,6 @@ data "aws_subnet" "private" {
   vpc_id            = one(data.aws_vpc.existing[*].id)
   availability_zone = var.availability_zone

-  # filter {
-  #   name   = "vpc-id"
-  #   values = [one(data.aws_vpc.existing[*].id)]
-  # }
-
-  # filter {
-  #   name   = "availability-zone"
-  #   values = [var.availability_zone]
-  # }
-
   tags = {
     Name = "*private*"
     Tier = "Private"
@@ -64,19 +44,6 @@ data "aws_subnet" "secondary_private" {
   vpc_id            = one(data.aws_vpc.existing[*].id)
   availability_zone = var.secondary_availability_zone

-# filter {
-#   name   = "vpc-id"
-#   values = [one(data.aws_vpc.existing[*].id)]
-# }
-
-# dynamic "filter" {
-#   for_each = var.secondary_availability_zone != null ? [1] : []
-#   content {
-#     name   = "availability-zone"
-#     values = [var.secondary_availability_zone]
-#   }
-# }
-
   tags = {
     Name = "*secondary*private*"
     Tier = "SecondaryPrivate"
   }

From 52126f15215e6818aef97aed0d7c1230cd12d170 Mon Sep 17 00:00:00 2001
From: Silvio Moioli
Date: Tue, 21 Jan 2025 14:43:51 +0100
Subject: [PATCH 08/13] DEVELOPING: add architectural notes

Signed-off-by: Silvio Moioli
---
 DEVELOPING.md | 70 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/DEVELOPING.md b/DEVELOPING.md
index 831f52a..655d918 100644
--- a/DEVELOPING.md
+++ b/DEVELOPING.md
@@ -2,30 +2,62 @@

 ## Overall architecture

For SUSE internal projects please see the [internal design document](https://docs.google.com/document/d/1-jgzGSmeH47mobXycuOgeg1W_wTB4AgY).

For all uses, this project is composed of:
 - [OpenTofu](http://opentofu.org) modules to deploy infrastructure. 
That includes everything necessary to launch Kubernetes clusters - modules should conclude by producing a kubeconfig file and context
 - the `dartboard` Golang program
   - runs OpenTofu to create Kubernetes clusters
   - uses Helm/kubectl to deploy and configure software under test (Rancher and/or any other component)
   - uses Helm/kubectl to deploy and configure test software (Mimir, Grafana, k6, etc.)

Specifically:
 - `dartboard apply` is a `dartboard` subcommand that calls OpenTofu to deploy Kubernetes clusters. It outputs kubeconfig files and build-specific parameters. Created clusters are:
   - upstream: where Rancher is installed
   - downstream: imported into Rancher (there can be zero or more)
   - tester: where load testing/benchmarking/metric collection tools will run
 - `dartboard deploy` is a `dartboard` subcommand that:
   - calls `dartboard apply` to create clusters
   - installs Rancher via Helm in the upstream cluster
   - configures Rancher by creating basic objects (eg. users)
   - imports the downstream clusters created by `dartboard apply` into Rancher with Shepherd
 - `dartboard test` is a `dartboard` subcommand that runs `k6` from a pod in the tester cluster
 - `dartboard destroy` is a `dartboard` subcommand that calls OpenTofu to destroy clusters created by `dartboard apply`
 - k6 is used to benchmark APIs in the upstream or downstream clusters, sending metrics to Mimir. It runs in the tester cluster
 - Mimir is used to collect metrics from test runs (from k6 and from instrumentation of the SUT, aka rancher-monitoring). It runs in the tester cluster and allows bulk data export in Prometheus format for later analysis. The plan is to store long-term data in a new Mimir + Grafana installation managed by the QA team

## OpenTofu module specifics

In this project modules are organized according to these rules:
 - `tofu/main/*` contains the main `tf` files that specify whole testing environments
 - there is one subdirectory per platform (eg. 
`aws`, `azure`, `harvester`)
 - `tofu/modules` contains reusable modules that can be used in multiple environments
   - modules in the `tofu/modules/generic` directory are platform-agnostic
   - modules in other directories are platform-specific (eg. `aws`, `azure`, `harvester`)
 - modules are consistently named according to the concept they represent:
   - **node**: a Linux VM capable of SSH login
     - `node_variables` is a block of variables passed as-is from main to a platform-specific node module. It contains all VM-creation details that are specific to that one VM
   - **cluster**: a Kubernetes cluster - possibly a set of nodes with a distribution installed, or a managed service
   - **network**: anything that is shared among clusters and nodes and allows them to work together (actual networks, firewalls, rules, bastion hosts...)
     - `network_configuration` is a block of outputs passed as-is from a platform-specific network module to node modules of the same platform. It contains details that are common to all VMs
   - **test environment**: an upstream cluster, any number of downstream clusters and a tester cluster, all glued together with a single network
   - **everything else**: typically generic software that can be installed onto nodes

Assumptions:
 - Deployed nodes and clusters are reachable either directly or via an SSH bastion host from the machine running OpenTofu
 - Deployed nodes and clusters will be able to reach one another with the same domain names, from the same network. That network might not be the same network as the machine running OpenTofu
 - Deployed clusters may or may not be directly reachable from the machine running OpenTofu. In the current `aws` implementation, for example, all access goes through an SSH bastion host and tunnels, but that is an implementation detail and may change in the future. For new platforms there is no requirement - clusters might be directly reachable with an Internet-accessible FQDN, or be behind a bastion host, Tailscale, Boundary or other mechanism

## Vendored binaries

Dartboard vendors the binaries it uses, such as OpenTofu, kubectl and Helm. These are decompressed and stored in the `.bin` directory at runtime.

## Dart files

YAML files in the `darts/` subdirectory represent full environments and contain all the configuration needed to run a test. 
That includes:
 - `tofu_main_directory`: a pointer to a main directory for OpenTofu modules
 - `tofu_variables`: a block of variables passed as-is to OpenTofu
 - any other test-specific variables

## Hacks and workarounds

From 3eb7d2508e6f17cd8b77b354509007716bdc2c4d Mon Sep 17 00:00:00 2001
From: Silvio Moioli
Date: Fri, 24 Jan 2025 12:48:28 +0100
Subject: [PATCH 09/13] docs: fix broken links

Signed-off-by: Silvio Moioli
---
 docs/20220923 - 250 pods per node.md          |  4 ++--
 ...0 pods per node with cluster operations.md |  4 ++--
 docs/20221003 - 300 pods per node.md          |  6 +++---
 docs/20221128 - API load benchmarks.md        |  8 ++++----
 docs/20221130 - can-i microbenchmark.md       |  8 ++++----
 docs/20221215 - kine locality test.md         |  6 +++---
 docs/20230306 - steve vai tests.md            |  8 ++++----
 ...20230503 - steve vai tests higher scale.md | 10 +++++-----
 ...RKE2 100 clusters 1vs3 nodes comparison.md | 20 +++++++++----------
 9 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/docs/20220923 - 250 pods per node.md b/docs/20220923 - 250 pods per node.md
index 3537e74..85ef97d 100644
--- a/docs/20220923 - 250 pods per node.md
+++ b/docs/20220923 - 250 pods per node.md
@@ -43,7 +43,7 @@ See [the rke2 installation script in this repo](../rke2/install_rke2.sh) for det

 ## Full configuration details

-All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20220923_250_pods_per_node](https://github.com/moio/scalability-tests/tree/20220923_250_pods_per_node) branch.
+All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20220923_250_pods_per_node](https://github.com/rancher/dartboard/tree/20220923_250_pods_per_node) branch.

 Note in particular [inputs.tf](../inputs.tf) for the main parameters.

@@ -58,7 +58,7 @@ Note in particular [inputs.tf](../inputs.tf) for the main parameters.
 - get [Terraform](https://www.terraform.io/downloads)
 - check out this project
```shell
-git clone https://github.com/moio/scalability-tests.git
+git clone https://github.com/rancher/dartboard.git scalability-tests
cd scalability-tests
git checkout 20220923_250_pods_per_node
```
diff --git a/docs/20220928 - 250 pods per node with cluster operations.md b/docs/20220928 - 250 pods per node with cluster operations.md
index bf38af4..df56711 100644
--- a/docs/20220928 - 250 pods per node with cluster operations.md
+++ b/docs/20220928 - 250 pods per node with cluster operations.md
@@ -45,7 +45,7 @@ See [the rke2 installation script in this repo](../rke2/install_rke2.sh) for det

 ## Full configuration details

-All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20220928_250_pods_per_node_with_cluster_operations](https://github.com/moio/scalability-tests/tree/20220928_250_pods_per_node_with_cluster_operations) branch.
+All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20220928_250_pods_per_node_with_cluster_operations](https://github.com/rancher/dartboard/tree/20220928_250_pods_per_node_with_cluster_operations) branch.

 Note in particular [inputs.tf](../inputs.tf) for the main parameters.

@@ -62,7 +62,7 @@ Note in particular [inputs.tf](../inputs.tf) for the main parameters.
- get [Terraform](https://www.terraform.io/downloads)
- check out this project
```shell
-git clone https://github.com/moio/scalability-tests.git
+git clone https://github.com/rancher/dartboard.git scalability-tests
cd scalability-tests
git checkout 20220928_250_pods_per_node_with_cluster_operations
```
diff --git a/docs/20221003 - 300 pods per node.md b/docs/20221003 - 300 pods per node.md
index 06f0497..886f547 100644
--- a/docs/20221003 - 300 pods per node.md
+++ b/docs/20221003 - 300 pods per node.md
@@ -79,8 +79,8 @@ See [the rke2 installation script in this repo](../rke2/install_rke2.sh) for det

 ## Full configuration details

-All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20221003_300_pods_per_node](https://github.com/moio/scalability-tests/tree/20221003_300_pods_per_node) branch. Note in particular [inputs.tf](../inputs.tf) for the main parameters.
-All tests are driven by [Cypress](https://www.cypress.io/) files in the [cypress](https://github.com/moio/scalability-tests/tree/20221003_300_pods_per_node/cypress) directory.
+All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20221003_300_pods_per_node](https://github.com/rancher/dartboard/tree/20221003_300_pods_per_node) branch. Note in particular [inputs.tf](../inputs.tf) for the main parameters.
+All tests are driven by [Cypress](https://www.cypress.io/) files in the [cypress](https://github.com/rancher/dartboard/tree/20221003_300_pods_per_node/cypress) directory.

 ## Reproduction Instructions

@@ -102,7 +102,7 @@ All tests are driven by [Cypress](https://www.cypress.io/) files in the [cypress

 - clone this project:
```shell
-git clone https://github.com/moio/scalability-tests.git
+git clone https://github.com/rancher/dartboard.git scalability-tests
cd scalability-tests
git checkout 20221003_300_pods_per_node
```
diff --git a/docs/20221128 - API load benchmarks.md b/docs/20221128 - API load benchmarks.md
index 8774c02..cb73cb2 100644
--- a/docs/20221128 - API load benchmarks.md
+++ b/docs/20221128 - API load benchmarks.md
@@ -69,9 +69,9 @@ References:

 ## Full configuration details

-All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20221128_api_load_benchmarks](https://github.com/moio/scalability-tests/tree/20221128_api_load_benchmarks/terraform) branch. Note in particular [inputs.tf](https://github.com/moio/scalability-tests/blob/20221128_api_load_benchmarks/terraform/inputs.tf) for the main parameters.
-Initial configuration is driven by [Cypress](https://www.cypress.io/) files in the [cypress/e2e](https://github.com/moio/scalability-tests/tree/20221128_api_load_benchmarks/cypress/cypress/e2e) directory.
-Benchmark Python scripts are available in the [util](https://github.com/moio/scalability-tests/tree/20221128_api_load_benchmarks/util) directory.
+All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20221128_api_load_benchmarks](https://github.com/rancher/dartboard/tree/20221128_api_load_benchmarks/terraform) branch. Note in particular [inputs.tf](https://github.com/rancher/dartboard/blob/20221128_api_load_benchmarks/terraform/inputs.tf) for the main parameters.
+Initial configuration is driven by [Cypress](https://www.cypress.io/) files in the [cypress/e2e](https://github.com/rancher/dartboard/tree/20221128_api_load_benchmarks/cypress/cypress/e2e) directory.
+Benchmark Python scripts are available in the [util](https://github.com/rancher/dartboard/tree/20221128_api_load_benchmarks/util) directory.
## Reproduction Instructions

@@ -92,7 +92,7 @@ Benchmark Python scripts are available in the [util](https://github.com/moio/sca

 - clone this project:
```shell
-git clone https://github.com/moio/scalability-tests.git
+git clone https://github.com/rancher/dartboard.git scalability-tests
cd scalability-tests
git checkout 20221128_api_load_benchmarks
```
diff --git a/docs/20221130 - can-i microbenchmark.md b/docs/20221130 - can-i microbenchmark.md
index 738f09f..e09ee25 100644
--- a/docs/20221130 - can-i microbenchmark.md
+++ b/docs/20221130 - can-i microbenchmark.md
@@ -45,9 +45,9 @@ References:

 ## Full configuration details

-All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20221130_can-i_microbenchmark](https://github.com/moio/scalability-tests/tree/20221128_api_load_benchmarks/terraform) branch. Note in particular [inputs.tf](https://github.com/moio/scalability-tests/blob/20221130_can-i_microbenchmark/terraform/inputs.tf) for the main parameters.
-Initial configuration is driven by [Cypress](https://www.cypress.io/) files in the [cypress/e2e](https://github.com/moio/scalability-tests/tree/20221130_can-i_microbenchmark/cypress/cypress/e2e) directory.
-Benchmark Python scripts are available in the [util](https://github.com/moio/scalability-tests/tree/20221130_can-i_microbenchmark/util) directory.
+All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20221130_can-i_microbenchmark](https://github.com/rancher/dartboard/tree/20221130_can-i_microbenchmark/terraform) branch. Note in particular [inputs.tf](https://github.com/rancher/dartboard/blob/20221130_can-i_microbenchmark/terraform/inputs.tf) for the main parameters.
+Initial configuration is driven by [Cypress](https://www.cypress.io/) files in the [cypress/e2e](https://github.com/rancher/dartboard/tree/20221130_can-i_microbenchmark/cypress/cypress/e2e) directory.
+Benchmark Python scripts are available in the [util](https://github.com/rancher/dartboard/tree/20221130_can-i_microbenchmark/util) directory.

 ## Reproduction Instructions

@@ -68,7 +68,7 @@ Benchmark Python scripts are available in the [util](https://github.com/moio/sca

 - clone this project:
```shell
-git clone https://github.com/moio/scalability-tests.git
+git clone https://github.com/rancher/dartboard.git scalability-tests
cd scalability-tests
git checkout 20221130_can-i_microbenchmark
```
diff --git a/docs/20221215 - kine locality test.md b/docs/20221215 - kine locality test.md
index 5a22022..f3cf05f 100644
--- a/docs/20221215 - kine locality test.md
+++ b/docs/20221215 - kine locality test.md
@@ -29,8 +29,8 @@ No significant difference in list performance of small ConfigMaps, up to 256K of

 ## Full configuration details

-All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20221215_kine_locality_test](https://github.com/moio/scalability-tests/tree/20221215_kine_locality_test/terraform) branch.
-Benchmark Python script is available in the [util](https://github.com/moio/scalability-tests/tree/20221215_kine_locality_test/util) directory.
+All infrastructure is defined via [Terraform](https://www.terraform.io/) files in the [20221215_kine_locality_test](https://github.com/rancher/dartboard/tree/20221215_kine_locality_test/terraform) branch.
+Benchmark Python script is available in the [util](https://github.com/rancher/dartboard/tree/20221215_kine_locality_test/util) directory.
## Reproduction Instructions

@@ -50,7 +50,7 @@ Benchmark Python script is available in the [util](https://github.com/moio/scala

 - clone this project:
```shell
-git clone https://github.com/moio/scalability-tests.git
+git clone https://github.com/rancher/dartboard.git scalability-tests
cd scalability-tests
git checkout 20221215_kine_locality_test
```
diff --git a/docs/20230306 - steve vai tests.md b/docs/20230306 - steve vai tests.md
index fbe8b4c..9bdcc8b 100644
--- a/docs/20230306 - steve vai tests.md
+++ b/docs/20230306 - steve vai tests.md
@@ -37,9 +37,9 @@ Under test conditions, according to collected measures described below:

 ## Full configuration details

-All infrastructure is defined in [Terraform](https://www.terraform.io/) files in the [20230306_steve_vai_tests](https://github.com/moio/scalability-tests/tree/20230306_steve_vai_tests/terraform) branch.
+All infrastructure is defined in [Terraform](https://www.terraform.io/) files in the [20230306_steve_vai_tests](https://github.com/rancher/dartboard/tree/20230306_steve_vai_tests/terraform) branch.

-[k6](https://k6.io) load test scripts are defined in the [k6](https://github.com/moio/scalability-tests/tree/20230306_steve_vai_tests/k6) directory.
+[k6](https://k6.io) load test scripts are defined in the [k6](https://github.com/rancher/dartboard/tree/20230306_steve_vai_tests/k6) directory.

 ## Reproduction Instructions

@@ -57,7 +57,7 @@ All infrastructure is defined in [Terraform](https://www.terraform.io/) files in

 Deploy the k3d infrastructure and install Rancher:
```shell
# clone this project
-git clone https://github.com/moio/scalability-tests.git
+git clone https://github.com/rancher/dartboard.git scalability-tests
cd scalability-tests
git checkout 20230306_steve_vai_tests

@@ -155,7 +155,7 @@ In the example above, retrieving a page up to 100 resources from the local clust

 #### Analysis of results

-Full results are available in the [20230306 - steve vai test results](https://github.com/moio/scalability-tests/tree/20230306_steve_vai_tests/docs/20230306%20-%20steve%20vai%20test%20results) directory, summary is in the table below:
+Full results are available in the [20230306 - steve vai test results](https://github.com/rancher/dartboard/tree/20230306_steve_vai_tests/docs/20230306%20-%20steve%20vai%20test%20results) directory, summary is in the table below:
 ![table showing a summary of results](images/20230306-table.png)
 An [Excel file](https://mysuse-my.sharepoint.com/:x:/g/personal/moio_suse_com/ERaeDyfE25xLoQFKiMYa8bgBOb2z24wKNhTp0FVMVumDMA?e=nGOPMy) is available for SUSE employees.
diff --git a/docs/20230503 - steve vai tests higher scale.md b/docs/20230503 - steve vai tests higher scale.md
index fdb0132..9043ca0 100644
--- a/docs/20230503 - steve vai tests higher scale.md
+++ b/docs/20230503 - steve vai tests higher scale.md
@@ -39,9 +39,9 @@ Under test conditions, according to collected measures described below:

 ## Full configuration details

-All infrastructure is defined in [Terraform](https://www.terraform.io/) files in the [20230503_steve_vai_tests_higher_scale](https://github.com/moio/scalability-tests/tree/20230503_steve_vai_tests_higher_scale/terraform) branch.
+All infrastructure is defined in [Terraform](https://www.terraform.io/) files in the [20230503_steve_vai_tests_higher_scale](https://github.com/rancher/dartboard/tree/20230503_steve_vai_tests_higher_scale/terraform) branch.

-[k6](https://k6.io) load test scripts are defined in the [k6](https://github.com/moio/scalability-tests/tree/20230503_steve_vai_tests_higher_scale/k6) directory.
+[k6](https://k6.io) load test scripts are defined in the [k6](https://github.com/rancher/dartboard/tree/20230503_steve_vai_tests_higher_scale/k6) directory.

 ## Reproduction Instructions

@@ -86,7 +86,7 @@ TAG=vai make quickbuild

 Deploy the k3d infrastructure, install Rancher, set up clusters for tests, import built images:
```shell
# clone this project
-git clone https://github.com/moio/scalability-tests.git
+git clone https://github.com/rancher/dartboard.git scalability-tests
cd scalability-tests
git checkout 20230503_steve_vai_tests_higher_scale

@@ -108,7 +108,7 @@ First, we create a given number of ConfigMaps in a test namespace via a k6 scrip

 Then, we simulate 10 virtual users listing all ConfigMaps in that namespace via another k6 script. Each user will repeat the listing 30 times (for statistical accuracy of measures). The page size is of 100, like in the current UI. We exercise both the k8s based pagination implementation, using the `limit`/`continue` parameters and currently used by the [dashboard](https://github.com/rancher/dashboard/) UI, as well as the new Steve-cache pagination implementation using the `page`/`pagesize` parameters. We test both local and downstream cluster. Tests are repeated for `baseline` and `vai` images.

-Details on tests are available in the [bin/run_test.js](https://github.com/moio/scalability-tests/blob/20230503_steve_vai_tests_higher_scale/bin/run_tests.mjs) script source file.
+Details on tests are available in the [bin/run_test.js](https://github.com/rancher/dartboard/blob/20230503_steve_vai_tests_higher_scale/bin/run_tests.mjs) script source file.

 #### Procedure

@@ -133,7 +133,7 @@ Interpreting results: the script will output one `results.csv` file with the fol
 * `p(99)` 99th percentile - 99% of requests had a duration less than or equal to this value
 * `count` total number of requests

-Full results are available in the [results.csv](https://github.com/moio/scalability-tests/tree/20230503_steve_vai_tests_higher_scale/docs/20230503%20-%20steve%20vai%20test%20higher%20scale%20results/results.csv) file, summary is in the table below:
+Full results are available in the [results.csv](https://github.com/rancher/dartboard/tree/20230503_steve_vai_tests_higher_scale/docs/20230503%20-%20steve%20vai%20test%20higher%20scale%20results/results.csv) file, summary is in the table below:
 ![table showing a summary of results](images/20230503-table.png)
 An [Excel file](https://mysuse-my.sharepoint.com/:x:/g/personal/moio_suse_com/ETkus1LxojlBm7aYWdswNX0BmmkfrQt0NET3oO6QujnNgw?e=bexG44) is available for SUSE employees.
diff --git a/docs/20231222 - RKE2 100 clusters 1vs3 nodes comparison.md b/docs/20231222 - RKE2 100 clusters 1vs3 nodes comparison.md
index 6467927..31c43b0 100644
--- a/docs/20231222 - RKE2 100 clusters 1vs3 nodes comparison.md
+++ b/docs/20231222 - RKE2 100 clusters 1vs3 nodes comparison.md
@@ -75,7 +75,7 @@ The increase in the CPU usage to almost 5 cores in _fig.1_ and 7 cores in _fig.2

 Note that, in absence of resources (users, roles, projects, ConfigMaps, Secrets) the load is minimal and the CPU load varies from ~0.750 to ~0.950 cores. In those conditions, base OS processes running on the nodes have a higher CPU footprint.
You can find more screenshots of the resource usage of these two scenarios in the [100 clusters x 1 node grafana screenshots folder](https://github.com/rancher/dartboard/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/grafana-screenshots/RKE4x) and in the [100 clusters x 3 nodes grafana screenshots folder](https://github.com/rancher/dartboard/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/grafana-screenshots/RKE5x).

 ### Rancher API response time benchmarks

@@ -104,9 +104,9 @@ For more results, check the available data shared in the [available data section

 ## Full configuration details

-All infrastructure is defined in [Terraform](https://www.terraform.io/) files in the [20231222_rke2_100_clusters_1vs3_nodes_comparison](https://github.com/moio/scalability-tests/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/terraform) branch.
+All infrastructure is defined in [Terraform](https://www.terraform.io/) files in the [20231222_rke2_100_clusters_1vs3_nodes_comparison](https://github.com/rancher/dartboard/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/terraform) branch.

-[k6](https://k6.io) load test scripts are defined in the [k6](https://github.com/moio/scalability-tests/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/k6) directory.
+[k6](https://k6.io) load test scripts are defined in the [k6](https://github.com/rancher/dartboard/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/k6) directory.

 ## Reproduction Instructions

@@ -128,7 +128,7 @@ Log into Azure via the CLI:

 Deploy the RKE2 environment, install Rancher, set up clusters for tests:
```shell
# clone this project
-git clone -b 20231222_rke2_100_clusters_1vs3_nodes_comparison https://github.com/moio/scalability-tests.git
+git clone -b 20231222_rke2_100_clusters_1vs3_nodes_comparison https://github.com/rancher/dartboard.git scalability-tests
cd scalability-tests

export TERRAFORM_WORK_DIR=terraform/main/azure

@@ -137,7 +137,7 @@ export TERRAFORM_WORK_DIR=terraform/main/azure

./bin/setup.mjs && ./bin/run_tests.mjs
```
>[!NOTE]
->by default the branch will setup the 100 clusters x 1 node scenario: if you want to run the 100 clusters x 3 nodes one you may want to change the server_count value in the [azure configuration file](https://github.com/moio/scalability-tests/blob/20231222_rke2_100_clusters_1vs3_nodes_comparison/terraform/main/azure/inputs.tf#L28) to 3 before running the /bin/setup.mjs && ./bin/run_tests.mjs command.
+>By default the branch will set up the 100 clusters x 1 node scenario; to run the 100 clusters x 3 nodes one, change the `server_count` value in the [azure configuration file](https://github.com/rancher/dartboard/blob/20231222_rke2_100_clusters_1vs3_nodes_comparison/terraform/main/azure/inputs.tf#L28) to 3 before running `./bin/setup.mjs && ./bin/run_tests.mjs`.

 Once the system is provisioned, to get Rancher and clusters access info:

@@ -175,9 +175,9 @@ important output data points are:

 ## Available data

-All the data collected from the tests is shared in the [_RKE2 100 clusters 1vs3 nodes comparison_ results directory](https://github.com/moio/scalability-tests/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/).
+All the data collected from the tests is shared in the [_RKE2 100 clusters 1vs3 nodes comparison_ results directory](https://github.com/rancher/dartboard/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/).

 There you will find:

-* a list of the deployed scenarios ([_list.txt_ file](https://github.com/moio/scalability-tests/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/list.txt))
-* the raw data from the single tests ([_tests_raw_data_ folder](https://github.com/moio/scalability-tests/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/tests_raw_data))
-* screenshots from a Grafana dashboard showing CPU and Memory usage from the tested scenarios ([_grafana-screenshots_ folder](https://github.com/moio/scalability-tests/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/grafana-screenshots))
-* an [OpenDocument spreadsheet](https://github.com/moio/scalability-tests/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/scalability_tests_summary.ods) containing all the results with few graphs
+* a list of the deployed scenarios ([_list.txt_ file](https://github.com/rancher/dartboard/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/list.txt))
+* the raw data from the single tests ([_tests_raw_data_ folder](https://github.com/rancher/dartboard/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/tests_raw_data))
+* screenshots from a Grafana dashboard showing CPU and Memory usage from the tested scenarios ([_grafana-screenshots_ folder](https://github.com/rancher/dartboard/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/grafana-screenshots))
+* an [OpenDocument spreadsheet](https://github.com/rancher/dartboard/tree/20231222_rke2_100_clusters_1vs3_nodes_comparison/docs/20231222%20-%20RKE2%20100%20clusters%201vs3%20nodes%20comparison/scalability_tests_summary.ods) containing all the results with a few graphs

From 79e3ec9d1a21823c3a156dbec858a3324f4a692d Mon Sep 17 00:00:00 2001
From: Silvio Moioli
Date: Tue, 28 Jan 2025 09:31:42 +0100
Subject: [PATCH 10/13] improve log line with copy-pastable command

Signed-off-by: Silvio Moioli

---
 internal/kubectl/kubectl.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/internal/kubectl/kubectl.go b/internal/kubectl/kubectl.go
index 31eb007..579df3c 100644
--- a/internal/kubectl/kubectl.go
+++ b/internal/kubectl/kubectl.go
@@ -162,7 +162,7 @@ func K6run(kubeconfig, testPath string, envVars, tags map[string]string, printLo
 		quotedArgs = append(quotedArgs, "-e", shellescape.Quote(fmt.Sprintf("%s=%s", k, v)))
 	}
 	quotedArgs = append(quotedArgs, shellescape.Quote(testPath))
-	log.Printf("Running equivalent of:\nk6 %s\n", strings.Join(quotedArgs, " "))
+	log.Printf("Running equivalent of:\n./bin/k6 %s\n", strings.Join(quotedArgs, " "))

 	// if a kubeconfig is specified, upload it as secret to later mount it
 	if path, ok := envVars["KUBECONFIG"]; ok {

From 27cf672359269050b088488792d3f8f0c8e5d9c8 Mon Sep 17 00:00:00 2001
From: Silvio Moioli
Date: Tue, 28 Jan 2025 09:32:26 +0100
Subject: [PATCH 11/13] bugfix: k3d: fix outputs to correct values to unbreak dartboard load

Signed-off-by: Silvio Moioli

---
 tofu/modules/k3d/k3s/outputs.tf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tofu/modules/k3d/k3s/outputs.tf b/tofu/modules/k3d/k3s/outputs.tf
index 565ef1d..936fe68 100644
--- a/tofu/modules/k3d/k3s/outputs.tf
+++ b/tofu/modules/k3d/k3s/outputs.tf
@@ -1,7 +1,7 @@
 output "config" {
   value = {
     kubeconfig = var.server_count > 0 ? abspath(local_file.kubeconfig[0].filename) : null
-    context    = var.name
+    context    = "k3d-${local.k3d_cluster_name}"
     name       = local.k3d_cluster_name

     // addresses of the Kubernetes API server
@@ -9,7 +9,7 @@ output "config" {
     // resolvable over the Internet
     public = null
     // resolvable from the network this cluster runs in
-    private = "k3d-${var.project_name}-${var.name}-server-0"
+    private = "https://k3d-${var.project_name}-${var.name}-server-0:6443"
     // resolvable from the host running OpenTofu
     tunnel = local.local_kubernetes_api_url
   }

From 31829d87734478997c3b62c347881e8eea7d8982 Mon Sep 17 00:00:00 2001
From: Alejandro Ruiz
Date: Fri, 31 Jan 2025 12:10:51 +0100
Subject: [PATCH 12/13] Upgrade terraform-provider-k3d to support newer versions of k3s

---
 tofu/main/k3d/terraform.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tofu/main/k3d/terraform.tf b/tofu/main/k3d/terraform.tf
index 0f41f12..1a83190 100644
--- a/tofu/main/k3d/terraform.tf
+++ b/tofu/main/k3d/terraform.tf
@@ -7,7 +7,7 @@ terraform {
     }
     k3d = {
       source  = "moio/k3d"
-      version = "0.0.10"
+      version = "0.0.11"
     }
   }
 }

From 3f7ed00446510c078870e7e83490d932e4bfe081 Mon Sep 17 00:00:00 2001
From: Alejandro Ruiz
Date: Fri, 31 Jan 2025 15:40:17 +0100
Subject: [PATCH 13/13] Fix networking problems after upgrading K3D

---
 tofu/main/k3d/main.tf      | 6 ++++++
 tofu/main/k3d/terraform.tf | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tofu/main/k3d/main.tf b/tofu/main/k3d/main.tf
index b4db774..db6b077 100644
--- a/tofu/main/k3d/main.tf
+++ b/tofu/main/k3d/main.tf
@@ -1,3 +1,9 @@
+provider "k3d" {
+  fixes = {
+    "dns" = false
+  }
+}
+
 module "network" {
   source       = "../../modules/k3d/network"
   project_name = var.project_name

diff --git a/tofu/main/k3d/terraform.tf b/tofu/main/k3d/terraform.tf
index 1a83190..45eac5c 100644
--- a/tofu/main/k3d/terraform.tf
+++ b/tofu/main/k3d/terraform.tf
@@ -7,7 +7,7 @@ terraform {
     }
     k3d = {
       source  = "moio/k3d"
-      version = "0.0.11"
+      version = "0.0.12"
     }
   }
 }
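
A note for reviewers on how the last three patches fit together: patch 11 makes the k3s module's `config.context` output match the `k3d-`-prefixed context name that k3d actually writes into the generated kubeconfig, and turns `config.private` into a complete URL with scheme and port; patch 12 upgrades the `moio/k3d` provider; patch 13 pins the upgraded provider's built-in DNS fix off, since it broke networking for these modules. Below is a minimal sketch of a root module consuming the corrected outputs. It is illustrative only: the `demo`/`upstream` names and the `kubernetes` provider wiring are hypothetical, the module takes more inputs than shown, and only `project_name`, `name`, and `server_count` are taken from the diffs above.

```hcl
terraform {
  required_providers {
    k3d = {
      source  = "moio/k3d"
      version = "0.0.12"
    }
  }
}

# Patch 13: disable the provider's built-in DNS fix, which conflicted with
# the networking set up by these modules after the upgrade.
provider "k3d" {
  fixes = {
    "dns" = false
  }
}

# Hypothetical consumer of the k3s module (other required inputs elided).
module "upstream" {
  source       = "../../modules/k3d/k3s"
  project_name = "demo"
  name         = "upstream"
  server_count = 1
}

provider "kubernetes" {
  config_path = module.upstream.config.kubeconfig
  # Before patch 11 this output was the bare cluster name ("upstream"), which
  # matches no context in the kubeconfig: k3d prefixes contexts with "k3d-".
  config_context = module.upstream.config.context
}

output "in_network_api_url" {
  # Patch 11 made this a full URL (scheme plus port), usable directly as an
  # API server address by workloads on the same Docker network,
  # e.g. https://k3d-demo-upstream-server-0:6443
  value = module.upstream.config.private
}
```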