Require KOPS_TERRAFORM_0_12_RENAMED, to guard against tf breakage

Terraform 0.12 broke compatability and this could result in data loss
if we deleted the etcd volumes.  Document the procedure to fix the
tfstate, and require an env var to be passed to avoid accidental data
loss.
This commit is contained in:
Justin SB 2021-01-18 17:46:11 -05:00
parent 470ff01ec0
commit ce036d397d
4 changed files with 119 additions and 6 deletions

View File

@ -0,0 +1,73 @@
# Terraform 0.12 Naming Compatability
Terraform 0.12 introduced new restrictions on naming, breaking
compatability with earlier terraform versions when resource names
start with a number. Single-zone etcd clusters (and possibly some
other scenarios) would generate terraform names for EBS volumes that
start with a number, which are no longer permitted.
For new clusters, kOps now avoids this problem. But for existing
clusters, in order for terraform not to erase your data, a manual
state migration is needed first.
In order to prevent against data-loss, kOps will detect the problem
and require you to pass an environment variable to indicate that you
have performed the migration.
NOTE: You must perform this migration with terraform 0.11.
To do this state migration, first run `terraform state list`.
You should see something like this, depending on how many
control-plane nodes you have:
```
...
aws_ebs_volume.1-etcd-events-foo-example-com
aws_ebs_volume.1-etcd-main-foo-example-com
aws_ebs_volume.2-etcd-events-foo-example-com
aws_ebs_volume.2-etcd-main-foo-example-com
aws_ebs_volume.3-etcd-events-foo-example-com
aws_ebs_volume.3-etcd-main-foo-example-com
...
```
We want to prefix each of those names with `ebs-`.
A one liner to do so is:
```
terraform-0.11 state list | grep aws_ebs_volume | cut -d. -f2 | xargs -I {} terraform-0.11 state mv aws_ebs_volume.{} aws_ebs_volume.ebs-{}
```
This is equivalent to the manual form:
```
terraform-0.11 state mv aws_ebs_volume.1-etcd-events-foo-example-com aws_ebs_volume.ebs-1-etcd-events-foo-example-com
terraform-0.11 state mv aws_ebs_volume.1-etcd-main-foo-example-com aws_ebs_volume.ebs-1-main-events-foo-example-com
terraform-0.11 state mv aws_ebs_volume.2-etcd-events-foo-example-com aws_ebs_volume.ebs-2-etcd-events-foo-example-com
terraform-0.11 state mv aws_ebs_volume.2-etcd-main-foo-example-com aws_ebs_volume.ebs-2-etcd-main-foo-example-com
terraform-0.11 state mv aws_ebs_volume.3-etcd-events-foo-example-com aws_ebs_volume.ebs-3-etcd-events-foo-example-com
terraform-0.11 state mv aws_ebs_volume.3-etcd-main-foo-example-com aws_ebs_volume.ebs-3-etcd-main-foo-example-com
```
Finally, you should repeat the kops update command passing
`KOPS_TERRAFORM_0_12_RENAMED=ebs`.
Note that you must then run `terraform init` / `terraform plan` /
`terraform apply` using terraform 0.12.26, not terraform 0.13.
Carefully review the output of `terraform plan` / `terraform apply` to
ensure that the EBS volumes are not being deleted & recreated. Note
that `aws_security_group_rule` will be deleted and recreated, due to
the same terraform naming restriction.
If you encounter the "A duplicate Security Group rule..." error, you
will likely have to run `terraform apply` twice, because of the
terraform bug described in
`https://github.com/hashicorp/terraform/pull/2376`
Note that you must _always_ pass `KOPS_TERRAFORM_0_12_RENAMED=ebs` to
`kops` for these clusters, as kOps otherwise has no way to know that
the rename has been done. However, kOps will "fail safe" and simply
refuse to generate terraform in these cases.

View File

@ -18,6 +18,7 @@ package awstasks
import (
"fmt"
"os"
"k8s.io/kops/upup/pkg/fi"
"k8s.io/kops/upup/pkg/fi/cloudup/awsup"
@ -244,18 +245,43 @@ func (_ *EBSVolume) RenderTerraform(t *terraform.TerraformTarget, a, e, changes
Tags: e.Tags,
}
return t.RenderResource("aws_ebs_volume", e.TerraformName(), tf)
tfName, _ := e.TerraformName()
return t.RenderResource("aws_ebs_volume", tfName, tf)
}
func (e *EBSVolume) TerraformLink() *terraform.Literal {
return terraform.LiteralSelfLink("aws_ebs_volume", e.TerraformName())
tfName, _ := e.TerraformName()
return terraform.LiteralSelfLink("aws_ebs_volume", tfName)
}
func (e *EBSVolume) TerraformName() string {
if (*e.Name)[0] >= '0' && (*e.Name)[0] <= '9' {
return fmt.Sprintf("ebs-%v", *e.Name)
// TerraformName returns the terraform-safe name, along with a boolean indicating of whether name-prefixing was needed.
func (e *EBSVolume) TerraformName() (string, bool) {
usedPrefix := false
name := fi.StringValue(e.Name)
if name[0] >= '0' && name[0] <= '9' {
usedPrefix = true
return fmt.Sprintf("ebs-%v", name), usedPrefix
}
return *e.Name
return name, usedPrefix
}
// PreRun is run before general task execution, and checks for terraform breaking changes.
func (e *EBSVolume) PreRun(c *fi.Context) error {
if _, ok := c.Target.(*terraform.TerraformTarget); ok {
_, usedPrefix := e.TerraformName()
if usedPrefix {
if os.Getenv("KOPS_TERRAFORM_0_12_RENAMED") == "" {
fmt.Fprintf(os.Stderr, "Terraform 0.12 broke compatability and disallowed names that begin with a number.\n")
fmt.Fprintf(os.Stderr, " To move an existing cluster to the new syntax, you must first move existing volumes to the new names.\n")
fmt.Fprintf(os.Stderr, " To indicate that you have already performed the rename, pass KOPS_TERRAFORM_0_12_RENAMED=ebs environment variable.\n")
fmt.Fprintf(os.Stderr, " Not doing so will result in data loss.\n")
fmt.Fprintf(os.Stderr, "For detailed instructions: https://github.com/kubernetes/kops/blob/master/permalinks/terraform_renamed.md\n")
return fmt.Errorf("must update terraform state for 0.12, and then pass KOPS_TERRAFORM_0_12_RENAMED=ebs")
}
}
}
return nil
}
type cloudformationVolume struct {

View File

@ -55,6 +55,14 @@ func (o *RunTasksOptions) InitDefaults() {
func (e *executor) RunTasks(taskMap map[string]Task) error {
dependencies := FindTaskDependencies(taskMap)
for _, task := range taskMap {
if taskPreRun, ok := task.(TaskPreRun); ok {
if err := taskPreRun.PreRun(e.context); err != nil {
return err
}
}
}
taskStates := make(map[string]*taskState)
for k, task := range taskMap {

View File

@ -28,6 +28,12 @@ type Task interface {
Run(*Context) error
}
// TaskPreRun is implemented by tasks that perform some initial validation.
type TaskPreRun interface {
// PreRun will be run for all TaskPreRuns, before any Run functions are invoked.
PreRun(*Context) error
}
// TaskAsString renders the task for debug output
// TODO: Use reflection to make this cleaner: don't recurse into tasks - print their names instead
// also print resources in a cleaner way (use the resource source information?)